[llvm] [SLPVectorizer] Widen strided loads. (PR #153074)

Mikhail Gudim via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 11 05:57:44 PDT 2025


https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/153074

>From e1232414195e6f0bd291f7b34697eba4edfdc8f8 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 8 Aug 2025 07:52:29 -0700
Subject: [PATCH 01/17] [SLPVectorizer] Widen strided loads.

Currently, SLPVectorizer can generate strided loads only for this
pattern:

```
load %base + 0 * %stride
load %base + 1 * %stride
...
load %base + n * %stride
```
In this PR we extend it to this pattern:

```
; load w consecutive elements starting at %base
load %base + 0 * %stride + 0
load %base + 0 * %stride + 1
load %base + 0 * %stride + 2
...
load %base + 0 * %stride + (w - 1)

; load w consecutive elements starting at %base + 1 * %stride
load %base + 1 * %stride + 0
load %base + 1 * %stride + 1
load %base + 1 * %stride + 2
...
load %base + 1 * %stride + (w - 1)
...
; load w consecutive elements starting at %base + n * %stride
load %base + n * %stride + 0
load %base + n * %stride + 1
load %base + n * %stride + 2
...
load %base + n * %stride + (w - 1)
```
This works for both run-time and constant strides.
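
For example, here is roughly what the transformation produces (a minimal sketch distilled from the tests added and updated by this patch; %pl, %stride and the result names are illustrative). Sixteen i8 loads that form four groups of four consecutive bytes, with the groups %stride bytes apart, become a single strided load of four widened i32 elements, and the result is reinterpreted back as <16 x i8>:

```
; Scalar form: load %pl + i * %stride + j, for i in [0, 3] (groups) and
; j in [0, 3] (w = 4 consecutive i8 elements per group).
; Widened form: one strided load of 4 x i32 with a byte stride, then a bitcast
; back to the original 16 x i8 shape.
%widened = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %pl, i64 %stride, <4 x i1> splat (i1 true), i32 4)
%bytes = bitcast <4 x i32> %widened to <16 x i8>
```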
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   9 +
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 493 +++++++++++++-----
 .../SLPVectorizer/RISCV/x264-satd-8x4.ll      | 483 +++++++++++++++++
 3 files changed, 852 insertions(+), 133 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 1ca513214f67c..5fc73f66a0030 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,6 +37,11 @@ static cl::opt<unsigned> SLPMaxVF(
         "exclusively by SLP vectorizer."),
     cl::Hidden);
 
+static cl::opt<bool> SLPPreferAltOpcVectorization(
+    "riscv-v-slp-prefer-alt-opc-vectorization",
+    cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
+    cl::Hidden);
+
 static cl::opt<unsigned>
     RVVMinTripCount("riscv-v-min-trip-count",
                     cl::desc("Set the lower bound of a trip count to decide on "
@@ -3052,3 +3057,7 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   }
   return Options;
 }
+
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+  return SLPPreferAltOpcVectorization;
+}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1cfcd3ffbd664..2ff7b8a217623 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1916,6 +1916,21 @@ class BoUpSLP {
   class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
+  // If we decide to generate a strided load / store, this struct contains all
+  // the necessary info. Its fields are filled in by analyzeRtStrideCandidate
+  // and analyzeConstantStrideCandidate. Note that the stride can be given
+  // either as a SCEV or as a Value if it already exists.
+  // To get the stride in bytes, StrideVal (or the value expanded from
+  // StrideSCEV) has to be multiplied by the element size of Ty.
+  struct StridedPtrInfo {
+    Value *StrideVal = nullptr;
+    const SCEV *StrideSCEV = nullptr;
+    // The fixed vector type of the strided load to be generated (the result
+    // type of the widened load).
+    FixedVectorType *Ty = nullptr;
+  };
+  DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+
 public:
   /// Tracks the state we can represent the loads in the given sequence.
   enum class LoadsState {
@@ -2076,6 +2091,7 @@ class BoUpSLP {
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
     ValueToGatherNodes.clear();
+    TreeEntryToStridedPtrInfoMap.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -2212,6 +2228,17 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
+  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
+                                Align CommonAlignment,
+                                SmallVectorImpl<unsigned> &SortedIndices,
+                                StridedPtrInfo *SPtrInfo) const;
+
+  bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
+                                      Type *ElemTy, Align CommonAlignment,
+                                      SmallVectorImpl<unsigned> &SortedIndices,
+                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
+                                      Value *Ptr0, Value *PtrN) const;
+
   /// Checks if the given array of loads can be represented as a vectorized,
   /// scatter or just simple gather.
   /// \param VL list of loads.
@@ -2225,6 +2252,7 @@ class BoUpSLP {
   LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                SmallVectorImpl<unsigned> &Order,
                                SmallVectorImpl<Value *> &PointerOps,
+                               StridedPtrInfo *SPtrInfo = nullptr,
                                unsigned *BestVF = nullptr,
                                bool TryRecursiveCheck = true) const;
 
@@ -4469,11 +4497,10 @@ class BoUpSLP {
 
   /// Checks if the specified list of the instructions/values can be vectorized
   /// and fills required data before actual scheduling of the instructions.
-  TreeEntry::EntryState
-  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
-                               bool IsScatterVectorizeUserTE,
-                               OrdersType &CurrentOrder,
-                               SmallVectorImpl<Value *> &PointerOps);
+  TreeEntry::EntryState getScalarsVectorizationState(
+      const InstructionsState &S, ArrayRef<Value *> VL,
+      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
 
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6418,12 +6445,14 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
       if (!SC || isa<SCEVCouldNotCompute>(SC))
         return nullptr;
+      Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                   SE.getMulExpr(Stride, SC)))
                ->isZero())
         return nullptr;
       Dist = SC->getAPInt().getZExtValue();
-    }
+    } else
+      Coeffs.push_back(0);
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
       return nullptr;
@@ -6437,18 +6466,161 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   if (Offsets.size() != SCEVs.size())
     return nullptr;
   SortedIndices.clear();
-  if (!IsConsecutive) {
-    // Fill SortedIndices array only if it is non-consecutive.
-    SortedIndices.resize(PointerOps.size());
-    Cnt = 0;
-    for (const std::pair<int64_t, int> &Pair : Offsets) {
-      SortedIndices[Cnt] = Pair.second;
-      ++Cnt;
-    }
+  SortedIndices.resize(PointerOps.size());
+  Cnt = 0;
+  for (const std::pair<int64_t, int> &Pair : Offsets) {
+    SortedIndices[Cnt] = Pair.second;
+    ++Cnt;
   }
   return Stride;
 }
 
+// Suppose we are given pointers of the form: %b + x * %s + y * %c
+// where %c is constant. Check if the pointers can be rearranged as follows:
+//  %b + 0 * %s + 0
+//  %b + 0 * %s + 1
+//  %b + 0 * %s + 2
+//  ...
+//  %b + 0 * %s + (w - 1)
+//
+//  %b + 1 * %s + 0
+//  %b + 1 * %s + 1
+//  %b + 1 * %s + 2
+//  ...
+//  %b + 1 * %s + (w - 1)
+//  ...
+//
+//  If the pointers can be rearranged into the above pattern, the memory can be
+//  accessed with strided loads of group width `w` and stride `%s`.
+bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
+                                       Type *ElemTy, Align CommonAlignment,
+                                       SmallVectorImpl<unsigned> &SortedIndices,
+                                       StridedPtrInfo *SPtrInfo) const {
+  // Group the pointers by constant offset.
+  DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+      OffsetToPointerOpIdxMap;
+  for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+    const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+    if (!PtrSCEV)
+      return false;
+
+    const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    int64_t Offset = 0;
+    if (Add) {
+      for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
+        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
+        if (!SC)
+          continue;
+        Offset = (int64_t)(SC->getAPInt().getLimitedValue());
+        break;
+      }
+    }
+    OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+    OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
+  }
+  int NumOffsets = OffsetToPointerOpIdxMap.size();
+
+  const unsigned Sz = PointerOps.size();
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  if (NumOffsets > 1) {
+    if (Sz % NumOffsets != 0)
+      return false;
+    VecSz = Sz / NumOffsets;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   NumOffsets);
+  }
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+  if (!TTI->isTypeLegal(StridedLoadTy) ||
+      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+    return false;
+
+  SmallVector<int> SortedOffsetsV;
+  for (auto [K, V] : OffsetToPointerOpIdxMap) {
+    SortedOffsetsV.push_back(K);
+  }
+  llvm::sort(SortedOffsetsV);
+  if (NumOffsets > 1) {
+    int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if ((CommonDiff) != 1)
+      return false;
+    for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
+      if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+        return false;
+    }
+  }
+
+  int64_t LowestOffset = SortedOffsetsV[0];
+  SmallVector<Value *> &PointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].first;
+  SmallVector<unsigned> &IndicesInAllPointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].second;
+
+  SmallVector<int64_t> Coeffs0;
+  SmallVector<unsigned> SortedIndicesForOffset0;
+  const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE,
+                                          SortedIndicesForOffset0, Coeffs0);
+  if (!Stride0)
+    return false;
+  unsigned NumCoeffs0 = Coeffs0.size();
+  if (NumCoeffs0 * NumOffsets != Sz)
+    return false;
+  llvm::sort(Coeffs0);
+
+  SmallVector<unsigned> SortedIndicesDraft;
+  SortedIndicesDraft.resize(Sz);
+  auto updateSortedIndices =
+      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+          SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+          int64_t OffsetNum) {
+        unsigned Num = 0;
+        for (unsigned Idx : SortedIndicesForOffset) {
+          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+              IndicesInAllPointerOps[Idx];
+          ++Num;
+        }
+      };
+
+  updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+  SmallVector<int64_t> Coeffs;
+  SmallVector<unsigned> SortedIndicesForOffset;
+  for (int i = 1; i < NumOffsets; ++i) {
+    Coeffs.clear();
+    SortedIndicesForOffset.clear();
+
+    int64_t Offset = SortedOffsetsV[i];
+    SmallVector<Value *> &PointerOpsForOffset =
+        OffsetToPointerOpIdxMap[Offset].first;
+    SmallVector<unsigned> &IndicesInAllPointerOps =
+        OffsetToPointerOpIdxMap[Offset].second;
+    const SCEV *StrideWithinGroup = calculateRtStride(
+        PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+      return false;
+    }
+    if (Coeffs.size() != NumCoeffs0)
+      return false;
+    llvm::sort(Coeffs);
+    for (unsigned i = 0; i < NumCoeffs0; ++i) {
+      if (Coeffs[i] != Coeffs0[i])
+        return false;
+    }
+
+    updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+  }
+
+  SortedIndices.clear();
+  SortedIndices = SortedIndicesDraft;
+  if (SPtrInfo) {
+    SPtrInfo->StrideSCEV = Stride0;
+    SPtrInfo->Ty = StridedLoadTy;
+  }
+  return true;
+}
+
 static std::pair<InstructionCost, InstructionCost>
 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
@@ -6776,77 +6948,133 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                               CompressMask, LoadVecTy);
 }
 
-/// Checks if strided loads can be generated out of \p VL loads with pointers \p
-/// PointerOps:
-/// 1. Target with strided load support is detected.
-/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
-/// potential stride <= MaxProfitableLoadStride and the potential stride is
-/// power-of-2 (to avoid perf regressions for the very small number of loads)
-/// and max distance > number of loads, or potential stride is -1.
-/// 3. The loads are ordered, or number of unordered loads <=
-/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
-/// to avoid extra costs for very expensive shuffles).
-/// 4. Any pointer operand is an instruction with the users outside of the
-/// current graph (for masked gathers extra extractelement instructions
-/// might be required).
-static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
-                          ArrayRef<unsigned> Order,
-                          const TargetTransformInfo &TTI, const DataLayout &DL,
-                          ScalarEvolution &SE,
-                          const bool IsAnyPointerUsedOutGraph,
-                          const int64_t Diff) {
-  const size_t Sz = VL.size();
-  const uint64_t AbsoluteDiff = std::abs(Diff);
-  Type *ScalarTy = VL.front()->getType();
-  auto *VecTy = getWidenedType(ScalarTy, Sz);
-  if (IsAnyPointerUsedOutGraph ||
-      (AbsoluteDiff > Sz &&
-       (Sz > MinProfitableStridedLoads ||
-        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
-      Diff == -(static_cast<int64_t>(Sz) - 1)) {
-    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
-    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
+// Same as analyzeRtStrideCandidate, but for constant strides.
+bool BoUpSLP::analyzeConstantStrideCandidate(
+    ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+    int64_t Diff, Value *Ptr0, Value *PtrN) const {
+  const unsigned Sz = PointerOps.size();
+  SmallVector<int64_t> SortedOffsetsFromBase;
+  SortedOffsetsFromBase.resize(Sz);
+  for (unsigned i = 0; i < Sz; ++i) {
+    Value *Ptr =
+        SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
+    SortedOffsetsFromBase[i] =
+        *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+  }
+
+  // Find where the first group ends.
+  assert(SortedOffsetsFromBase.size() > 1);
+  int64_t StrideWithinGroup =
+      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+  unsigned GroupSize = 1;
+  for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+    if (SortedOffsetsFromBase[GroupSize] -
+            SortedOffsetsFromBase[GroupSize - 1] !=
+        StrideWithinGroup)
+      break;
+  }
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  int64_t StrideIntVal = StrideWithinGroup;
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+  if (Sz != GroupSize) {
+    if (Sz % GroupSize != 0)
       return false;
-    Align Alignment =
-        cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
-            ->getAlign();
-    if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
+    VecSz = Sz / GroupSize;
+
+    if (StrideWithinGroup != 1)
       return false;
-    Value *Ptr0;
-    Value *PtrN;
-    if (Order.empty()) {
-      Ptr0 = PointerOps.front();
-      PtrN = PointerOps.back();
-    } else {
-      Ptr0 = PointerOps[Order.front()];
-      PtrN = PointerOps[Order.back()];
-    }
-    // Iterate through all pointers and check if all distances are
-    // unique multiple of Dist.
-    SmallSet<int64_t, 4> Dists;
-    for (Value *Ptr : PointerOps) {
-      int64_t Dist = 0;
-      if (Ptr == PtrN)
-        Dist = Diff;
-      else if (Ptr != Ptr0)
-        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
-      // If the strides are not the same or repeated, we can't
-      // vectorize.
-      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+    unsigned VecSz = Sz / GroupSize;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   GroupSize);
+    StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+    if (!TTI->isTypeLegal(StridedLoadTy) ||
+        !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+      return false;
+
+    unsigned PrevGroupStartIdx = 0;
+    unsigned CurrentGroupStartIdx = GroupSize;
+    int64_t StrideBetweenGroups =
+        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+    StrideIntVal = StrideBetweenGroups;
+    while (CurrentGroupStartIdx != Sz) {
+      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+              SortedOffsetsFromBase[PrevGroupStartIdx] !=
+          StrideBetweenGroups)
         break;
+      PrevGroupStartIdx = CurrentGroupStartIdx;
+      CurrentGroupStartIdx += GroupSize;
+    }
+    if (CurrentGroupStartIdx != Sz)
+      return false;
+
+    auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+                          int64_t StrideWithinGroup) -> bool {
+      unsigned GroupEndIdx = StartIdx + 1;
+      for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+        if (SortedOffsetsFromBase[GroupEndIdx] -
+                SortedOffsetsFromBase[GroupEndIdx - 1] !=
+            StrideWithinGroup)
+          break;
+      }
+      return (GroupEndIdx - StartIdx == GroupSize0);
+    };
+    for (unsigned i = 0; i < Sz; i += GroupSize) {
+      if (!checkGroup(i, GroupSize, StrideWithinGroup))
+        return false;
     }
-    if (Dists.size() == Sz)
-      return true;
+  }
+
+  // Try to generate strided load node if:
+  // 1. Target with strided load support is detected.
+  // 2. The number of loads is greater than MinProfitableStridedLoads,
+  // or the potential stride <= MaxProfitableLoadStride and the
+  // potential stride is power-of-2 (to avoid perf regressions for the very
+  // small number of loads) and max distance > number of loads, or potential
+  // stride is -1.
+  // 3. The loads are ordered, or number of unordered loads <=
+  // MaxProfitableUnorderedLoads, or loads are in reversed order.
+  // (this check is to avoid extra costs for very expensive shuffles).
+  // 4. Any pointer operand is an instruction with the users outside of the
+  // current graph (for masked gathers extra extractelement instructions
+  // might be required).
+
+  if (!TTI->isTypeLegal(StridedLoadTy) ||
+      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+    return false;
+
+  // Simple check if not a strided access - clear order.
+  bool IsPossibleStrided = Diff % (VecSz - 1) == 0;
+  auto IsAnyPointerUsedOutGraph =
+      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
+        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
+                 return !isVectorized(U) && !MustGather.contains(U);
+               });
+      });
+  const unsigned AbsoluteDiff = std::abs(Diff);
+  if (IsAnyPointerUsedOutGraph ||
+      ((VecSz > MinProfitableStridedLoads ||
+        (AbsoluteDiff <= MaxProfitableLoadStride * VecSz &&
+         has_single_bit(AbsoluteDiff))) &&
+       AbsoluteDiff > VecSz) ||
+      Diff == -(static_cast<int>(VecSz) - 1)) {
+    if (SPtrInfo) {
+      Type *StrideTy = DL->getIndexType(Ptr0->getType());
+      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+      SPtrInfo->Ty = StridedLoadTy;
+    }
+    return true;
   }
   return false;
 }
 
-BoUpSLP::LoadsState
-BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
-                           SmallVectorImpl<unsigned> &Order,
-                           SmallVectorImpl<Value *> &PointerOps,
-                           unsigned *BestVF, bool TryRecursiveCheck) const {
+BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
+    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo,
+    unsigned *BestVF, bool TryRecursiveCheck) const {
   // Check that a vectorized load would load the same memory as a scalar
   // load. For example, we don't want to vectorize loads that are smaller
   // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
@@ -6883,11 +7111,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   auto *VecTy = getWidenedType(ScalarTy, Sz);
   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
   if (!IsSorted) {
-    if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
-      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
-          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
-        return LoadsState::StridedVectorize;
-    }
+    if (Sz > MinProfitableStridedLoads &&
+        analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
+                                 SPtrInfo))
+      return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
         TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
@@ -6920,17 +7147,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                              }))
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
-    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
-    // Try to generate strided load node.
-    auto IsAnyPointerUsedOutGraph =
-        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
-          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
-                   return !isVectorized(U) && !MustGather.contains(U);
-                 });
-        });
-    if (IsPossibleStrided &&
-        isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
-                      IsAnyPointerUsedOutGraph, *Diff))
+    if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
+                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7014,9 +7232,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
         SmallVector<unsigned> Order;
         SmallVector<Value *> PointerOps;
-        LoadsState LS =
-            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
-                              /*TryRecursiveCheck=*/false);
+        LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
+                                          PointerOps, SPtrInfo, BestVF,
+                                          /*TryRecursiveCheck=*/false);
         // Check that the sorted loads are consecutive.
         if (LS == LoadsState::Gather) {
           if (BestVF) {
@@ -9193,8 +9411,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
           // Try to build vector load.
           ArrayRef<Value *> Values(
               reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
-          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
-                                            PointerOps, &BestVF);
+          LoadsState LS =
+              canVectorizeLoads(Values, Slice.front(), CurrentOrder, PointerOps,
+                                /*SPtrInfo =*/nullptr, &BestVF);
           if (LS != LoadsState::Gather ||
               (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
             if (LS == LoadsState::ScatterVectorize) {
@@ -9799,7 +10018,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps) {
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
   assert(S.getMainOp() &&
          "Expected instructions with same/alternate opcodes only.");
 
@@ -9901,7 +10120,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         });
       });
     };
-    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
+    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
@@ -11374,8 +11593,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
       UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
   OrdersType CurrentOrder;
   SmallVector<Value *> PointerOps;
+  StridedPtrInfo SPtrInfo;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
   if (State == TreeEntry::NeedToGather) {
     newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
     return;
@@ -11535,6 +11755,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+        TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
         LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                    TE->dump());
         break;
@@ -13030,10 +13251,20 @@ void BoUpSLP::transformNodes() {
         InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
             Instruction::Load, VecTy, BaseLI->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
-        if (StridedCost < OriginalVecCost)
+        if (StridedCost < OriginalVecCost) {
           // Strided load is more profitable than consecutive load + reverse -
           // transform the node to strided load.
+          // TODO: StrideTy =
+          // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+          StridedPtrInfo SPtrInfo;
+          Type *StrideTy = Type::getIntNTy(
+              SE->getContext(),
+              DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
+          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+          SPtrInfo.Ty = VecTy;
+          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
           E.State = TreeEntry::StridedVectorize;
+        }
       }
       break;
     }
@@ -19474,6 +19705,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       LoadInst *LI = cast<LoadInst>(VL0);
       Instruction *NewLI;
+      FixedVectorType *StridedLoadTy = nullptr;
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
@@ -19511,43 +19743,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
         Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
         PO = IsReverseOrder ? PtrN : Ptr0;
-        std::optional<int64_t> Diff = getPointersDiff(
-            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
         Type *StrideTy = DL->getIndexType(PO->getType());
         Value *StrideVal;
-        if (Diff) {
-          int64_t Stride =
-              *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
-          StrideVal =
-              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
-                                             DL->getTypeAllocSize(ScalarTy));
-        } else {
-          SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
-          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
-            return cast<LoadInst>(V)->getPointerOperand();
-          });
-          OrdersType Order;
-          const SCEV *StrideSCEV =
-              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
-          assert(StrideSCEV && "At this point stride should be known");
+        StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+        StridedLoadTy = SPtrInfo.Ty;
+        assert(StridedLoadTy);
+        unsigned StridedLoadEC =
+            StridedLoadTy->getElementCount().getKnownMinValue();
+
+        Value *Stride = SPtrInfo.StrideVal;
+        if (!Stride) {
+          const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+          assert(StrideSCEV);
           SCEVExpander Expander(*SE, *DL, "strided-load-vec");
-          Value *Stride = Expander.expandCodeFor(
-              StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint());
-          Value *NewStride =
-              Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
-          StrideVal = Builder.CreateMul(
-              NewStride,
-              ConstantInt::get(
-                  StrideTy,
-                  (IsReverseOrder ? -1 : 1) *
-                      static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
-        }
+          Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+                                          &*Builder.GetInsertPoint());
+        }
+        Value *NewStride =
+            Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+        StrideVal = Builder.CreateMul(
+            NewStride, ConstantInt::get(
+                           StrideTy, (IsReverseOrder ? -1 : 1) *
+                                         static_cast<int>(
+                                             DL->getTypeAllocSize(ScalarTy))));
         Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
         auto *Inst = Builder.CreateIntrinsic(
             Intrinsic::experimental_vp_strided_load,
-            {VecTy, PO->getType(), StrideTy},
-            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
-             Builder.getInt32(E->Scalars.size())});
+            {StridedLoadTy, PO->getType(), StrideTy},
+            {PO, StrideVal,
+             Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+             Builder.getInt32(StridedLoadEC)});
         Inst->addParamAttr(
             /*ArgNo=*/0,
             Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
@@ -19584,6 +19809,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
                      ? NewLI
                      : ::propagateMetadata(NewLI, E->Scalars);
 
+      if (StridedLoadTy)
+        V = Builder.CreateBitOrPointerCast(V, VecTy);
       V = FinalShuffle(V, E);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
new file mode 100644
index 0000000000000..38cf1214081fa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
@@ -0,0 +1,483 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \
+; RUN: -riscv-v-slp-prefer-alt-opc-vectorization=true \
+; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s
+; Function Attrs: nounwind uwtable vscale_range(8,1024)
+define i32 @x264_pixel_satd_8x4(ptr %pix1, i32  %i_pix1, ptr  %pix2, i32  %i_pix2) {
+; CHECK-LABEL: define i32 @x264_pixel_satd_8x4(
+; CHECK-SAME: ptr [[PIX1:%.*]], i32 [[I_PIX1:%.*]], ptr [[PIX2:%.*]], i32 [[I_PIX2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64
+; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX1]], i64 [[TMP0]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX2]], i64 [[TMP4]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
+; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 [[TMP9]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 [[TMP13]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP14]] to <16 x i8>
+; CHECK-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
+; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16)
+; CHECK-NEXT:    [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]]
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT:    [[TMP53:%.*]] = add nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 20, i32 21, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 29, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT:    [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP68:%.*]] = lshr <16 x i32> [[TMP67]], splat (i32 15)
+; CHECK-NEXT:    [[TMP69:%.*]] = and <16 x i32> [[TMP68]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP70:%.*]] = mul nuw <16 x i32> [[TMP69]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP71:%.*]] = add <16 x i32> [[TMP70]], [[TMP67]]
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <16 x i32> [[TMP71]], [[TMP70]]
+; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP72]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP73]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP73]], 16
+; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
+; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
+; CHECK-NEXT:    ret i32 [[SHR120]]
+;
+entry:
+  %idx.ext = sext i32 %i_pix1 to i64
+  %idx.ext63 = sext i32 %i_pix2 to i64
+  %0 = load i8, ptr %pix1, align 1
+  %conv = zext i8 %0 to i32
+  %1 = load i8, ptr %pix2, align 1
+  %conv2 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx3 = getelementptr inbounds nuw i8, ptr %pix1, i64 4
+  %2 = load i8, ptr %arrayidx3, align 1
+  %conv4 = zext i8 %2 to i32
+  %arrayidx5 = getelementptr inbounds nuw i8, ptr %pix2, i64 4
+  %3 = load i8, ptr %arrayidx5, align 1
+  %conv6 = zext i8 %3 to i32
+  %sub7 = sub nsw i32 %conv4, %conv6
+  %shl = shl nsw i32 %sub7, 16
+  %add = add nsw i32 %shl, %sub
+  %arrayidx8 = getelementptr inbounds nuw i8, ptr %pix1, i64 1
+  %4 = load i8, ptr %arrayidx8, align 1
+  %conv9 = zext i8 %4 to i32
+  %arrayidx10 = getelementptr inbounds nuw i8, ptr %pix2, i64 1
+  %5 = load i8, ptr %arrayidx10, align 1
+  %conv11 = zext i8 %5 to i32
+  %sub12 = sub nsw i32 %conv9, %conv11
+  %arrayidx13 = getelementptr inbounds nuw i8, ptr %pix1, i64 5
+  %6 = load i8, ptr %arrayidx13, align 1
+  %conv14 = zext i8 %6 to i32
+  %arrayidx15 = getelementptr inbounds nuw i8, ptr %pix2, i64 5
+  %7 = load i8, ptr %arrayidx15, align 1
+  %conv16 = zext i8 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %shl18 = shl nsw i32 %sub17, 16
+  %add19 = add nsw i32 %shl18, %sub12
+  %arrayidx20 = getelementptr inbounds nuw i8, ptr %pix1, i64 2
+  %8 = load i8, ptr %arrayidx20, align 1
+  %conv21 = zext i8 %8 to i32
+  %arrayidx22 = getelementptr inbounds nuw i8, ptr %pix2, i64 2
+  %9 = load i8, ptr %arrayidx22, align 1
+  %conv23 = zext i8 %9 to i32
+  %sub24 = sub nsw i32 %conv21, %conv23
+  %arrayidx25 = getelementptr inbounds nuw i8, ptr %pix1, i64 6
+  %10 = load i8, ptr %arrayidx25, align 1
+  %conv26 = zext i8 %10 to i32
+  %arrayidx27 = getelementptr inbounds nuw i8, ptr %pix2, i64 6
+  %11 = load i8, ptr %arrayidx27, align 1
+  %conv28 = zext i8 %11 to i32
+  %sub29 = sub nsw i32 %conv26, %conv28
+  %shl30 = shl nsw i32 %sub29, 16
+  %add31 = add nsw i32 %shl30, %sub24
+  %arrayidx32 = getelementptr inbounds nuw i8, ptr %pix1, i64 3
+  %12 = load i8, ptr %arrayidx32, align 1
+  %conv33 = zext i8 %12 to i32
+  %arrayidx34 = getelementptr inbounds nuw i8, ptr %pix2, i64 3
+  %13 = load i8, ptr %arrayidx34, align 1
+  %conv35 = zext i8 %13 to i32
+  %sub36 = sub nsw i32 %conv33, %conv35
+  %arrayidx37 = getelementptr inbounds nuw i8, ptr %pix1, i64 7
+  %14 = load i8, ptr %arrayidx37, align 1
+  %conv38 = zext i8 %14 to i32
+  %arrayidx39 = getelementptr inbounds nuw i8, ptr %pix2, i64 7
+  %15 = load i8, ptr %arrayidx39, align 1
+  %conv40 = zext i8 %15 to i32
+  %sub41 = sub nsw i32 %conv38, %conv40
+  %shl42 = shl nsw i32 %sub41, 16
+  %add43 = add nsw i32 %shl42, %sub36
+  %add44 = add nsw i32 %add19, %add
+  %sub45 = sub nsw i32 %add, %add19
+  %add46 = add nsw i32 %add43, %add31
+  %sub47 = sub nsw i32 %add31, %add43
+  %add48 = add nsw i32 %add46, %add44
+  %sub51 = sub nsw i32 %add44, %add46
+  %add55 = add nsw i32 %sub47, %sub45
+  %sub59 = sub nsw i32 %sub45, %sub47
+  %add.ptr = getelementptr inbounds i8, ptr %pix1, i64 %idx.ext
+  %add.ptr64 = getelementptr inbounds i8, ptr %pix2, i64 %idx.ext63
+  %16 = load i8, ptr %add.ptr, align 1
+  %conv.1 = zext i8 %16 to i32
+  %17 = load i8, ptr %add.ptr64, align 1
+  %conv2.1 = zext i8 %17 to i32
+  %sub.1 = sub nsw i32 %conv.1, %conv2.1
+  %arrayidx3.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 4
+  %18 = load i8, ptr %arrayidx3.1, align 1
+  %conv4.1 = zext i8 %18 to i32
+  %arrayidx5.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 4
+  %19 = load i8, ptr %arrayidx5.1, align 1
+  %conv6.1 = zext i8 %19 to i32
+  %sub7.1 = sub nsw i32 %conv4.1, %conv6.1
+  %shl.1 = shl nsw i32 %sub7.1, 16
+  %add.1 = add nsw i32 %shl.1, %sub.1
+  %arrayidx8.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 1
+  %20 = load i8, ptr %arrayidx8.1, align 1
+  %conv9.1 = zext i8 %20 to i32
+  %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 1
+  %21 = load i8, ptr %arrayidx10.1, align 1
+  %conv11.1 = zext i8 %21 to i32
+  %sub12.1 = sub nsw i32 %conv9.1, %conv11.1
+  %arrayidx13.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 5
+  %22 = load i8, ptr %arrayidx13.1, align 1
+  %conv14.1 = zext i8 %22 to i32
+  %arrayidx15.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 5
+  %23 = load i8, ptr %arrayidx15.1, align 1
+  %conv16.1 = zext i8 %23 to i32
+  %sub17.1 = sub nsw i32 %conv14.1, %conv16.1
+  %shl18.1 = shl nsw i32 %sub17.1, 16
+  %add19.1 = add nsw i32 %shl18.1, %sub12.1
+  %arrayidx20.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 2
+  %24 = load i8, ptr %arrayidx20.1, align 1
+  %conv21.1 = zext i8 %24 to i32
+  %arrayidx22.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 2
+  %25 = load i8, ptr %arrayidx22.1, align 1
+  %conv23.1 = zext i8 %25 to i32
+  %sub24.1 = sub nsw i32 %conv21.1, %conv23.1
+  %arrayidx25.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 6
+  %26 = load i8, ptr %arrayidx25.1, align 1
+  %conv26.1 = zext i8 %26 to i32
+  %arrayidx27.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 6
+  %27 = load i8, ptr %arrayidx27.1, align 1
+  %conv28.1 = zext i8 %27 to i32
+  %sub29.1 = sub nsw i32 %conv26.1, %conv28.1
+  %shl30.1 = shl nsw i32 %sub29.1, 16
+  %add31.1 = add nsw i32 %shl30.1, %sub24.1
+  %arrayidx32.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 3
+  %28 = load i8, ptr %arrayidx32.1, align 1
+  %conv33.1 = zext i8 %28 to i32
+  %arrayidx34.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 3
+  %29 = load i8, ptr %arrayidx34.1, align 1
+  %conv35.1 = zext i8 %29 to i32
+  %sub36.1 = sub nsw i32 %conv33.1, %conv35.1
+  %arrayidx37.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 7
+  %30 = load i8, ptr %arrayidx37.1, align 1
+  %conv38.1 = zext i8 %30 to i32
+  %arrayidx39.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 7
+  %31 = load i8, ptr %arrayidx39.1, align 1
+  %conv40.1 = zext i8 %31 to i32
+  %sub41.1 = sub nsw i32 %conv38.1, %conv40.1
+  %shl42.1 = shl nsw i32 %sub41.1, 16
+  %add43.1 = add nsw i32 %shl42.1, %sub36.1
+  %add44.1 = add nsw i32 %add19.1, %add.1
+  %sub45.1 = sub nsw i32 %add.1, %add19.1
+  %add46.1 = add nsw i32 %add43.1, %add31.1
+  %sub47.1 = sub nsw i32 %add31.1, %add43.1
+  %add48.1 = add nsw i32 %add46.1, %add44.1
+  %sub51.1 = sub nsw i32 %add44.1, %add46.1
+  %add55.1 = add nsw i32 %sub47.1, %sub45.1
+  %sub59.1 = sub nsw i32 %sub45.1, %sub47.1
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
+  %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63
+  %32 = load i8, ptr %add.ptr.1, align 1
+  %conv.2 = zext i8 %32 to i32
+  %33 = load i8, ptr %add.ptr64.1, align 1
+  %conv2.2 = zext i8 %33 to i32
+  %sub.2 = sub nsw i32 %conv.2, %conv2.2
+  %arrayidx3.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 4
+  %34 = load i8, ptr %arrayidx3.2, align 1
+  %conv4.2 = zext i8 %34 to i32
+  %arrayidx5.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 4
+  %35 = load i8, ptr %arrayidx5.2, align 1
+  %conv6.2 = zext i8 %35 to i32
+  %sub7.2 = sub nsw i32 %conv4.2, %conv6.2
+  %shl.2 = shl nsw i32 %sub7.2, 16
+  %add.2 = add nsw i32 %shl.2, %sub.2
+  %arrayidx8.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 1
+  %36 = load i8, ptr %arrayidx8.2, align 1
+  %conv9.2 = zext i8 %36 to i32
+  %arrayidx10.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 1
+  %37 = load i8, ptr %arrayidx10.2, align 1
+  %conv11.2 = zext i8 %37 to i32
+  %sub12.2 = sub nsw i32 %conv9.2, %conv11.2
+  %arrayidx13.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 5
+  %38 = load i8, ptr %arrayidx13.2, align 1
+  %conv14.2 = zext i8 %38 to i32
+  %arrayidx15.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 5
+  %39 = load i8, ptr %arrayidx15.2, align 1
+  %conv16.2 = zext i8 %39 to i32
+  %sub17.2 = sub nsw i32 %conv14.2, %conv16.2
+  %shl18.2 = shl nsw i32 %sub17.2, 16
+  %add19.2 = add nsw i32 %shl18.2, %sub12.2
+  %arrayidx20.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 2
+  %40 = load i8, ptr %arrayidx20.2, align 1
+  %conv21.2 = zext i8 %40 to i32
+  %arrayidx22.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 2
+  %41 = load i8, ptr %arrayidx22.2, align 1
+  %conv23.2 = zext i8 %41 to i32
+  %sub24.2 = sub nsw i32 %conv21.2, %conv23.2
+  %arrayidx25.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 6
+  %42 = load i8, ptr %arrayidx25.2, align 1
+  %conv26.2 = zext i8 %42 to i32
+  %arrayidx27.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 6
+  %43 = load i8, ptr %arrayidx27.2, align 1
+  %conv28.2 = zext i8 %43 to i32
+  %sub29.2 = sub nsw i32 %conv26.2, %conv28.2
+  %shl30.2 = shl nsw i32 %sub29.2, 16
+  %add31.2 = add nsw i32 %shl30.2, %sub24.2
+  %arrayidx32.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 3
+  %44 = load i8, ptr %arrayidx32.2, align 1
+  %conv33.2 = zext i8 %44 to i32
+  %arrayidx34.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 3
+  %45 = load i8, ptr %arrayidx34.2, align 1
+  %conv35.2 = zext i8 %45 to i32
+  %sub36.2 = sub nsw i32 %conv33.2, %conv35.2
+  %arrayidx37.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 7
+  %46 = load i8, ptr %arrayidx37.2, align 1
+  %conv38.2 = zext i8 %46 to i32
+  %arrayidx39.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 7
+  %47 = load i8, ptr %arrayidx39.2, align 1
+  %conv40.2 = zext i8 %47 to i32
+  %sub41.2 = sub nsw i32 %conv38.2, %conv40.2
+  %shl42.2 = shl nsw i32 %sub41.2, 16
+  %add43.2 = add nsw i32 %shl42.2, %sub36.2
+  %add44.2 = add nsw i32 %add19.2, %add.2
+  %sub45.2 = sub nsw i32 %add.2, %add19.2
+  %add46.2 = add nsw i32 %add43.2, %add31.2
+  %sub47.2 = sub nsw i32 %add31.2, %add43.2
+  %add48.2 = add nsw i32 %add46.2, %add44.2
+  %sub51.2 = sub nsw i32 %add44.2, %add46.2
+  %add55.2 = add nsw i32 %sub47.2, %sub45.2
+  %sub59.2 = sub nsw i32 %sub45.2, %sub47.2
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
+  %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63
+  %48 = load i8, ptr %add.ptr.2, align 1
+  %conv.3 = zext i8 %48 to i32
+  %49 = load i8, ptr %add.ptr64.2, align 1
+  %conv2.3 = zext i8 %49 to i32
+  %sub.3 = sub nsw i32 %conv.3, %conv2.3
+  %arrayidx3.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 4
+  %50 = load i8, ptr %arrayidx3.3, align 1
+  %conv4.3 = zext i8 %50 to i32
+  %arrayidx5.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 4
+  %51 = load i8, ptr %arrayidx5.3, align 1
+  %conv6.3 = zext i8 %51 to i32
+  %sub7.3 = sub nsw i32 %conv4.3, %conv6.3
+  %shl.3 = shl nsw i32 %sub7.3, 16
+  %add.3 = add nsw i32 %shl.3, %sub.3
+  %arrayidx8.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 1
+  %52 = load i8, ptr %arrayidx8.3, align 1
+  %conv9.3 = zext i8 %52 to i32
+  %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 1
+  %53 = load i8, ptr %arrayidx10.3, align 1
+  %conv11.3 = zext i8 %53 to i32
+  %sub12.3 = sub nsw i32 %conv9.3, %conv11.3
+  %arrayidx13.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 5
+  %54 = load i8, ptr %arrayidx13.3, align 1
+  %conv14.3 = zext i8 %54 to i32
+  %arrayidx15.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 5
+  %55 = load i8, ptr %arrayidx15.3, align 1
+  %conv16.3 = zext i8 %55 to i32
+  %sub17.3 = sub nsw i32 %conv14.3, %conv16.3
+  %shl18.3 = shl nsw i32 %sub17.3, 16
+  %add19.3 = add nsw i32 %shl18.3, %sub12.3
+  %arrayidx20.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 2
+  %56 = load i8, ptr %arrayidx20.3, align 1
+  %conv21.3 = zext i8 %56 to i32
+  %arrayidx22.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 2
+  %57 = load i8, ptr %arrayidx22.3, align 1
+  %conv23.3 = zext i8 %57 to i32
+  %sub24.3 = sub nsw i32 %conv21.3, %conv23.3
+  %arrayidx25.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 6
+  %58 = load i8, ptr %arrayidx25.3, align 1
+  %conv26.3 = zext i8 %58 to i32
+  %arrayidx27.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 6
+  %59 = load i8, ptr %arrayidx27.3, align 1
+  %conv28.3 = zext i8 %59 to i32
+  %sub29.3 = sub nsw i32 %conv26.3, %conv28.3
+  %shl30.3 = shl nsw i32 %sub29.3, 16
+  %add31.3 = add nsw i32 %shl30.3, %sub24.3
+  %arrayidx32.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 3
+  %60 = load i8, ptr %arrayidx32.3, align 1
+  %conv33.3 = zext i8 %60 to i32
+  %arrayidx34.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 3
+  %61 = load i8, ptr %arrayidx34.3, align 1
+  %conv35.3 = zext i8 %61 to i32
+  %sub36.3 = sub nsw i32 %conv33.3, %conv35.3
+  %arrayidx37.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 7
+  %62 = load i8, ptr %arrayidx37.3, align 1
+  %conv38.3 = zext i8 %62 to i32
+  %arrayidx39.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 7
+  %63 = load i8, ptr %arrayidx39.3, align 1
+  %conv40.3 = zext i8 %63 to i32
+  %sub41.3 = sub nsw i32 %conv38.3, %conv40.3
+  %shl42.3 = shl nsw i32 %sub41.3, 16
+  %add43.3 = add nsw i32 %shl42.3, %sub36.3
+  %add44.3 = add nsw i32 %add19.3, %add.3
+  %sub45.3 = sub nsw i32 %add.3, %add19.3
+  %add46.3 = add nsw i32 %add43.3, %add31.3
+  %sub47.3 = sub nsw i32 %add31.3, %add43.3
+  %add48.3 = add nsw i32 %add46.3, %add44.3
+  %sub51.3 = sub nsw i32 %add44.3, %add46.3
+  %add55.3 = add nsw i32 %sub47.3, %sub45.3
+  %sub59.3 = sub nsw i32 %sub45.3, %sub47.3
+  %add78 = add nsw i32 %add48.1, %add48
+  %sub86 = sub nsw i32 %add48, %add48.1
+  %add94 = add nsw i32 %add48.3, %add48.2
+  %sub102 = sub nsw i32 %add48.2, %add48.3
+  %add103 = add nsw i32 %add94, %add78
+  %sub104 = sub nsw i32 %add78, %add94
+  %add105 = add nsw i32 %sub102, %sub86
+  %sub106 = sub nsw i32 %sub86, %sub102
+  %shr.i = lshr i32 %add103, 15
+  %and.i = and i32 %shr.i, 65537
+  %mul.i = mul nuw i32 %and.i, 65535
+  %add.i = add i32 %mul.i, %add103
+  %xor.i = xor i32 %add.i, %mul.i
+  %shr.i169 = lshr i32 %add105, 15
+  %and.i170 = and i32 %shr.i169, 65537
+  %mul.i171 = mul nuw i32 %and.i170, 65535
+  %add.i172 = add i32 %mul.i171, %add105
+  %xor.i173 = xor i32 %add.i172, %mul.i171
+  %shr.i174 = lshr i32 %sub104, 15
+  %and.i175 = and i32 %shr.i174, 65537
+  %mul.i176 = mul nuw i32 %and.i175, 65535
+  %add.i177 = add i32 %mul.i176, %sub104
+  %xor.i178 = xor i32 %add.i177, %mul.i176
+  %shr.i179 = lshr i32 %sub106, 15
+  %and.i180 = and i32 %shr.i179, 65537
+  %mul.i181 = mul nuw i32 %and.i180, 65535
+  %add.i182 = add i32 %mul.i181, %sub106
+  %xor.i183 = xor i32 %add.i182, %mul.i181
+  %add110 = add i32 %xor.i173, %xor.i
+  %add112 = add i32 %add110, %xor.i178
+  %add113 = add i32 %add112, %xor.i183
+  %add78.1 = add nsw i32 %add55.1, %add55
+  %sub86.1 = sub nsw i32 %add55, %add55.1
+  %add94.1 = add nsw i32 %add55.3, %add55.2
+  %sub102.1 = sub nsw i32 %add55.2, %add55.3
+  %add103.1 = add nsw i32 %add94.1, %add78.1
+  %sub104.1 = sub nsw i32 %add78.1, %add94.1
+  %add105.1 = add nsw i32 %sub102.1, %sub86.1
+  %sub106.1 = sub nsw i32 %sub86.1, %sub102.1
+  %shr.i.1 = lshr i32 %add103.1, 15
+  %and.i.1 = and i32 %shr.i.1, 65537
+  %mul.i.1 = mul nuw i32 %and.i.1, 65535
+  %add.i.1 = add i32 %mul.i.1, %add103.1
+  %xor.i.1 = xor i32 %add.i.1, %mul.i.1
+  %shr.i169.1 = lshr i32 %add105.1, 15
+  %and.i170.1 = and i32 %shr.i169.1, 65537
+  %mul.i171.1 = mul nuw i32 %and.i170.1, 65535
+  %add.i172.1 = add i32 %mul.i171.1, %add105.1
+  %xor.i173.1 = xor i32 %add.i172.1, %mul.i171.1
+  %shr.i174.1 = lshr i32 %sub104.1, 15
+  %and.i175.1 = and i32 %shr.i174.1, 65537
+  %mul.i176.1 = mul nuw i32 %and.i175.1, 65535
+  %add.i177.1 = add i32 %mul.i176.1, %sub104.1
+  %xor.i178.1 = xor i32 %add.i177.1, %mul.i176.1
+  %shr.i179.1 = lshr i32 %sub106.1, 15
+  %and.i180.1 = and i32 %shr.i179.1, 65537
+  %mul.i181.1 = mul nuw i32 %and.i180.1, 65535
+  %add.i182.1 = add i32 %mul.i181.1, %sub106.1
+  %xor.i183.1 = xor i32 %add.i182.1, %mul.i181.1
+  %add108.1 = add i32 %xor.i173.1, %add113
+  %add110.1 = add i32 %add108.1, %xor.i.1
+  %add112.1 = add i32 %add110.1, %xor.i178.1
+  %add113.1 = add i32 %add112.1, %xor.i183.1
+  %add78.2 = add nsw i32 %sub51.1, %sub51
+  %sub86.2 = sub nsw i32 %sub51, %sub51.1
+  %add94.2 = add nsw i32 %sub51.3, %sub51.2
+  %sub102.2 = sub nsw i32 %sub51.2, %sub51.3
+  %add103.2 = add nsw i32 %add94.2, %add78.2
+  %sub104.2 = sub nsw i32 %add78.2, %add94.2
+  %add105.2 = add nsw i32 %sub102.2, %sub86.2
+  %sub106.2 = sub nsw i32 %sub86.2, %sub102.2
+  %shr.i.2 = lshr i32 %add103.2, 15
+  %and.i.2 = and i32 %shr.i.2, 65537
+  %mul.i.2 = mul nuw i32 %and.i.2, 65535
+  %add.i.2 = add i32 %mul.i.2, %add103.2
+  %xor.i.2 = xor i32 %add.i.2, %mul.i.2
+  %shr.i169.2 = lshr i32 %add105.2, 15
+  %and.i170.2 = and i32 %shr.i169.2, 65537
+  %mul.i171.2 = mul nuw i32 %and.i170.2, 65535
+  %add.i172.2 = add i32 %mul.i171.2, %add105.2
+  %xor.i173.2 = xor i32 %add.i172.2, %mul.i171.2
+  %shr.i174.2 = lshr i32 %sub104.2, 15
+  %and.i175.2 = and i32 %shr.i174.2, 65537
+  %mul.i176.2 = mul nuw i32 %and.i175.2, 65535
+  %add.i177.2 = add i32 %mul.i176.2, %sub104.2
+  %xor.i178.2 = xor i32 %add.i177.2, %mul.i176.2
+  %shr.i179.2 = lshr i32 %sub106.2, 15
+  %and.i180.2 = and i32 %shr.i179.2, 65537
+  %mul.i181.2 = mul nuw i32 %and.i180.2, 65535
+  %add.i182.2 = add i32 %mul.i181.2, %sub106.2
+  %xor.i183.2 = xor i32 %add.i182.2, %mul.i181.2
+  %add108.2 = add i32 %xor.i173.2, %add113.1
+  %add110.2 = add i32 %add108.2, %xor.i.2
+  %add112.2 = add i32 %add110.2, %xor.i178.2
+  %add113.2 = add i32 %add112.2, %xor.i183.2
+  %add78.3 = add nsw i32 %sub59.1, %sub59
+  %sub86.3 = sub nsw i32 %sub59, %sub59.1
+  %add94.3 = add nsw i32 %sub59.3, %sub59.2
+  %sub102.3 = sub nsw i32 %sub59.2, %sub59.3
+  %add103.3 = add nsw i32 %add94.3, %add78.3
+  %sub104.3 = sub nsw i32 %add78.3, %add94.3
+  %add105.3 = add nsw i32 %sub102.3, %sub86.3
+  %sub106.3 = sub nsw i32 %sub86.3, %sub102.3
+  %shr.i.3 = lshr i32 %add103.3, 15
+  %and.i.3 = and i32 %shr.i.3, 65537
+  %mul.i.3 = mul nuw i32 %and.i.3, 65535
+  %add.i.3 = add i32 %mul.i.3, %add103.3
+  %xor.i.3 = xor i32 %add.i.3, %mul.i.3
+  %shr.i169.3 = lshr i32 %add105.3, 15
+  %and.i170.3 = and i32 %shr.i169.3, 65537
+  %mul.i171.3 = mul nuw i32 %and.i170.3, 65535
+  %add.i172.3 = add i32 %mul.i171.3, %add105.3
+  %xor.i173.3 = xor i32 %add.i172.3, %mul.i171.3
+  %shr.i174.3 = lshr i32 %sub104.3, 15
+  %and.i175.3 = and i32 %shr.i174.3, 65537
+  %mul.i176.3 = mul nuw i32 %and.i175.3, 65535
+  %add.i177.3 = add i32 %mul.i176.3, %sub104.3
+  %xor.i178.3 = xor i32 %add.i177.3, %mul.i176.3
+  %shr.i179.3 = lshr i32 %sub106.3, 15
+  %and.i180.3 = and i32 %shr.i179.3, 65537
+  %mul.i181.3 = mul nuw i32 %and.i180.3, 65535
+  %add.i182.3 = add i32 %mul.i181.3, %sub106.3
+  %xor.i183.3 = xor i32 %add.i182.3, %mul.i181.3
+  %add108.3 = add i32 %xor.i173.3, %add113.2
+  %add110.3 = add i32 %add108.3, %xor.i.3
+  %add112.3 = add i32 %add110.3, %xor.i178.3
+  %add113.3 = add i32 %add112.3, %xor.i183.3
+  %conv118 = and i32 %add113.3, 65535
+  %shr = lshr i32 %add113.3, 16
+  %add119 = add nuw nsw i32 %conv118, %shr
+  %shr120 = lshr i32 %add119, 1
+  ret i32 %shr120
+}

>From f144b0513bb91a5a202eac783742d735fd147c77 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:21:30 -0700
Subject: [PATCH 02/17] updated basic-strided-loads.ll

---
 .../RISCV/basic-strided-loads.ll              | 20 +++----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 645dbc49269f0..0135d3c01d9f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -630,25 +630,11 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-LABEL: define void @rt_stride_widen_no_reordering(
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = mul nsw i64 [[STRIDE]], 0
-; CHECK-NEXT:    [[OFFSET4:%.*]] = mul nsw i64 [[STRIDE]], 1
-; CHECK-NEXT:    [[OFFSET8:%.*]] = mul nsw i64 [[STRIDE]], 2
-; CHECK-NEXT:    [[OFFSET12:%.*]] = mul nsw i64 [[STRIDE]], 3
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET4]]
-; CHECK-NEXT:    [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]]
-; CHECK-NEXT:    [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]]
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
 ; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
 ; CHECK-NEXT:    ret void
 ;
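
To see the shape of this change outside the CHECK lines: four groups of four consecutive i8 loads, where group i starts at %pl + i * %stride, collapse into a single <4 x i32> strided load whose lanes are then reinterpreted as <16 x i8>. Below is a minimal IR sketch; the function and value names, the abridged "before" body, and the 16-byte alignment are illustrative only, while the widened form mirrors the CHECK lines above.

```llvm
; Before (abridged): group i of four consecutive i8 loads starts at
; %pl + i * %stride; only group 0 is spelled out here.
define void @before(ptr %pl, i64 %stride, ptr %ps) {
  %g0 = getelementptr inbounds i8, ptr %pl, i64 0
  %p1 = getelementptr inbounds i8, ptr %g0, i64 1
  %p2 = getelementptr inbounds i8, ptr %g0, i64 2
  %p3 = getelementptr inbounds i8, ptr %g0, i64 3
  %l0 = load i8, ptr %g0, align 16
  %l1 = load i8, ptr %p1, align 1
  %l2 = load i8, ptr %p2, align 1
  %l3 = load i8, ptr %p3, align 1
  ; ... groups 1..3 at %pl + 1/2/3 * %stride, then a <16 x i8> store to %ps ...
  ret void
}

; After: one widened strided load covering all four groups, matching the
; CHECK lines above.
define void @after(ptr %pl, i64 %stride, ptr %ps) {
  %g0 = getelementptr inbounds i8, ptr %pl, i64 0
  %s = mul i64 %stride, 1
  %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 %g0, i64 %s, <4 x i1> splat (i1 true), i32 4)
  %b = bitcast <4 x i32> %v to <16 x i8>
  store <16 x i8> %b, ptr %ps, align 16
  ret void
}

declare <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr, i64, <4 x i1>, i32)
```

The byte stride passed to vp.strided.load is the distance between group starts, and the i32 element width encodes the four-byte group width.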

>From 6599a44d3650dcd058a6a7d5ddcc9488c1b5a024 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:41:51 -0700
Subject: [PATCH 03/17] Address a TODO: compute StrideTy from the pointer's index type.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2ff7b8a217623..470dbc9ed9ecb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13254,12 +13254,10 @@ void BoUpSLP::transformNodes() {
         if (StridedCost < OriginalVecCost) {
           // Strided load is more profitable than consecutive load + reverse -
           // transform the node to strided load.
-          // TODO: StrideTy =
-          // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+                                                ->getPointerOperand()
+                                                ->getType());
           StridedPtrInfo SPtrInfo;
-          Type *StrideTy = Type::getIntNTy(
-              SE->getContext(),
-              DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
           SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
           SPtrInfo.Ty = VecTy;
           TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;

>From 2c6f901cda71eecedf3e0a202b3dffa6fb7e88b6 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 12:22:28 -0700
Subject: [PATCH 04/17] addressed review comments.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 112 +++++++++---------
 1 file changed, 53 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 470dbc9ed9ecb..469ab64d72571 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1929,7 +1929,7 @@ class BoUpSLP {
     // strided load.
     FixedVectorType *Ty = nullptr;
   };
-  DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+  SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
 
 public:
   /// Tracks the state we can represent the loads in the given sequence.
@@ -2231,12 +2231,12 @@ class BoUpSLP {
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
-                                StridedPtrInfo *SPtrInfo) const;
+                                StridedPtrInfo &SPtrInfo) const;
 
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
-                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
+                                      StridedPtrInfo &SPtrInfo, int64_t Diff,
                                       Value *Ptr0, Value *PtrN) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
@@ -4500,7 +4500,7 @@ class BoUpSLP {
   TreeEntry::EntryState getScalarsVectorizationState(
       const InstructionsState &S, ArrayRef<Value *> VL,
       bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
+      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
 
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6495,7 +6495,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
-                                       StridedPtrInfo *SPtrInfo) const {
+                                       StridedPtrInfo &SPtrInfo) const {
   // Group the pointers by constant offset.
   DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
@@ -6505,15 +6505,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
 
     const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    if (!Add)
+      return false;
     int64_t Offset = 0;
-    if (Add) {
-      for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
-        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
-        if (!SC)
-          continue;
-        Offset = (int64_t)(SC->getAPInt().getLimitedValue());
-        break;
-      }
+    for (int I : seq<int>(Add->getNumOperands())) {
+      auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+      if (!SC)
+        continue;
+      Offset = SC->getAPInt().getSExtValue();
+      break;
     }
     OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
     OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
@@ -6536,17 +6536,17 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
     return false;
 
-  SmallVector<int> SortedOffsetsV;
+  SmallVector<int64_t> SortedOffsetsV;
   for (auto [K, V] : OffsetToPointerOpIdxMap) {
     SortedOffsetsV.push_back(K);
   }
-  llvm::sort(SortedOffsetsV);
+  sort(SortedOffsetsV);
   if (NumOffsets > 1) {
-    int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
-    if ((CommonDiff) != 1)
+    int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if (CommonDiff != 1)
       return false;
-    for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
-      if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+    for (int I : seq<int>(1, SortedOffsetsV.size() - 1)) {
+      if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff)
         return false;
     }
   }
@@ -6566,31 +6566,29 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
   unsigned NumCoeffs0 = Coeffs0.size();
   if (NumCoeffs0 * NumOffsets != Sz)
     return false;
-  llvm::sort(Coeffs0);
+  sort(Coeffs0);
 
   SmallVector<unsigned> SortedIndicesDraft;
   SortedIndicesDraft.resize(Sz);
-  auto updateSortedIndices =
+  auto UpdateSortedIndices =
       [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
           SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
           int64_t OffsetNum) {
-        unsigned Num = 0;
-        for (unsigned Idx : SortedIndicesForOffset) {
+        for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
           SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
               IndicesInAllPointerOps[Idx];
-          ++Num;
         }
       };
 
-  updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
 
   SmallVector<int64_t> Coeffs;
   SmallVector<unsigned> SortedIndicesForOffset;
-  for (int i = 1; i < NumOffsets; ++i) {
+  for (int I : seq<int>(1, NumOffsets)) {
     Coeffs.clear();
     SortedIndicesForOffset.clear();
 
-    int64_t Offset = SortedOffsetsV[i];
+    int64_t Offset = SortedOffsetsV[I];
     SmallVector<Value *> &PointerOpsForOffset =
         OffsetToPointerOpIdxMap[Offset].first;
     SmallVector<unsigned> &IndicesInAllPointerOps =
@@ -6598,26 +6596,23 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     const SCEV *StrideWithinGroup = calculateRtStride(
         PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
 
-    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
       return false;
-    }
     if (Coeffs.size() != NumCoeffs0)
       return false;
-    llvm::sort(Coeffs);
-    for (unsigned i = 0; i < NumCoeffs0; ++i) {
-      if (Coeffs[i] != Coeffs0[i])
+    sort(Coeffs);
+    for (int J : seq<int>(0, NumCoeffs0)) {
+      if (Coeffs[J] != Coeffs0[J])
         return false;
     }
 
-    updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
   }
 
   SortedIndices.clear();
   SortedIndices = SortedIndicesDraft;
-  if (SPtrInfo) {
-    SPtrInfo->StrideSCEV = Stride0;
-    SPtrInfo->Ty = StridedLoadTy;
-  }
+  SPtrInfo.StrideSCEV = Stride0;
+  SPtrInfo.Ty = StridedLoadTy;
   return true;
 }
 
@@ -6951,20 +6946,21 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 // Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
-    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
     int64_t Diff, Value *Ptr0, Value *PtrN) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
   SortedOffsetsFromBase.resize(Sz);
-  for (unsigned i = 0; i < Sz; ++i) {
+  for (unsigned I : seq<int>(0, Sz)) {
     Value *Ptr =
-        SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
-    SortedOffsetsFromBase[i] =
+        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+    SortedOffsetsFromBase[I] =
         *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
   }
 
   // Find where the first group ends.
-  assert(SortedOffsetsFromBase.size() > 1);
+  assert(SortedOffsetsFromBase.size() > 1 &&
+         "Trying to generate strided load for less than 2 loads");
   int64_t StrideWithinGroup =
       SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
   unsigned GroupSize = 1;
@@ -7011,7 +7007,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
     if (CurrentGroupStartIdx != Sz)
       return false;
 
-    auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+    auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
                           int64_t StrideWithinGroup) -> bool {
       unsigned GroupEndIdx = StartIdx + 1;
       for (; GroupEndIdx != Sz; ++GroupEndIdx) {
@@ -7020,10 +7016,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
             StrideWithinGroup)
           break;
       }
-      return (GroupEndIdx - StartIdx == GroupSize0);
+      return GroupEndIdx - StartIdx == GroupSize0;
     };
-    for (unsigned i = 0; i < Sz; i += GroupSize) {
-      if (!checkGroup(i, GroupSize, StrideWithinGroup))
+    for (unsigned I = 0; I < Sz; I += GroupSize) {
+      if (!CheckGroup(I, GroupSize, StrideWithinGroup))
         return false;
     }
   }
@@ -7061,11 +7057,9 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
          has_single_bit(AbsoluteDiff))) &&
        AbsoluteDiff > VecSz) ||
       Diff == -(static_cast<int>(VecSz) - 1)) {
-    if (SPtrInfo) {
-      Type *StrideTy = DL->getIndexType(Ptr0->getType());
-      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
-      SPtrInfo->Ty = StridedLoadTy;
-    }
+    Type *StrideTy = DL->getIndexType(Ptr0->getType());
+    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+    SPtrInfo.Ty = StridedLoadTy;
     return true;
   }
   return false;
@@ -7113,7 +7107,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   if (!IsSorted) {
     if (Sz > MinProfitableStridedLoads &&
         analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
-                                 SPtrInfo))
+                                 *SPtrInfo))
       return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7148,7 +7142,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
     if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
-                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
+                                       Order, *SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -10018,7 +10012,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
   assert(S.getMainOp() &&
          "Expected instructions with same/alternate opcodes only.");
 
@@ -10120,7 +10114,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         });
       });
     };
-    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
+    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, &SPtrInfo)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
@@ -11595,7 +11589,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
   SmallVector<Value *> PointerOps;
   StridedPtrInfo SPtrInfo;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
   if (State == TreeEntry::NeedToGather) {
     newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
     return;
@@ -19743,16 +19737,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         PO = IsReverseOrder ? PtrN : Ptr0;
         Type *StrideTy = DL->getIndexType(PO->getType());
         Value *StrideVal;
-        StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
         StridedLoadTy = SPtrInfo.Ty;
-        assert(StridedLoadTy);
+        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
         unsigned StridedLoadEC =
             StridedLoadTy->getElementCount().getKnownMinValue();
 
         Value *Stride = SPtrInfo.StrideVal;
         if (!Stride) {
           const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
-          assert(StrideSCEV);
+          assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
           SCEVExpander Expander(*SE, *DL, "strided-load-vec");
           Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                           &*Builder.GetInsertPoint());

>From 1c99fa79b0a2c28877296868e1e9af00bbb559e6 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:26:58 -0700
Subject: [PATCH 05/17] address review comments.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 469ab64d72571..b67eea6a0a896 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2228,11 +2228,29 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
+  // Suppose we are given pointers of the form: %b + x * %s + y * %c
+  // where %c is constant. Check if the pointers can be rearranged as follows:
+  //  %b + 0 * %s + 0
+  //  %b + 0 * %s + 1
+  //  %b + 0 * %s + 2
+  //  ...
+  //  %b + 0 * %s + w
+  //
+  //  %b + 1 * %s + 0
+  //  %b + 1 * %s + 1
+  //  %b + 1 * %s + 2
+  //  ...
+  //  %b + 1 * %s + w
+  //  ...
+  //
+  //  If the pointers can be rearanged in the above pattern, it means that the
+  //  memory can be accessed with a strided loads of width `w` and stride `%s`.
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo &SPtrInfo) const;
 
+  // Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
@@ -6537,9 +6555,8 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     return false;
 
   SmallVector<int64_t> SortedOffsetsV;
-  for (auto [K, V] : OffsetToPointerOpIdxMap) {
+  for (auto [K, V] : OffsetToPointerOpIdxMap)
     SortedOffsetsV.push_back(K);
-  }
   sort(SortedOffsetsV);
   if (NumOffsets > 1) {
     int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
@@ -6596,17 +6613,16 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     const SCEV *StrideWithinGroup = calculateRtStride(
         PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
 
-    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
+    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
       return false;
     if (Coeffs.size() != NumCoeffs0)
       return false;
     sort(Coeffs);
-    for (int J : seq<int>(0, NumCoeffs0)) {
-      if (Coeffs[J] != Coeffs0[J])
-        return false;
-    }
+    if (Coeffs != Coeffs0)
+      return false
 
-    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
+          UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
+                              I);
   }
 
   SortedIndices.clear();
@@ -6951,7 +6967,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
   SortedOffsetsFromBase.resize(Sz);
-  for (unsigned I : seq<int>(0, Sz)) {
+  for (unsigned I : seq<int>(Sz)) {
     Value *Ptr =
         SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
     SortedOffsetsFromBase[I] =

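The comment added here covers the run-time-stride case; analyzeConstantStrideCandidate applies the same grouping idea when all offsets are constants. A hedged IR sketch of that case follows (the offsets, the 64-byte stride, and the names are invented for illustration; only the overall widened form comes from the PR):

```llvm
; Sixteen i8 loads at byte offsets 0..3, 64..67, 128..131 and 192..195 from
; %base form four width-4 groups whose starts are 64 bytes apart, so they can
; be widened into one <4 x i32> strided load (offsets and stride are
; illustrative only).
define <16 x i8> @const_stride_sketch(ptr %base) {
  %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 4 %base, i64 64, <4 x i1> splat (i1 true), i32 4)
  %b = bitcast <4 x i32> %v to <16 x i8>
  ret <16 x i8> %b
}

declare <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr, i64, <4 x i1>, i32)
```

In SPtrInfo terms, such a pattern records the stride as a constant in StrideVal and the widened <4 x i32> type in Ty.
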
>From bb294ad9ec1ecc837daa4a89e5d7a7b4672dda8c Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:56:01 -0700
Subject: [PATCH 06/17] added cost estimation

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b67eea6a0a896..4481f7ac3805a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6619,7 +6619,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
     sort(Coeffs);
     if (Coeffs != Coeffs0)
-      return false
+      return false;
 
           UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
                               I);
@@ -15056,11 +15056,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         }
         break;
       case TreeEntry::StridedVectorize: {
+        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
+        if (StridedLoadTy != VecTy)
+          VecLdCost +=
+              TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+                                    getCastContextHint(*E), CostKind);
+
         break;
       }
       case TreeEntry::CompressVectorize: {

>From 74b136d5a5a90de05cdd981cad90737e6a59e7ac Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 09:13:25 -0700
Subject: [PATCH 07/17] format

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4481f7ac3805a..f98dd2f1fff1a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6621,8 +6621,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     if (Coeffs != Coeffs0)
       return false;
 
-          UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
-                              I);
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
   }
 
   SortedIndices.clear();

>From 0087b20faa7bacf1915df9211daf940153846673 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:43:09 -0700
Subject: [PATCH 08/17] change references back to pointers because
 canVectorizeLoads can be called with nullptr

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f98dd2f1fff1a..7f5d876e5b379 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2248,13 +2248,13 @@ class BoUpSLP {
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
-                                StridedPtrInfo &SPtrInfo) const;
+                                StridedPtrInfo *SPtrInfo) const;
 
   // Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
-                                      StridedPtrInfo &SPtrInfo, int64_t Diff,
+                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
                                       Value *Ptr0, Value *PtrN) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
@@ -6513,7 +6513,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
-                                       StridedPtrInfo &SPtrInfo) const {
+                                       StridedPtrInfo *SPtrInfo) const {
   // Group the pointers by constant offset.
   DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
@@ -6626,8 +6626,10 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
 
   SortedIndices.clear();
   SortedIndices = SortedIndicesDraft;
-  SPtrInfo.StrideSCEV = Stride0;
-  SPtrInfo.Ty = StridedLoadTy;
+  if (SPtrInfo) {
+    SPtrInfo->StrideSCEV = Stride0;
+    SPtrInfo->Ty = StridedLoadTy;
+  }
   return true;
 }
 
@@ -6961,7 +6963,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 // Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
-    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
     int64_t Diff, Value *Ptr0, Value *PtrN) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
@@ -7073,8 +7075,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
        AbsoluteDiff > VecSz) ||
       Diff == -(static_cast<int>(VecSz) - 1)) {
     Type *StrideTy = DL->getIndexType(Ptr0->getType());
-    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
-    SPtrInfo.Ty = StridedLoadTy;
+    if (SPtrInfo) {
+      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+      SPtrInfo->Ty = StridedLoadTy;
+    }
     return true;
   }
   return false;
@@ -7122,7 +7126,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   if (!IsSorted) {
     if (Sz > MinProfitableStridedLoads &&
         analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
-                                 *SPtrInfo))
+                                 SPtrInfo))
       return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7157,7 +7161,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
     if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
-                                       Order, *SPtrInfo, *Diff, Ptr0, PtrN))
+                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||

>From ec7da53eefb3ead72bb3362f54e88efac8801f4a Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:52:28 -0700
Subject: [PATCH 09/17] Use three slashes for doc comments, delete an incorrect comment.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 99 ++++++++++---------
 1 file changed, 50 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7f5d876e5b379..c0434ce0e0b53 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1916,17 +1916,15 @@ class BoUpSLP {
   class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
-  // If we decide to generate strided load / store, this struct contains all the
-  // necessary info. It's fields are calculated by analyzeRtStrideCandidate and
-  // analyzeConstantStrideCandidate. Note that Stride can be given either as a
-  // SCEV or as a Value if it already exists.
-  // To get the stride in bytes, StrideVal (or value obtained from StrideSCEV)
-  // has to by multiplied by the size of element of FixedVectorType.
+  /// If we decide to generate strided load / store, this struct contains all
+  /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
+  /// and analyzeConstantStrideCandidate. Note that Stride can be given either
+  /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
+  /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
+  /// the element size of the FixedVectorType.
   struct StridedPtrInfo {
     Value *StrideVal = nullptr;
     const SCEV *StrideSCEV = nullptr;
-    // Represents the ammount which needs to be added to the base pointer of
-    // strided load.
     FixedVectorType *Ty = nullptr;
   };
   SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
@@ -2228,29 +2226,29 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
-  // Suppose we are given pointers of the form: %b + x * %s + y * %c
-  // where %c is constant. Check if the pointers can be rearranged as follows:
-  //  %b + 0 * %s + 0
-  //  %b + 0 * %s + 1
-  //  %b + 0 * %s + 2
-  //  ...
-  //  %b + 0 * %s + w
-  //
-  //  %b + 1 * %s + 0
-  //  %b + 1 * %s + 1
-  //  %b + 1 * %s + 2
-  //  ...
-  //  %b + 1 * %s + w
-  //  ...
-  //
-  //  If the pointers can be rearanged in the above pattern, it means that the
-  //  memory can be accessed with a strided loads of width `w` and stride `%s`.
+  /// Suppose we are given pointers of the form: %b + x * %s + y * %c
+  /// where %c is constant. Check if the pointers can be rearranged as follows:
+  ///  %b + 0 * %s + 0
+  ///  %b + 0 * %s + 1
+  ///  %b + 0 * %s + 2
+  ///  ...
+  ///  %b + 0 * %s + w
+  ///
+  ///  %b + 1 * %s + 0
+  ///  %b + 1 * %s + 1
+  ///  %b + 1 * %s + 2
+  ///  ...
+  ///  %b + 1 * %s + w
+  ///  ...
+  ///
+  ///  If the pointers can be rearranged in the above pattern, it means that the
+  ///  memory can be accessed with strided loads of width `w` and stride `%s`.
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo *SPtrInfo) const;
 
-  // Same as analyzeRtStrideCandidate, but for constant strides.
+  /// Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
@@ -6493,23 +6491,23 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   return Stride;
 }
 
-// Suppose we are given pointers of the form: %b + x * %s + y * %c
-// where %c is constant. Check if the pointers can be rearranged as follows:
-//  %b + 0 * %s + 0
-//  %b + 0 * %s + 1
-//  %b + 0 * %s + 2
-//  ...
-//  %b + 0 * %s + w
-//
-//  %b + 1 * %s + 0
-//  %b + 1 * %s + 1
-//  %b + 1 * %s + 2
-//  ...
-//  %b + 1 * %s + w
-//  ...
-//
-//  If the pointers can be rearanged in the above pattern, it means that the
-//  memory can be accessed with a strided loads of width `w` and stride `%s`.
+/// Suppose we are given pointers of the form: %b + x * %s + y * %c
+/// where %c is constant. Check if the pointers can be rearranged as follows:
+///  %b + 0 * %s + 0
+///  %b + 0 * %s + 1
+///  %b + 0 * %s + 2
+///  ...
+///  %b + 0 * %s + w
+///
+///  %b + 1 * %s + 0
+///  %b + 1 * %s + 1
+///  %b + 1 * %s + 2
+///  ...
+///  %b + 1 * %s + w
+///  ...
+///
+///  If the pointers can be rearranged in the above pattern, it means that the
+///  memory can be accessed with strided loads of width `w` and stride `%s`.
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
@@ -6960,7 +6958,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                               CompressMask, LoadVecTy);
 }
 
-// Same as analyzeRtStrideCandidate, but for constant strides.
+/// Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
     SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
@@ -15065,12 +15063,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+            Instruction::Load, VecTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
-        if (StridedLoadTy != VecTy)
-          VecLdCost +=
-              TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
-                                    getCastContextHint(*E), CostKind);
+        //VecLdCost = TTI->getStridedMemoryOpCost(
+        //    Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+        //    /*VariableMask=*/false, CommonAlignment, CostKind);
+        //if (StridedLoadTy != VecTy)
+        //  VecLdCost +=
+        //      TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+        //                            getCastContextHint(*E), CostKind);
 
         break;
       }

>From 73f0746a1749e6fff1f20e679ff7f685c611eaf1 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:10:11 -0700
Subject: [PATCH 10/17] revert a change made by mistake.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c0434ce0e0b53..7e7b5c9fe475b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6521,15 +6521,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
 
     const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
-    if (!Add)
-      return false;
     int64_t Offset = 0;
-    for (int I : seq<int>(Add->getNumOperands())) {
-      auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
-      if (!SC)
-        continue;
-      Offset = SC->getAPInt().getSExtValue();
-      break;
+    if (Add) {
+      for (int I : seq<int>(Add->getNumOperands())) {
+        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+        if (!SC)
+          continue;
+        Offset = SC->getAPInt().getSExtValue();
+        break;
+      }
     }
     OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
     OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);

>From c4b907f77c07255e2c7f06760ab15a45d2155584 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:13:57 -0700
Subject: [PATCH 11/17] Another DenseMap -> SmallDenseMap

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7e7b5c9fe475b..6983bb8597aa9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6513,7 +6513,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        SmallVectorImpl<unsigned> &SortedIndices,
                                        StridedPtrInfo *SPtrInfo) const {
   // Group the pointers by constant offset.
-  DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
   for (auto [Idx, Ptr] : enumerate(PointerOps)) {
     const SCEV *PtrSCEV = SE->getSCEV(Ptr);

>From 0b757cbda271ab88504f513ee19e8abd48e5022b Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:21:53 -0700
Subject: [PATCH 12/17] put cost estimation back and make the order of type
 operands correct.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6983bb8597aa9..587a8e10b3537 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15063,15 +15063,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
-        //VecLdCost = TTI->getStridedMemoryOpCost(
-        //    Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
-        //    /*VariableMask=*/false, CommonAlignment, CostKind);
-        //if (StridedLoadTy != VecTy)
-        //  VecLdCost +=
-        //      TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
-        //                            getCastContextHint(*E), CostKind);
+        if (StridedLoadTy != VecTy)
+          VecLdCost +=
+              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+                                    getCastContextHint(*E), CostKind);
 
         break;
       }

>From 956baf8777a6a24a62949014273e0f237f50fd88 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Sat, 30 Aug 2025 02:05:37 -0700
Subject: [PATCH 13/17] rebased on latest main

---
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5fc73f66a0030..1ca513214f67c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,11 +37,6 @@ static cl::opt<unsigned> SLPMaxVF(
         "exclusively by SLP vectorizer."),
     cl::Hidden);
 
-static cl::opt<bool> SLPPreferAltOpcVectorization(
-    "riscv-v-slp-prefer-alt-opc-vectorization",
-    cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
-    cl::Hidden);
-
 static cl::opt<unsigned>
     RVVMinTripCount("riscv-v-min-trip-count",
                     cl::desc("Set the lower bound of a trip count to decide on "
@@ -3057,7 +3052,3 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   }
   return Options;
 }
-
-bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
-  return SLPPreferAltOpcVectorization;
-}

>From 53ddf5003818e8333a9dfe8ae682174d25df1f6b Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Sun, 31 Aug 2025 12:19:00 -0700
Subject: [PATCH 14/17] fixed RUN lines in x264 test

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp           | 3 ++-
 llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 587a8e10b3537..1c1bd11df9163 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6386,7 +6386,8 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
 /// Otherwise, SCEV* of the stride value is returned.
 static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                      const DataLayout &DL, ScalarEvolution &SE,
-                                     SmallVectorImpl<unsigned> &SortedIndices) {
+                                     SmallVectorImpl<unsigned> &SortedIndices,
+                                     SmallVectorImpl<int64_t> &Coeffs) {
   SmallVector<const SCEV *> SCEVs;
   const SCEV *PtrSCEVLowest = nullptr;
   const SCEV *PtrSCEVHighest = nullptr;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
index 38cf1214081fa..33249a8e66657 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \
-; RUN: -riscv-v-slp-prefer-alt-opc-vectorization=true \
 ; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s
 ; Function Attrs: nounwind uwtable vscale_range(8,1024)
 define i32 @x264_pixel_satd_8x4(ptr %pix1, i32  %i_pix1, ptr  %pix2, i32  %i_pix2) {

>From 36aa40f8ac8433f4df38c4c0b3edd9a074d52852 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 3 Sep 2025 08:34:01 -0700
Subject: [PATCH 15/17] Pass SPtrInfo by reference + put braces around
 "Coeffs.push_back(0)"

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 57 ++++++++++---------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1c1bd11df9163..0e148ad9d9bae 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2246,13 +2246,13 @@ class BoUpSLP {
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
-                                StridedPtrInfo *SPtrInfo) const;
+                                StridedPtrInfo &SPtrInfo) const;
 
   /// Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
-                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
+                                      StridedPtrInfo &SPtrInfo, int64_t Diff,
                                       Value *Ptr0, Value *PtrN) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
@@ -2268,7 +2268,7 @@ class BoUpSLP {
   LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                SmallVectorImpl<unsigned> &Order,
                                SmallVectorImpl<Value *> &PointerOps,
-                               StridedPtrInfo *SPtrInfo = nullptr,
+                               StridedPtrInfo &SPtrInfo,
                                unsigned *BestVF = nullptr,
                                bool TryRecursiveCheck = true) const;
 
@@ -6468,8 +6468,9 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                ->isZero())
         return nullptr;
       Dist = SC->getAPInt().getZExtValue();
-    } else
+    } else {
       Coeffs.push_back(0);
+    }
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
       return nullptr;
@@ -6512,7 +6513,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
-                                       StridedPtrInfo *SPtrInfo) const {
+                                       StridedPtrInfo &SPtrInfo) const {
   // Group the pointers by constant offset.
   SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
@@ -6625,10 +6626,8 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
 
   SortedIndices.clear();
   SortedIndices = SortedIndicesDraft;
-  if (SPtrInfo) {
-    SPtrInfo->StrideSCEV = Stride0;
-    SPtrInfo->Ty = StridedLoadTy;
-  }
+  SPtrInfo.StrideSCEV = Stride0;
+  SPtrInfo.Ty = StridedLoadTy;
   return true;
 }
 
@@ -6962,7 +6961,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 /// Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
-    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
     int64_t Diff, Value *Ptr0, Value *PtrN) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
@@ -7074,10 +7073,8 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
        AbsoluteDiff > VecSz) ||
       Diff == -(static_cast<int>(VecSz) - 1)) {
     Type *StrideTy = DL->getIndexType(Ptr0->getType());
-    if (SPtrInfo) {
-      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
-      SPtrInfo->Ty = StridedLoadTy;
-    }
+    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+    SPtrInfo.Ty = StridedLoadTy;
     return true;
   }
   return false;
@@ -7085,7 +7082,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
 
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
-    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo,
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
     unsigned *BestVF, bool TryRecursiveCheck) const {
   // Check that a vectorized load would load the same memory as a scalar
   // load. For example, we don't want to vectorize loads that are smaller
@@ -7919,8 +7916,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
     if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
       SmallVector<Value *> PointerOps;
       OrdersType CurrentOrder;
+      StridedPtrInfo SPtrInfo;
       LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
-                                         CurrentOrder, PointerOps);
+                                         CurrentOrder, PointerOps, SPtrInfo);
       if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
           Res == LoadsState::CompressVectorize)
         return std::move(CurrentOrder);
@@ -9423,9 +9421,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
           // Try to build vector load.
           ArrayRef<Value *> Values(
               reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
-          LoadsState LS =
-              canVectorizeLoads(Values, Slice.front(), CurrentOrder, PointerOps,
-                                /*SPtrInfo =*/nullptr, &BestVF);
+          StridedPtrInfo SPtrInfo;
+          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
+                                            PointerOps, SPtrInfo, &BestVF);
           if (LS != LoadsState::Gather ||
               (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
             if (LS == LoadsState::ScatterVectorize) {
@@ -9619,6 +9617,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                 unsigned VF = *CommonVF;
                 OrdersType Order;
                 SmallVector<Value *> PointerOps;
+                StridedPtrInfo SPtrInfo;
                 // Segmented load detected - vectorize at maximum vector factor.
                 if (InterleaveFactor <= Slice.size() &&
                     TTI.isLegalInterleavedAccessType(
@@ -9627,8 +9626,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                         cast<LoadInst>(Slice.front())->getAlign(),
                         cast<LoadInst>(Slice.front())
                             ->getPointerAddressSpace()) &&
-                    canVectorizeLoads(Slice, Slice.front(), Order,
-                                      PointerOps) == LoadsState::Vectorize) {
+                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
+                                      SPtrInfo) == LoadsState::Vectorize) {
                   UserMaxVF = InterleaveFactor * VF;
                 } else {
                   InterleaveFactor = 0;
@@ -9650,8 +9649,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                            ArrayRef<Value *> VL = TE.Scalars;
                            OrdersType Order;
                            SmallVector<Value *> PointerOps;
+                           StridedPtrInfo SPtrInfo;
                            LoadsState State = canVectorizeLoads(
-                               VL, VL.front(), Order, PointerOps);
+                               VL, VL.front(), Order, PointerOps, SPtrInfo);
                            if (State == LoadsState::ScatterVectorize ||
                                State == LoadsState::CompressVectorize)
                              return false;
@@ -9669,11 +9669,11 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                          [&, Slice = Slice](unsigned Idx) {
                            OrdersType Order;
                            SmallVector<Value *> PointerOps;
+                           StridedPtrInfo SPtrInfo;
                            return canVectorizeLoads(
                                       Slice.slice(Idx * UserMaxVF, UserMaxVF),
-                                      Slice[Idx * UserMaxVF], Order,
-                                      PointerOps) ==
-                                  LoadsState::ScatterVectorize;
+                                      Slice[Idx * UserMaxVF], Order, PointerOps,
+                                      SPtrInfo) == LoadsState::ScatterVectorize;
                          }))
                 UserMaxVF = MaxVF;
               if (Slice.size() != ConsecutiveNodesSize)
@@ -10132,7 +10132,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         });
       });
     };
-    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, &SPtrInfo)) {
+    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
@@ -13156,8 +13156,9 @@ void BoUpSLP::transformNodes() {
               if (S.getOpcode() == Instruction::Load) {
                 OrdersType Order;
                 SmallVector<Value *> PointerOps;
-                LoadsState Res =
-                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
+                StridedPtrInfo SPtrInfo;
+                LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
+                                                   PointerOps, SPtrInfo);
                 AllStrided &= Res == LoadsState::StridedVectorize ||
                               Res == LoadsState::ScatterVectorize ||
                               Res == LoadsState::Gather;

>From 37b8ad4572111b4de492ff4a53e07eaf0ecc3294 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 3 Sep 2025 08:41:46 -0700
Subject: [PATCH 16/17] addressed review comments

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0e148ad9d9bae..7c4d72422cd97 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6522,11 +6522,11 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     if (!PtrSCEV)
       return false;
 
-    const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
     int64_t Offset = 0;
     if (Add) {
       for (int I : seq<int>(Add->getNumOperands())) {
-        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+        const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
         if (!SC)
           continue;
         Offset = SC->getAPInt().getSExtValue();

>From 60864f1481d7f2b6077d880600199d02a816b898 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Thu, 11 Sep 2025 05:57:00 -0700
Subject: [PATCH 17/17] addressed review comments.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7c4d72422cd97..c2158ed7620ca 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6555,7 +6555,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     return false;
 
   SmallVector<int64_t> SortedOffsetsV;
-  for (auto [K, V] : OffsetToPointerOpIdxMap)
+  for (auto [K, _] : OffsetToPointerOpIdxMap)
     SortedOffsetsV.push_back(K);
   sort(SortedOffsetsV);
   if (NumOffsets > 1) {
@@ -6966,7 +6966,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
   SortedOffsetsFromBase.resize(Sz);
-  for (unsigned I : seq<int>(Sz)) {
+  for (unsigned I : seq<unsigned>(Sz)) {
     Value *Ptr =
         SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
     SortedOffsetsFromBase[I] =


