[llvm] [SLPVectorizer] Widen strided loads. (PR #153074)

Sun Aug 31 12:23:06 PDT 2025

https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/153074

>From 227f1c1a2740955ad27e0e67e949db30c4e2eca9 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 8 Aug 2025 07:52:29 -0700
Subject: [PATCH 01/14] [SLPVectorizer] Widen strided loads.

Currently SLPVectorizer can generate strided loads only for this
pattern:

```
load %base + 0 * %stride
load %base + 1 * %stride
...
load %base + n * %stride
```
In this PR we extend it to this pattern:

```
; load w consecutive elements starting at %base
load %base + 0 * %stride + 0
load %base + 0 * %stride + 1
load %base + 0 * %stride + 2
...
load %base + 0 * %stride + (w - 1)

; load w consecutive elements starting at %base + 1 * %stride
load %base + 1 * %stride + 0
load %base + 1 * %stride + 1
load %base + 1 * %stride + 2
...
load %base + 1 * %stride + (w - 1)
...
; load w consecutive elements starting at %base + n * %stride
load %base + n * %stride + 0
load %base + n * %stride + 1
load %base + n * %stride + 2
...
load %base + n * %stride + (w - 1)
```
This works for both run-time and constant strides.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   9 +
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 538 +++++++++++++-----
 .../SLPVectorizer/RISCV/x264-satd-8x4.ll      | 483 ++++++++++++++++
 3 files changed, 872 insertions(+), 158 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5e300182657d5..8c53bd9265302 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,6 +37,11 @@ static cl::opt<unsigned> SLPMaxVF(
         "exclusively by SLP vectorizer."),
     cl::Hidden);
 
+static cl::opt<bool> SLPPreferAltOpcVectorization(
+    "riscv-v-slp-prefer-alt-opc-vectorization",
+    cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
+    cl::Hidden);
+
 static cl::opt<unsigned>
     RVVMinTripCount("riscv-v-min-trip-count",
                     cl::desc("Set the lower bound of a trip count to decide on "
@@ -3040,3 +3045,7 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   }
   return Options;
 }
+
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+  return SLPPreferAltOpcVectorization;
+}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4d4f34a0bdd38..ab2b21919d819 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1918,6 +1918,21 @@ class BoUpSLP {
   class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
+  // If we decide to generate strided load / store, this struct contains all the
+  // necessary info. Its fields are calculated by analyzeRtStrideCandidate and
+  // analyzeConstantStrideCandidate. Note that Stride can be given either as a
+  // SCEV or as a Value if it already exists.
+  // To get the stride in bytes, StrideVal (or value obtained from StrideSCEV)
+  // has to be multiplied by the size of element of FixedVectorType.
+  struct StridedPtrInfo {
+    Value *StrideVal = nullptr;
+    const SCEV *StrideSCEV = nullptr;
+    // Vector type of the strided load to emit; its element type may be a
+    // widened integer covering several consecutive scalar elements.
+    FixedVectorType *Ty = nullptr;
+  };
+  DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+
 public:
   /// Tracks the state we can represent the loads in the given sequence.
   enum class LoadsState {
@@ -2078,6 +2093,7 @@ class BoUpSLP {
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
     ValueToGatherNodes.clear();
+    TreeEntryToStridedPtrInfoMap.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -2214,6 +2230,17 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
+  bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
+                                Align CommonAlignment,
+                                SmallVectorImpl<unsigned> &SortedIndices,
+                                StridedPtrInfo *SPtrInfo) const;
+
+  bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
+                                      Type *ElemTy, Align CommonAlignment,
+                                      SmallVectorImpl<unsigned> &SortedIndices,
+                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
+                                      Value *Ptr0, Value *PtrN) const;
+
   /// Checks if the given array of loads can be represented as a vectorized,
   /// scatter or just simple gather.
   /// \param VL list of loads.
@@ -2227,6 +2254,7 @@ class BoUpSLP {
   LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                SmallVectorImpl<unsigned> &Order,
                                SmallVectorImpl<Value *> &PointerOps,
+                               StridedPtrInfo *SPtrInfo = nullptr,
                                unsigned *BestVF = nullptr,
                                bool TryRecursiveCheck = true) const;
 
@@ -4471,11 +4499,10 @@ class BoUpSLP {
 
   /// Checks if the specified list of the instructions/values can be vectorized
   /// and fills required data before actual scheduling of the instructions.
-  TreeEntry::EntryState
-  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
-                               bool IsScatterVectorizeUserTE,
-                               OrdersType &CurrentOrder,
-                               SmallVectorImpl<Value *> &PointerOps);
+  TreeEntry::EntryState getScalarsVectorizationState(
+      const InstructionsState &S, ArrayRef<Value *> VL,
+      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
 
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6324,18 +6351,12 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
   });
 }
 
-/// Checks if the provided list of pointers \p Pointers represents the strided
-/// pointers for type ElemTy. If they are not, std::nullopt is returned.
-/// Otherwise, if \p Inst is not specified, just initialized optional value is
-/// returned to show that the pointers represent strided pointers. If \p Inst
-/// specified, the runtime stride is materialized before the given \p Inst.
-/// \returns std::nullopt if the pointers are not pointers with the runtime
-/// stride, nullptr or actual stride value, otherwise.
-static std::optional<Value *>
-calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
-                  const DataLayout &DL, ScalarEvolution &SE,
-                  SmallVectorImpl<unsigned> &SortedIndices,
-                  Instruction *Inst = nullptr) {
+/// Returns a SCEV expression for the stride if PointerOps is a set of strided
+/// pointers, or nullptr otherwise.
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+                                     const DataLayout &DL, ScalarEvolution &SE,
+                                     SmallVectorImpl<unsigned> &SortedIndices,
+                                     SmallVectorImpl<int64_t> &Coeffs) {
   SmallVector<const SCEV *> SCEVs;
   const SCEV *PtrSCEVLowest = nullptr;
   const SCEV *PtrSCEVHighest = nullptr;
@@ -6344,7 +6365,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   for (Value *Ptr : PointerOps) {
     const SCEV *PtrSCEV = SE.getSCEV(Ptr);
     if (!PtrSCEV)
-      return std::nullopt;
+      return nullptr;
     SCEVs.push_back(PtrSCEV);
     if (!PtrSCEVLowest && !PtrSCEVHighest) {
       PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
@@ -6352,14 +6373,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
     }
     const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
     if (isa<SCEVCouldNotCompute>(Diff))
-      return std::nullopt;
+      return nullptr;
     if (Diff->isNonConstantNegative()) {
       PtrSCEVLowest = PtrSCEV;
       continue;
     }
     const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
     if (isa<SCEVCouldNotCompute>(Diff1))
-      return std::nullopt;
+      return nullptr;
     if (Diff1->isNonConstantNegative()) {
       PtrSCEVHighest = PtrSCEV;
       continue;
@@ -6368,7 +6389,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   // Dist = PtrSCEVHighest - PtrSCEVLowest;
   const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
   if (isa<SCEVCouldNotCompute>(Dist))
-    return std::nullopt;
+    return nullptr;
   int Size = DL.getTypeStoreSize(ElemTy);
   auto TryGetStride = [&](const SCEV *Dist,
                           const SCEV *Multiplier) -> const SCEV * {
@@ -6389,10 +6410,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
     const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
     Stride = TryGetStride(Dist, Sz);
     if (!Stride)
-      return std::nullopt;
+      return nullptr;
   }
   if (!Stride || isa<SCEVConstant>(Stride))
-    return std::nullopt;
+    return nullptr;
   // Iterate through all pointers and check if all distances are
   // unique multiple of Stride.
   using DistOrdPair = std::pair<int64_t, int>;
@@ -6406,42 +6427,184 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
       const SCEV *Coeff = TryGetStride(Diff, Stride);
       if (!Coeff)
-        return std::nullopt;
+        return nullptr;
       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
       if (!SC || isa<SCEVCouldNotCompute>(SC))
-        return std::nullopt;
+        return nullptr;
+      Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                   SE.getMulExpr(Stride, SC)))
                ->isZero())
-        return std::nullopt;
+        return nullptr;
       Dist = SC->getAPInt().getZExtValue();
-    }
+    } else
+      Coeffs.push_back(0);
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
-      return std::nullopt;
+      return nullptr;
     auto Res = Offsets.emplace(Dist, Cnt);
     if (!Res.second)
-      return std::nullopt;
+      return nullptr;
     // Consecutive order if the inserted element is the last one.
     IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
     ++Cnt;
   }
   if (Offsets.size() != SCEVs.size())
-    return std::nullopt;
+    return nullptr;
   SortedIndices.clear();
-  if (!IsConsecutive) {
-    // Fill SortedIndices array only if it is non-consecutive.
-    SortedIndices.resize(PointerOps.size());
-    Cnt = 0;
-    for (const std::pair<int64_t, int> &Pair : Offsets) {
-      SortedIndices[Cnt] = Pair.second;
-      ++Cnt;
+  SortedIndices.resize(PointerOps.size());
+  Cnt = 0;
+  for (const std::pair<int64_t, int> &Pair : Offsets) {
+    SortedIndices[Cnt] = Pair.second;
+    ++Cnt;
+  }
+  return Stride;
+}
+
+// Suppose we are given pointers of the form: %b + x * %s + y * %c
+// where %c is constant. Check if the pointers can be rearranged as follows:
+//  %b + 0 * %s + 0
+//  %b + 0 * %s + 1
+//  %b + 0 * %s + 2
+//  ...
+//  %b + 0 * %s + (w - 1)
+//
+//  %b + 1 * %s + 0
+//  %b + 1 * %s + 1
+//  %b + 1 * %s + 2
+//  ...
+//  %b + 1 * %s + (w - 1)
+//  ...
+//
+//  If the pointers can be rearranged in the above pattern, it means that the
+//  memory can be accessed with strided loads of width `w` and stride `%s`.
+bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
+                                       Type *ElemTy, Align CommonAlignment,
+                                       SmallVectorImpl<unsigned> &SortedIndices,
+                                       StridedPtrInfo *SPtrInfo) const {
+  // Group the pointers by constant offset.
+  DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+      OffsetToPointerOpIdxMap;
+  for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+    const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+    if (!PtrSCEV)
+      return false;
+
+    const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    int64_t Offset = 0;
+    if (Add) {
+      for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
+        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
+        if (!SC)
+          continue;
+        Offset = (int64_t)(SC->getAPInt().getLimitedValue());
+        break;
+      }
     }
+    OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+    OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
   }
-  if (!Inst)
-    return nullptr;
-  SCEVExpander Expander(SE, DL, "strided-load-vec");
-  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+  int NumOffsets = OffsetToPointerOpIdxMap.size();
+
+  const unsigned Sz = PointerOps.size();
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  if (NumOffsets > 1) {
+    if (Sz % NumOffsets != 0)
+      return false;
+    VecSz = Sz / NumOffsets;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   NumOffsets);
+  }
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+  if (!TTI->isTypeLegal(StridedLoadTy) ||
+      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+    return false;
+
+  SmallVector<int> SortedOffsetsV;
+  for (auto [K, V] : OffsetToPointerOpIdxMap) {
+    SortedOffsetsV.push_back(K);
+  }
+  llvm::sort(SortedOffsetsV);
+  if (NumOffsets > 1) {
+    int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if ((CommonDiff) != 1)
+      return false;
+    for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
+      if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+        return false;
+    }
+  }
+
+  int64_t LowestOffset = SortedOffsetsV[0];
+  SmallVector<Value *> &PointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].first;
+  SmallVector<unsigned> &IndicesInAllPointerOps0 =
+      OffsetToPointerOpIdxMap[LowestOffset].second;
+
+  SmallVector<int64_t> Coeffs0;
+  SmallVector<unsigned> SortedIndicesForOffset0;
+  const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE,
+                                          SortedIndicesForOffset0, Coeffs0);
+  if (!Stride0)
+    return false;
+  unsigned NumCoeffs0 = Coeffs0.size();
+  if (NumCoeffs0 * NumOffsets != Sz)
+    return false;
+  llvm::sort(Coeffs0);
+
+  SmallVector<unsigned> SortedIndicesDraft;
+  SortedIndicesDraft.resize(Sz);
+  auto updateSortedIndices =
+      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+          SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+          int64_t OffsetNum) {
+        unsigned Num = 0;
+        for (unsigned Idx : SortedIndicesForOffset) {
+          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+              IndicesInAllPointerOps[Idx];
+          ++Num;
+        }
+      };
+
+  updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+  SmallVector<int64_t> Coeffs;
+  SmallVector<unsigned> SortedIndicesForOffset;
+  for (int i = 1; i < NumOffsets; ++i) {
+    Coeffs.clear();
+    SortedIndicesForOffset.clear();
+
+    int64_t Offset = SortedOffsetsV[i];
+    SmallVector<Value *> &PointerOpsForOffset =
+        OffsetToPointerOpIdxMap[Offset].first;
+    SmallVector<unsigned> &IndicesInAllPointerOps =
+        OffsetToPointerOpIdxMap[Offset].second;
+    const SCEV *StrideWithinGroup = calculateRtStride(
+        PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+      return false;
+    }
+    if (Coeffs.size() != NumCoeffs0)
+      return false;
+    llvm::sort(Coeffs);
+    for (unsigned i = 0; i < NumCoeffs0; ++i) {
+      if (Coeffs[i] != Coeffs0[i])
+        return false;
+    }
+
+    updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+  }
+
+  SortedIndices.clear();
+  SortedIndices = SortedIndicesDraft;
+  if (SPtrInfo) {
+    SPtrInfo->StrideSCEV = Stride0;
+    SPtrInfo->Ty = StridedLoadTy;
+  }
+  return true;
 }
 
 static std::pair<InstructionCost, InstructionCost>
@@ -6771,77 +6934,133 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                               CompressMask, LoadVecTy);
 }
 
-/// Checks if strided loads can be generated out of \p VL loads with pointers \p
-/// PointerOps:
-/// 1. Target with strided load support is detected.
-/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
-/// potential stride <= MaxProfitableLoadStride and the potential stride is
-/// power-of-2 (to avoid perf regressions for the very small number of loads)
-/// and max distance > number of loads, or potential stride is -1.
-/// 3. The loads are ordered, or number of unordered loads <=
-/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
-/// to avoid extra costs for very expensive shuffles).
-/// 4. Any pointer operand is an instruction with the users outside of the
-/// current graph (for masked gathers extra extractelement instructions
-/// might be required).
-static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
-                          ArrayRef<unsigned> Order,
-                          const TargetTransformInfo &TTI, const DataLayout &DL,
-                          ScalarEvolution &SE,
-                          const bool IsAnyPointerUsedOutGraph,
-                          const int64_t Diff) {
-  const size_t Sz = VL.size();
-  const uint64_t AbsoluteDiff = std::abs(Diff);
-  Type *ScalarTy = VL.front()->getType();
-  auto *VecTy = getWidenedType(ScalarTy, Sz);
-  if (IsAnyPointerUsedOutGraph ||
-      (AbsoluteDiff > Sz &&
-       (Sz > MinProfitableStridedLoads ||
-        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
-      Diff == -(static_cast<int64_t>(Sz) - 1)) {
-    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
-    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
+// Same as analyzeRtStrideCandidate, but for constant strides.
+bool BoUpSLP::analyzeConstantStrideCandidate(
+    ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+    int64_t Diff, Value *Ptr0, Value *PtrN) const {
+  const unsigned Sz = PointerOps.size();
+  SmallVector<int64_t> SortedOffsetsFromBase;
+  SortedOffsetsFromBase.resize(Sz);
+  for (unsigned i = 0; i < Sz; ++i) {
+    Value *Ptr =
+        SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
+    SortedOffsetsFromBase[i] =
+        *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+  }
+
+  // Find where the first group ends.
+  assert(SortedOffsetsFromBase.size() > 1);
+  int64_t StrideWithinGroup =
+      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+  unsigned GroupSize = 1;
+  for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+    if (SortedOffsetsFromBase[GroupSize] -
+            SortedOffsetsFromBase[GroupSize - 1] !=
+        StrideWithinGroup)
+      break;
+  }
+  unsigned VecSz = Sz;
+  Type *ScalarTy = ElemTy;
+  int64_t StrideIntVal = StrideWithinGroup;
+  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+  if (Sz != GroupSize) {
+    if (Sz % GroupSize != 0)
       return false;
-    Align Alignment =
-        cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
-            ->getAlign();
-    if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
+    VecSz = Sz / GroupSize;
+
+    if (StrideWithinGroup != 1)
       return false;
-    Value *Ptr0;
-    Value *PtrN;
-    if (Order.empty()) {
-      Ptr0 = PointerOps.front();
-      PtrN = PointerOps.back();
-    } else {
-      Ptr0 = PointerOps[Order.front()];
-      PtrN = PointerOps[Order.back()];
-    }
-    // Iterate through all pointers and check if all distances are
-    // unique multiple of Dist.
-    SmallSet<int64_t, 4> Dists;
-    for (Value *Ptr : PointerOps) {
-      int64_t Dist = 0;
-      if (Ptr == PtrN)
-        Dist = Diff;
-      else if (Ptr != Ptr0)
-        Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
-      // If the strides are not the same or repeated, we can't
-      // vectorize.
-      if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+    unsigned VecSz = Sz / GroupSize;
+    ScalarTy = Type::getIntNTy(SE->getContext(),
+                               DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+                                   GroupSize);
+    StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+    if (!TTI->isTypeLegal(StridedLoadTy) ||
+        !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+      return false;
+
+    unsigned PrevGroupStartIdx = 0;
+    unsigned CurrentGroupStartIdx = GroupSize;
+    int64_t StrideBetweenGroups =
+        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+    StrideIntVal = StrideBetweenGroups;
+    while (CurrentGroupStartIdx != Sz) {
+      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+              SortedOffsetsFromBase[PrevGroupStartIdx] !=
+          StrideBetweenGroups)
         break;
+      PrevGroupStartIdx = CurrentGroupStartIdx;
+      CurrentGroupStartIdx += GroupSize;
+    }
+    if (CurrentGroupStartIdx != Sz)
+      return false;
+
+    auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+                          int64_t StrideWithinGroup) -> bool {
+      unsigned GroupEndIdx = StartIdx + 1;
+      for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+        if (SortedOffsetsFromBase[GroupEndIdx] -
+                SortedOffsetsFromBase[GroupEndIdx - 1] !=
+            StrideWithinGroup)
+          break;
+      }
+      return (GroupEndIdx - StartIdx == GroupSize0);
+    };
+    for (unsigned i = 0; i < Sz; i += GroupSize) {
+      if (!checkGroup(i, GroupSize, StrideWithinGroup))
+        return false;
     }
-    if (Dists.size() == Sz)
-      return true;
+  }
+
+  // Try to generate strided load node if:
+  // 1. Target with strided load support is detected.
+  // 2. The number of loads is greater than MinProfitableStridedLoads,
+  // or the potential stride <= MaxProfitableLoadStride and the
+  // potential stride is power-of-2 (to avoid perf regressions for the very
+  // small number of loads) and max distance > number of loads, or potential
+  // stride is -1.
+  // 3. The loads are ordered, or number of unordered loads <=
+  // MaxProfitableUnorderedLoads, or loads are in reversed order.
+  // (this check is to avoid extra costs for very expensive shuffles).
+  // 4. Any pointer operand is an instruction with the users outside of the
+  // current graph (for masked gathers extra extractelement instructions
+  // might be required).
+
+  if (!TTI->isTypeLegal(StridedLoadTy) ||
+      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+    return false;
+
+  // Simple check if not a strided access - clear order.
+  bool IsPossibleStrided = Diff % (VecSz - 1) == 0;
+  auto IsAnyPointerUsedOutGraph =
+      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
+        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
+                 return !isVectorized(U) && !MustGather.contains(U);
+               });
+      });
+  const unsigned AbsoluteDiff = std::abs(Diff);
+  if (IsAnyPointerUsedOutGraph ||
+      ((VecSz > MinProfitableStridedLoads ||
+        (AbsoluteDiff <= MaxProfitableLoadStride * VecSz &&
+         has_single_bit(AbsoluteDiff))) &&
+       AbsoluteDiff > VecSz) ||
+      Diff == -(static_cast<int>(VecSz) - 1)) {
+    if (SPtrInfo) {
+      Type *StrideTy = DL->getIndexType(Ptr0->getType());
+      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+      SPtrInfo->Ty = StridedLoadTy;
+    }
+    return true;
   }
   return false;
 }
 
-BoUpSLP::LoadsState
-BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
-                           SmallVectorImpl<unsigned> &Order,
-                           SmallVectorImpl<Value *> &PointerOps,
-                           unsigned *BestVF, bool TryRecursiveCheck) const {
+BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
+    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo,
+    unsigned *BestVF, bool TryRecursiveCheck) const {
   // Check that a vectorized load would load the same memory as a scalar
   // load. For example, we don't want to vectorize loads that are smaller
   // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
@@ -6878,11 +7097,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   auto *VecTy = getWidenedType(ScalarTy, Sz);
   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
   if (!IsSorted) {
-    if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
-      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
-          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
-        return LoadsState::StridedVectorize;
-    }
+    if (Sz > MinProfitableStridedLoads &&
+        analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
+                                 SPtrInfo))
+      return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
         TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
@@ -6915,17 +7133,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                              }))
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
-    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
-    // Try to generate strided load node.
-    auto IsAnyPointerUsedOutGraph =
-        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
-          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
-                   return !isVectorized(U) && !MustGather.contains(U);
-                 });
-        });
-    if (IsPossibleStrided &&
-        isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
-                      IsAnyPointerUsedOutGraph, *Diff))
+    if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
+                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7009,9 +7218,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
         SmallVector<unsigned> Order;
         SmallVector<Value *> PointerOps;
-        LoadsState LS =
-            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
-                              /*TryRecursiveCheck=*/false);
+        LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
+                                          PointerOps, SPtrInfo, BestVF,
+                                          /*TryRecursiveCheck=*/false);
         // Check that the sorted loads are consecutive.
         if (LS == LoadsState::Gather) {
           if (BestVF) {
@@ -9188,8 +9397,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
           // Try to build vector load.
           ArrayRef<Value *> Values(
               reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
-          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
-                                            PointerOps, &BestVF);
+          LoadsState LS =
+              canVectorizeLoads(Values, Slice.front(), CurrentOrder, PointerOps,
+                                /*SPtrInfo =*/nullptr, &BestVF);
           if (LS != LoadsState::Gather ||
               (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
             if (LS == LoadsState::ScatterVectorize) {
@@ -9794,7 +10004,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps) {
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
   assert(S.getMainOp() &&
          "Expected instructions with same/alternate opcodes only.");
 
@@ -9896,7 +10106,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         });
       });
     };
-    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
+    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
@@ -11369,8 +11579,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
       UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
   OrdersType CurrentOrder;
   SmallVector<Value *> PointerOps;
+  StridedPtrInfo SPtrInfo;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
   if (State == TreeEntry::NeedToGather) {
     newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
     return;
@@ -11530,6 +11741,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
         // Vectorizing non-consecutive loads with `llvm.masked.gather`.
         TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                           UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+        TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
         LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                    TE->dump());
         break;
@@ -13025,10 +13237,20 @@ void BoUpSLP::transformNodes() {
         InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
             Instruction::Load, VecTy, BaseLI->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
-        if (StridedCost < OriginalVecCost)
+        if (StridedCost < OriginalVecCost) {
           // Strided load is more profitable than consecutive load + reverse -
           // transform the node to strided load.
+          // TODO: StrideTy =
+          // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+          StridedPtrInfo SPtrInfo;
+          Type *StrideTy = Type::getIntNTy(
+              SE->getContext(),
+              DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
+          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+          SPtrInfo.Ty = VecTy;
+          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
           E.State = TreeEntry::StridedVectorize;
+        }
       }
       break;
     }
@@ -19486,6 +19708,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       LoadInst *LI = cast<LoadInst>(VL0);
       Instruction *NewLI;
+      FixedVectorType *StridedLoadTy = nullptr;
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
@@ -19523,40 +19746,37 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
         Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
         PO = IsReverseOrder ? PtrN : Ptr0;
-        std::optional<int64_t> Diff = getPointersDiff(
-            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
         Type *StrideTy = DL->getIndexType(PO->getType());
         Value *StrideVal;
-        if (Diff) {
-          int64_t Stride =
-              *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
-          StrideVal =
-              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
-                                             DL->getTypeAllocSize(ScalarTy));
-        } else {
-          SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
-          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
-            return cast<LoadInst>(V)->getPointerOperand();
-          });
-          OrdersType Order;
-          std::optional<Value *> Stride =
-              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
-                                &*Builder.GetInsertPoint());
-          Value *NewStride =
-              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
-          StrideVal = Builder.CreateMul(
-              NewStride,
-              ConstantInt::get(
-                  StrideTy,
-                  (IsReverseOrder ? -1 : 1) *
-                      static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
-        }
+
+        StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+        StridedLoadTy = SPtrInfo.Ty;
+        assert(StridedLoadTy);
+        unsigned StridedLoadEC =
+            StridedLoadTy->getElementCount().getKnownMinValue();
+
+        Value *Stride = SPtrInfo.StrideVal;
+        if (!Stride) {
+          const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+          assert(StrideSCEV);
+          SCEVExpander Expander(*SE, *DL, "strided-load-vec");
+          Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+                                          &*Builder.GetInsertPoint());
+        }
+        Value *NewStride =
+            Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+        StrideVal = Builder.CreateMul(
+            NewStride, ConstantInt::get(
+                           StrideTy, (IsReverseOrder ? -1 : 1) *
+                                         static_cast<int>(
+                                             DL->getTypeAllocSize(ScalarTy))));
         Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
         auto *Inst = Builder.CreateIntrinsic(
             Intrinsic::experimental_vp_strided_load,
-            {VecTy, PO->getType(), StrideTy},
-            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
-             Builder.getInt32(E->Scalars.size())});
+            {StridedLoadTy, PO->getType(), StrideTy},
+            {PO, StrideVal,
+             Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+             Builder.getInt32(StridedLoadEC)});
         Inst->addParamAttr(
             /*ArgNo=*/0,
             Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
@@ -19593,6 +19813,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
                      ? NewLI
                      : ::propagateMetadata(NewLI, E->Scalars);
 
+      if (StridedLoadTy)
+        V = Builder.CreateBitOrPointerCast(V, VecTy);
       V = FinalShuffle(V, E);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
new file mode 100644
index 0000000000000..38cf1214081fa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
@@ -0,0 +1,483 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \
+; RUN: -riscv-v-slp-prefer-alt-opc-vectorization=true \
+; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s
+; Function Attrs: nounwind uwtable vscale_range(8,1024)
+define i32 @x264_pixel_satd_8x4(ptr %pix1, i32  %i_pix1, ptr  %pix2, i32  %i_pix2) {
+; CHECK-LABEL: define i32 @x264_pixel_satd_8x4(
+; CHECK-SAME: ptr [[PIX1:%.*]], i32 [[I_PIX1:%.*]], ptr [[PIX2:%.*]], i32 [[I_PIX2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64
+; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX1]], i64 [[TMP0]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX2]], i64 [[TMP4]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
+; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 [[TMP9]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 [[TMP13]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP47:%.*]] = bitcast <4 x i32> [[TMP14]] to <16 x i8>
+; CHECK-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
+; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16)
+; CHECK-NEXT:    [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]]
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT:    [[TMP53:%.*]] = add nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 20, i32 21, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 29, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT:    [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP68:%.*]] = lshr <16 x i32> [[TMP67]], splat (i32 15)
+; CHECK-NEXT:    [[TMP69:%.*]] = and <16 x i32> [[TMP68]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP70:%.*]] = mul nuw <16 x i32> [[TMP69]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP71:%.*]] = add <16 x i32> [[TMP70]], [[TMP67]]
+; CHECK-NEXT:    [[TMP72:%.*]] = xor <16 x i32> [[TMP71]], [[TMP70]]
+; CHECK-NEXT:    [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP72]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP73]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP73]], 16
+; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
+; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
+; CHECK-NEXT:    ret i32 [[SHR120]]
+;
+entry:
+  %idx.ext = sext i32 %i_pix1 to i64
+  %idx.ext63 = sext i32 %i_pix2 to i64
+  %0 = load i8, ptr %pix1, align 1
+  %conv = zext i8 %0 to i32
+  %1 = load i8, ptr %pix2, align 1
+  %conv2 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx3 = getelementptr inbounds nuw i8, ptr %pix1, i64 4
+  %2 = load i8, ptr %arrayidx3, align 1
+  %conv4 = zext i8 %2 to i32
+  %arrayidx5 = getelementptr inbounds nuw i8, ptr %pix2, i64 4
+  %3 = load i8, ptr %arrayidx5, align 1
+  %conv6 = zext i8 %3 to i32
+  %sub7 = sub nsw i32 %conv4, %conv6
+  %shl = shl nsw i32 %sub7, 16
+  %add = add nsw i32 %shl, %sub
+  %arrayidx8 = getelementptr inbounds nuw i8, ptr %pix1, i64 1
+  %4 = load i8, ptr %arrayidx8, align 1
+  %conv9 = zext i8 %4 to i32
+  %arrayidx10 = getelementptr inbounds nuw i8, ptr %pix2, i64 1
+  %5 = load i8, ptr %arrayidx10, align 1
+  %conv11 = zext i8 %5 to i32
+  %sub12 = sub nsw i32 %conv9, %conv11
+  %arrayidx13 = getelementptr inbounds nuw i8, ptr %pix1, i64 5
+  %6 = load i8, ptr %arrayidx13, align 1
+  %conv14 = zext i8 %6 to i32
+  %arrayidx15 = getelementptr inbounds nuw i8, ptr %pix2, i64 5
+  %7 = load i8, ptr %arrayidx15, align 1
+  %conv16 = zext i8 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %shl18 = shl nsw i32 %sub17, 16
+  %add19 = add nsw i32 %shl18, %sub12
+  %arrayidx20 = getelementptr inbounds nuw i8, ptr %pix1, i64 2
+  %8 = load i8, ptr %arrayidx20, align 1
+  %conv21 = zext i8 %8 to i32
+  %arrayidx22 = getelementptr inbounds nuw i8, ptr %pix2, i64 2
+  %9 = load i8, ptr %arrayidx22, align 1
+  %conv23 = zext i8 %9 to i32
+  %sub24 = sub nsw i32 %conv21, %conv23
+  %arrayidx25 = getelementptr inbounds nuw i8, ptr %pix1, i64 6
+  %10 = load i8, ptr %arrayidx25, align 1
+  %conv26 = zext i8 %10 to i32
+  %arrayidx27 = getelementptr inbounds nuw i8, ptr %pix2, i64 6
+  %11 = load i8, ptr %arrayidx27, align 1
+  %conv28 = zext i8 %11 to i32
+  %sub29 = sub nsw i32 %conv26, %conv28
+  %shl30 = shl nsw i32 %sub29, 16
+  %add31 = add nsw i32 %shl30, %sub24
+  %arrayidx32 = getelementptr inbounds nuw i8, ptr %pix1, i64 3
+  %12 = load i8, ptr %arrayidx32, align 1
+  %conv33 = zext i8 %12 to i32
+  %arrayidx34 = getelementptr inbounds nuw i8, ptr %pix2, i64 3
+  %13 = load i8, ptr %arrayidx34, align 1
+  %conv35 = zext i8 %13 to i32
+  %sub36 = sub nsw i32 %conv33, %conv35
+  %arrayidx37 = getelementptr inbounds nuw i8, ptr %pix1, i64 7
+  %14 = load i8, ptr %arrayidx37, align 1
+  %conv38 = zext i8 %14 to i32
+  %arrayidx39 = getelementptr inbounds nuw i8, ptr %pix2, i64 7
+  %15 = load i8, ptr %arrayidx39, align 1
+  %conv40 = zext i8 %15 to i32
+  %sub41 = sub nsw i32 %conv38, %conv40
+  %shl42 = shl nsw i32 %sub41, 16
+  %add43 = add nsw i32 %shl42, %sub36
+  %add44 = add nsw i32 %add19, %add
+  %sub45 = sub nsw i32 %add, %add19
+  %add46 = add nsw i32 %add43, %add31
+  %sub47 = sub nsw i32 %add31, %add43
+  %add48 = add nsw i32 %add46, %add44
+  %sub51 = sub nsw i32 %add44, %add46
+  %add55 = add nsw i32 %sub47, %sub45
+  %sub59 = sub nsw i32 %sub45, %sub47
+  %add.ptr = getelementptr inbounds i8, ptr %pix1, i64 %idx.ext
+  %add.ptr64 = getelementptr inbounds i8, ptr %pix2, i64 %idx.ext63
+  %16 = load i8, ptr %add.ptr, align 1
+  %conv.1 = zext i8 %16 to i32
+  %17 = load i8, ptr %add.ptr64, align 1
+  %conv2.1 = zext i8 %17 to i32
+  %sub.1 = sub nsw i32 %conv.1, %conv2.1
+  %arrayidx3.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 4
+  %18 = load i8, ptr %arrayidx3.1, align 1
+  %conv4.1 = zext i8 %18 to i32
+  %arrayidx5.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 4
+  %19 = load i8, ptr %arrayidx5.1, align 1
+  %conv6.1 = zext i8 %19 to i32
+  %sub7.1 = sub nsw i32 %conv4.1, %conv6.1
+  %shl.1 = shl nsw i32 %sub7.1, 16
+  %add.1 = add nsw i32 %shl.1, %sub.1
+  %arrayidx8.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 1
+  %20 = load i8, ptr %arrayidx8.1, align 1
+  %conv9.1 = zext i8 %20 to i32
+  %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 1
+  %21 = load i8, ptr %arrayidx10.1, align 1
+  %conv11.1 = zext i8 %21 to i32
+  %sub12.1 = sub nsw i32 %conv9.1, %conv11.1
+  %arrayidx13.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 5
+  %22 = load i8, ptr %arrayidx13.1, align 1
+  %conv14.1 = zext i8 %22 to i32
+  %arrayidx15.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 5
+  %23 = load i8, ptr %arrayidx15.1, align 1
+  %conv16.1 = zext i8 %23 to i32
+  %sub17.1 = sub nsw i32 %conv14.1, %conv16.1
+  %shl18.1 = shl nsw i32 %sub17.1, 16
+  %add19.1 = add nsw i32 %shl18.1, %sub12.1
+  %arrayidx20.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 2
+  %24 = load i8, ptr %arrayidx20.1, align 1
+  %conv21.1 = zext i8 %24 to i32
+  %arrayidx22.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 2
+  %25 = load i8, ptr %arrayidx22.1, align 1
+  %conv23.1 = zext i8 %25 to i32
+  %sub24.1 = sub nsw i32 %conv21.1, %conv23.1
+  %arrayidx25.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 6
+  %26 = load i8, ptr %arrayidx25.1, align 1
+  %conv26.1 = zext i8 %26 to i32
+  %arrayidx27.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 6
+  %27 = load i8, ptr %arrayidx27.1, align 1
+  %conv28.1 = zext i8 %27 to i32
+  %sub29.1 = sub nsw i32 %conv26.1, %conv28.1
+  %shl30.1 = shl nsw i32 %sub29.1, 16
+  %add31.1 = add nsw i32 %shl30.1, %sub24.1
+  %arrayidx32.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 3
+  %28 = load i8, ptr %arrayidx32.1, align 1
+  %conv33.1 = zext i8 %28 to i32
+  %arrayidx34.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 3
+  %29 = load i8, ptr %arrayidx34.1, align 1
+  %conv35.1 = zext i8 %29 to i32
+  %sub36.1 = sub nsw i32 %conv33.1, %conv35.1
+  %arrayidx37.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 7
+  %30 = load i8, ptr %arrayidx37.1, align 1
+  %conv38.1 = zext i8 %30 to i32
+  %arrayidx39.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 7
+  %31 = load i8, ptr %arrayidx39.1, align 1
+  %conv40.1 = zext i8 %31 to i32
+  %sub41.1 = sub nsw i32 %conv38.1, %conv40.1
+  %shl42.1 = shl nsw i32 %sub41.1, 16
+  %add43.1 = add nsw i32 %shl42.1, %sub36.1
+  %add44.1 = add nsw i32 %add19.1, %add.1
+  %sub45.1 = sub nsw i32 %add.1, %add19.1
+  %add46.1 = add nsw i32 %add43.1, %add31.1
+  %sub47.1 = sub nsw i32 %add31.1, %add43.1
+  %add48.1 = add nsw i32 %add46.1, %add44.1
+  %sub51.1 = sub nsw i32 %add44.1, %add46.1
+  %add55.1 = add nsw i32 %sub47.1, %sub45.1
+  %sub59.1 = sub nsw i32 %sub45.1, %sub47.1
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
+  %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63
+  %32 = load i8, ptr %add.ptr.1, align 1
+  %conv.2 = zext i8 %32 to i32
+  %33 = load i8, ptr %add.ptr64.1, align 1
+  %conv2.2 = zext i8 %33 to i32
+  %sub.2 = sub nsw i32 %conv.2, %conv2.2
+  %arrayidx3.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 4
+  %34 = load i8, ptr %arrayidx3.2, align 1
+  %conv4.2 = zext i8 %34 to i32
+  %arrayidx5.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 4
+  %35 = load i8, ptr %arrayidx5.2, align 1
+  %conv6.2 = zext i8 %35 to i32
+  %sub7.2 = sub nsw i32 %conv4.2, %conv6.2
+  %shl.2 = shl nsw i32 %sub7.2, 16
+  %add.2 = add nsw i32 %shl.2, %sub.2
+  %arrayidx8.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 1
+  %36 = load i8, ptr %arrayidx8.2, align 1
+  %conv9.2 = zext i8 %36 to i32
+  %arrayidx10.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 1
+  %37 = load i8, ptr %arrayidx10.2, align 1
+  %conv11.2 = zext i8 %37 to i32
+  %sub12.2 = sub nsw i32 %conv9.2, %conv11.2
+  %arrayidx13.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 5
+  %38 = load i8, ptr %arrayidx13.2, align 1
+  %conv14.2 = zext i8 %38 to i32
+  %arrayidx15.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 5
+  %39 = load i8, ptr %arrayidx15.2, align 1
+  %conv16.2 = zext i8 %39 to i32
+  %sub17.2 = sub nsw i32 %conv14.2, %conv16.2
+  %shl18.2 = shl nsw i32 %sub17.2, 16
+  %add19.2 = add nsw i32 %shl18.2, %sub12.2
+  %arrayidx20.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 2
+  %40 = load i8, ptr %arrayidx20.2, align 1
+  %conv21.2 = zext i8 %40 to i32
+  %arrayidx22.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 2
+  %41 = load i8, ptr %arrayidx22.2, align 1
+  %conv23.2 = zext i8 %41 to i32
+  %sub24.2 = sub nsw i32 %conv21.2, %conv23.2
+  %arrayidx25.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 6
+  %42 = load i8, ptr %arrayidx25.2, align 1
+  %conv26.2 = zext i8 %42 to i32
+  %arrayidx27.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 6
+  %43 = load i8, ptr %arrayidx27.2, align 1
+  %conv28.2 = zext i8 %43 to i32
+  %sub29.2 = sub nsw i32 %conv26.2, %conv28.2
+  %shl30.2 = shl nsw i32 %sub29.2, 16
+  %add31.2 = add nsw i32 %shl30.2, %sub24.2
+  %arrayidx32.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 3
+  %44 = load i8, ptr %arrayidx32.2, align 1
+  %conv33.2 = zext i8 %44 to i32
+  %arrayidx34.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 3
+  %45 = load i8, ptr %arrayidx34.2, align 1
+  %conv35.2 = zext i8 %45 to i32
+  %sub36.2 = sub nsw i32 %conv33.2, %conv35.2
+  %arrayidx37.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 7
+  %46 = load i8, ptr %arrayidx37.2, align 1
+  %conv38.2 = zext i8 %46 to i32
+  %arrayidx39.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 7
+  %47 = load i8, ptr %arrayidx39.2, align 1
+  %conv40.2 = zext i8 %47 to i32
+  %sub41.2 = sub nsw i32 %conv38.2, %conv40.2
+  %shl42.2 = shl nsw i32 %sub41.2, 16
+  %add43.2 = add nsw i32 %shl42.2, %sub36.2
+  %add44.2 = add nsw i32 %add19.2, %add.2
+  %sub45.2 = sub nsw i32 %add.2, %add19.2
+  %add46.2 = add nsw i32 %add43.2, %add31.2
+  %sub47.2 = sub nsw i32 %add31.2, %add43.2
+  %add48.2 = add nsw i32 %add46.2, %add44.2
+  %sub51.2 = sub nsw i32 %add44.2, %add46.2
+  %add55.2 = add nsw i32 %sub47.2, %sub45.2
+  %sub59.2 = sub nsw i32 %sub45.2, %sub47.2
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
+  %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63
+  %48 = load i8, ptr %add.ptr.2, align 1
+  %conv.3 = zext i8 %48 to i32
+  %49 = load i8, ptr %add.ptr64.2, align 1
+  %conv2.3 = zext i8 %49 to i32
+  %sub.3 = sub nsw i32 %conv.3, %conv2.3
+  %arrayidx3.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 4
+  %50 = load i8, ptr %arrayidx3.3, align 1
+  %conv4.3 = zext i8 %50 to i32
+  %arrayidx5.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 4
+  %51 = load i8, ptr %arrayidx5.3, align 1
+  %conv6.3 = zext i8 %51 to i32
+  %sub7.3 = sub nsw i32 %conv4.3, %conv6.3
+  %shl.3 = shl nsw i32 %sub7.3, 16
+  %add.3 = add nsw i32 %shl.3, %sub.3
+  %arrayidx8.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 1
+  %52 = load i8, ptr %arrayidx8.3, align 1
+  %conv9.3 = zext i8 %52 to i32
+  %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 1
+  %53 = load i8, ptr %arrayidx10.3, align 1
+  %conv11.3 = zext i8 %53 to i32
+  %sub12.3 = sub nsw i32 %conv9.3, %conv11.3
+  %arrayidx13.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 5
+  %54 = load i8, ptr %arrayidx13.3, align 1
+  %conv14.3 = zext i8 %54 to i32
+  %arrayidx15.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 5
+  %55 = load i8, ptr %arrayidx15.3, align 1
+  %conv16.3 = zext i8 %55 to i32
+  %sub17.3 = sub nsw i32 %conv14.3, %conv16.3
+  %shl18.3 = shl nsw i32 %sub17.3, 16
+  %add19.3 = add nsw i32 %shl18.3, %sub12.3
+  %arrayidx20.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 2
+  %56 = load i8, ptr %arrayidx20.3, align 1
+  %conv21.3 = zext i8 %56 to i32
+  %arrayidx22.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 2
+  %57 = load i8, ptr %arrayidx22.3, align 1
+  %conv23.3 = zext i8 %57 to i32
+  %sub24.3 = sub nsw i32 %conv21.3, %conv23.3
+  %arrayidx25.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 6
+  %58 = load i8, ptr %arrayidx25.3, align 1
+  %conv26.3 = zext i8 %58 to i32
+  %arrayidx27.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 6
+  %59 = load i8, ptr %arrayidx27.3, align 1
+  %conv28.3 = zext i8 %59 to i32
+  %sub29.3 = sub nsw i32 %conv26.3, %conv28.3
+  %shl30.3 = shl nsw i32 %sub29.3, 16
+  %add31.3 = add nsw i32 %shl30.3, %sub24.3
+  %arrayidx32.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 3
+  %60 = load i8, ptr %arrayidx32.3, align 1
+  %conv33.3 = zext i8 %60 to i32
+  %arrayidx34.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 3
+  %61 = load i8, ptr %arrayidx34.3, align 1
+  %conv35.3 = zext i8 %61 to i32
+  %sub36.3 = sub nsw i32 %conv33.3, %conv35.3
+  %arrayidx37.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 7
+  %62 = load i8, ptr %arrayidx37.3, align 1
+  %conv38.3 = zext i8 %62 to i32
+  %arrayidx39.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 7
+  %63 = load i8, ptr %arrayidx39.3, align 1
+  %conv40.3 = zext i8 %63 to i32
+  %sub41.3 = sub nsw i32 %conv38.3, %conv40.3
+  %shl42.3 = shl nsw i32 %sub41.3, 16
+  %add43.3 = add nsw i32 %shl42.3, %sub36.3
+  %add44.3 = add nsw i32 %add19.3, %add.3
+  %sub45.3 = sub nsw i32 %add.3, %add19.3
+  %add46.3 = add nsw i32 %add43.3, %add31.3
+  %sub47.3 = sub nsw i32 %add31.3, %add43.3
+  %add48.3 = add nsw i32 %add46.3, %add44.3
+  %sub51.3 = sub nsw i32 %add44.3, %add46.3
+  %add55.3 = add nsw i32 %sub47.3, %sub45.3
+  %sub59.3 = sub nsw i32 %sub45.3, %sub47.3
+  %add78 = add nsw i32 %add48.1, %add48
+  %sub86 = sub nsw i32 %add48, %add48.1
+  %add94 = add nsw i32 %add48.3, %add48.2
+  %sub102 = sub nsw i32 %add48.2, %add48.3
+  %add103 = add nsw i32 %add94, %add78
+  %sub104 = sub nsw i32 %add78, %add94
+  %add105 = add nsw i32 %sub102, %sub86
+  %sub106 = sub nsw i32 %sub86, %sub102
+  %shr.i = lshr i32 %add103, 15
+  %and.i = and i32 %shr.i, 65537
+  %mul.i = mul nuw i32 %and.i, 65535
+  %add.i = add i32 %mul.i, %add103
+  %xor.i = xor i32 %add.i, %mul.i
+  %shr.i169 = lshr i32 %add105, 15
+  %and.i170 = and i32 %shr.i169, 65537
+  %mul.i171 = mul nuw i32 %and.i170, 65535
+  %add.i172 = add i32 %mul.i171, %add105
+  %xor.i173 = xor i32 %add.i172, %mul.i171
+  %shr.i174 = lshr i32 %sub104, 15
+  %and.i175 = and i32 %shr.i174, 65537
+  %mul.i176 = mul nuw i32 %and.i175, 65535
+  %add.i177 = add i32 %mul.i176, %sub104
+  %xor.i178 = xor i32 %add.i177, %mul.i176
+  %shr.i179 = lshr i32 %sub106, 15
+  %and.i180 = and i32 %shr.i179, 65537
+  %mul.i181 = mul nuw i32 %and.i180, 65535
+  %add.i182 = add i32 %mul.i181, %sub106
+  %xor.i183 = xor i32 %add.i182, %mul.i181
+  %add110 = add i32 %xor.i173, %xor.i
+  %add112 = add i32 %add110, %xor.i178
+  %add113 = add i32 %add112, %xor.i183
+  %add78.1 = add nsw i32 %add55.1, %add55
+  %sub86.1 = sub nsw i32 %add55, %add55.1
+  %add94.1 = add nsw i32 %add55.3, %add55.2
+  %sub102.1 = sub nsw i32 %add55.2, %add55.3
+  %add103.1 = add nsw i32 %add94.1, %add78.1
+  %sub104.1 = sub nsw i32 %add78.1, %add94.1
+  %add105.1 = add nsw i32 %sub102.1, %sub86.1
+  %sub106.1 = sub nsw i32 %sub86.1, %sub102.1
+  %shr.i.1 = lshr i32 %add103.1, 15
+  %and.i.1 = and i32 %shr.i.1, 65537
+  %mul.i.1 = mul nuw i32 %and.i.1, 65535
+  %add.i.1 = add i32 %mul.i.1, %add103.1
+  %xor.i.1 = xor i32 %add.i.1, %mul.i.1
+  %shr.i169.1 = lshr i32 %add105.1, 15
+  %and.i170.1 = and i32 %shr.i169.1, 65537
+  %mul.i171.1 = mul nuw i32 %and.i170.1, 65535
+  %add.i172.1 = add i32 %mul.i171.1, %add105.1
+  %xor.i173.1 = xor i32 %add.i172.1, %mul.i171.1
+  %shr.i174.1 = lshr i32 %sub104.1, 15
+  %and.i175.1 = and i32 %shr.i174.1, 65537
+  %mul.i176.1 = mul nuw i32 %and.i175.1, 65535
+  %add.i177.1 = add i32 %mul.i176.1, %sub104.1
+  %xor.i178.1 = xor i32 %add.i177.1, %mul.i176.1
+  %shr.i179.1 = lshr i32 %sub106.1, 15
+  %and.i180.1 = and i32 %shr.i179.1, 65537
+  %mul.i181.1 = mul nuw i32 %and.i180.1, 65535
+  %add.i182.1 = add i32 %mul.i181.1, %sub106.1
+  %xor.i183.1 = xor i32 %add.i182.1, %mul.i181.1
+  %add108.1 = add i32 %xor.i173.1, %add113
+  %add110.1 = add i32 %add108.1, %xor.i.1
+  %add112.1 = add i32 %add110.1, %xor.i178.1
+  %add113.1 = add i32 %add112.1, %xor.i183.1
+  %add78.2 = add nsw i32 %sub51.1, %sub51
+  %sub86.2 = sub nsw i32 %sub51, %sub51.1
+  %add94.2 = add nsw i32 %sub51.3, %sub51.2
+  %sub102.2 = sub nsw i32 %sub51.2, %sub51.3
+  %add103.2 = add nsw i32 %add94.2, %add78.2
+  %sub104.2 = sub nsw i32 %add78.2, %add94.2
+  %add105.2 = add nsw i32 %sub102.2, %sub86.2
+  %sub106.2 = sub nsw i32 %sub86.2, %sub102.2
+  %shr.i.2 = lshr i32 %add103.2, 15
+  %and.i.2 = and i32 %shr.i.2, 65537
+  %mul.i.2 = mul nuw i32 %and.i.2, 65535
+  %add.i.2 = add i32 %mul.i.2, %add103.2
+  %xor.i.2 = xor i32 %add.i.2, %mul.i.2
+  %shr.i169.2 = lshr i32 %add105.2, 15
+  %and.i170.2 = and i32 %shr.i169.2, 65537
+  %mul.i171.2 = mul nuw i32 %and.i170.2, 65535
+  %add.i172.2 = add i32 %mul.i171.2, %add105.2
+  %xor.i173.2 = xor i32 %add.i172.2, %mul.i171.2
+  %shr.i174.2 = lshr i32 %sub104.2, 15
+  %and.i175.2 = and i32 %shr.i174.2, 65537
+  %mul.i176.2 = mul nuw i32 %and.i175.2, 65535
+  %add.i177.2 = add i32 %mul.i176.2, %sub104.2
+  %xor.i178.2 = xor i32 %add.i177.2, %mul.i176.2
+  %shr.i179.2 = lshr i32 %sub106.2, 15
+  %and.i180.2 = and i32 %shr.i179.2, 65537
+  %mul.i181.2 = mul nuw i32 %and.i180.2, 65535
+  %add.i182.2 = add i32 %mul.i181.2, %sub106.2
+  %xor.i183.2 = xor i32 %add.i182.2, %mul.i181.2
+  %add108.2 = add i32 %xor.i173.2, %add113.1
+  %add110.2 = add i32 %add108.2, %xor.i.2
+  %add112.2 = add i32 %add110.2, %xor.i178.2
+  %add113.2 = add i32 %add112.2, %xor.i183.2
+  %add78.3 = add nsw i32 %sub59.1, %sub59
+  %sub86.3 = sub nsw i32 %sub59, %sub59.1
+  %add94.3 = add nsw i32 %sub59.3, %sub59.2
+  %sub102.3 = sub nsw i32 %sub59.2, %sub59.3
+  %add103.3 = add nsw i32 %add94.3, %add78.3
+  %sub104.3 = sub nsw i32 %add78.3, %add94.3
+  %add105.3 = add nsw i32 %sub102.3, %sub86.3
+  %sub106.3 = sub nsw i32 %sub86.3, %sub102.3
+  %shr.i.3 = lshr i32 %add103.3, 15
+  %and.i.3 = and i32 %shr.i.3, 65537
+  %mul.i.3 = mul nuw i32 %and.i.3, 65535
+  %add.i.3 = add i32 %mul.i.3, %add103.3
+  %xor.i.3 = xor i32 %add.i.3, %mul.i.3
+  %shr.i169.3 = lshr i32 %add105.3, 15
+  %and.i170.3 = and i32 %shr.i169.3, 65537
+  %mul.i171.3 = mul nuw i32 %and.i170.3, 65535
+  %add.i172.3 = add i32 %mul.i171.3, %add105.3
+  %xor.i173.3 = xor i32 %add.i172.3, %mul.i171.3
+  %shr.i174.3 = lshr i32 %sub104.3, 15
+  %and.i175.3 = and i32 %shr.i174.3, 65537
+  %mul.i176.3 = mul nuw i32 %and.i175.3, 65535
+  %add.i177.3 = add i32 %mul.i176.3, %sub104.3
+  %xor.i178.3 = xor i32 %add.i177.3, %mul.i176.3
+  %shr.i179.3 = lshr i32 %sub106.3, 15
+  %and.i180.3 = and i32 %shr.i179.3, 65537
+  %mul.i181.3 = mul nuw i32 %and.i180.3, 65535
+  %add.i182.3 = add i32 %mul.i181.3, %sub106.3
+  %xor.i183.3 = xor i32 %add.i182.3, %mul.i181.3
+  %add108.3 = add i32 %xor.i173.3, %add113.2
+  %add110.3 = add i32 %add108.3, %xor.i.3
+  %add112.3 = add i32 %add110.3, %xor.i178.3
+  %add113.3 = add i32 %add112.3, %xor.i183.3
+  %conv118 = and i32 %add113.3, 65535
+  %shr = lshr i32 %add113.3, 16
+  %add119 = add nuw nsw i32 %conv118, %shr
+  %shr120 = lshr i32 %add119, 1
+  ret i32 %shr120
+}

>From edbf0d0aca9e0c74aa8f2975f879eb9986ac8176 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:21:30 -0700
Subject: [PATCH 02/14] updated basic-strided-loads.ll

---
 .../RISCV/basic-strided-loads.ll              | 20 +++----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 645dbc49269f0..0135d3c01d9f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -630,25 +630,11 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-LABEL: define void @rt_stride_widen_no_reordering(
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = mul nsw i64 [[STRIDE]], 0
-; CHECK-NEXT:    [[OFFSET4:%.*]] = mul nsw i64 [[STRIDE]], 1
-; CHECK-NEXT:    [[OFFSET8:%.*]] = mul nsw i64 [[STRIDE]], 2
-; CHECK-NEXT:    [[OFFSET12:%.*]] = mul nsw i64 [[STRIDE]], 3
 ; CHECK-NEXT:    [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET4]]
-; CHECK-NEXT:    [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]]
-; CHECK-NEXT:    [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]]
 ; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
 ; CHECK-NEXT:    store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
 ; CHECK-NEXT:    ret void
 ;

>From 257f797e0d123795855ac7df79c27aa319756f0e Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:41:51 -0700
Subject: [PATCH 03/14] did a "todo".

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ab2b21919d819..0f2fa0e1bd58e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13240,12 +13240,10 @@ void BoUpSLP::transformNodes() {
         if (StridedCost < OriginalVecCost) {
           // Strided load is more profitable than consecutive load + reverse -
           // transform the node to strided load.
-          // TODO: StrideTy =
-          // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+                                                ->getPointerOperand()
+                                                ->getType());
           StridedPtrInfo SPtrInfo;
-          Type *StrideTy = Type::getIntNTy(
-              SE->getContext(),
-              DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
           SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
           SPtrInfo.Ty = VecTy;
           TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;

>From 76aca3983e1bc94027ac7ece2b1f1a0810499f2e Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 12:22:28 -0700
Subject: [PATCH 04/14] addressed review comments.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 112 +++++++++---------
 1 file changed, 53 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0f2fa0e1bd58e..594ded3365d89 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1931,7 +1931,7 @@ class BoUpSLP {
     // strided load.
     FixedVectorType *Ty = nullptr;
   };
-  DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+  SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
 
 public:
   /// Tracks the state we can represent the loads in the given sequence.
@@ -2233,12 +2233,12 @@ class BoUpSLP {
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
-                                StridedPtrInfo *SPtrInfo) const;
+                                StridedPtrInfo &SPtrInfo) const;
 
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
-                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
+                                      StridedPtrInfo &SPtrInfo, int64_t Diff,
                                       Value *Ptr0, Value *PtrN) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
@@ -4502,7 +4502,7 @@ class BoUpSLP {
   TreeEntry::EntryState getScalarsVectorizationState(
       const InstructionsState &S, ArrayRef<Value *> VL,
       bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
+      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
 
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6481,7 +6481,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
-                                       StridedPtrInfo *SPtrInfo) const {
+                                       StridedPtrInfo &SPtrInfo) const {
   // Group the pointers by constant offset.
   DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
@@ -6491,15 +6491,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
 
     const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+    if (!Add)
+      return false;
     int64_t Offset = 0;
-    if (Add) {
-      for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
-        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
-        if (!SC)
-          continue;
-        Offset = (int64_t)(SC->getAPInt().getLimitedValue());
-        break;
-      }
+    for (int I : seq<int>(Add->getNumOperands())) {
+      auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+      if (!SC)
+        continue;
+      Offset = SC->getAPInt().getSExtValue();
+      break;
     }
     OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
     OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
@@ -6522,17 +6522,17 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
     return false;
 
-  SmallVector<int> SortedOffsetsV;
+  SmallVector<int64_t> SortedOffsetsV;
   for (auto [K, V] : OffsetToPointerOpIdxMap) {
     SortedOffsetsV.push_back(K);
   }
-  llvm::sort(SortedOffsetsV);
+  sort(SortedOffsetsV);
   if (NumOffsets > 1) {
-    int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
-    if ((CommonDiff) != 1)
+    int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+    if (CommonDiff != 1)
       return false;
-    for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
-      if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+    for (int I : seq<int>(1, SortedOffsetsV.size() - 1)) {
+      if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff)
         return false;
     }
   }
@@ -6552,31 +6552,29 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
   unsigned NumCoeffs0 = Coeffs0.size();
   if (NumCoeffs0 * NumOffsets != Sz)
     return false;
-  llvm::sort(Coeffs0);
+  sort(Coeffs0);
 
   SmallVector<unsigned> SortedIndicesDraft;
   SortedIndicesDraft.resize(Sz);
-  auto updateSortedIndices =
+  auto UpdateSortedIndices =
       [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
           SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
           int64_t OffsetNum) {
-        unsigned Num = 0;
-        for (unsigned Idx : SortedIndicesForOffset) {
+        for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
           SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
               IndicesInAllPointerOps[Idx];
-          ++Num;
         }
       };
 
-  updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
 
   SmallVector<int64_t> Coeffs;
   SmallVector<unsigned> SortedIndicesForOffset;
-  for (int i = 1; i < NumOffsets; ++i) {
+  for (int I : seq<int>(1, NumOffsets)) {
     Coeffs.clear();
     SortedIndicesForOffset.clear();
 
-    int64_t Offset = SortedOffsetsV[i];
+    int64_t Offset = SortedOffsetsV[I];
     SmallVector<Value *> &PointerOpsForOffset =
         OffsetToPointerOpIdxMap[Offset].first;
     SmallVector<unsigned> &IndicesInAllPointerOps =
@@ -6584,26 +6582,23 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     const SCEV *StrideWithinGroup = calculateRtStride(
         PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
 
-    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
       return false;
-    }
     if (Coeffs.size() != NumCoeffs0)
       return false;
-    llvm::sort(Coeffs);
-    for (unsigned i = 0; i < NumCoeffs0; ++i) {
-      if (Coeffs[i] != Coeffs0[i])
+    sort(Coeffs);
+    for (int J : seq<int>(0, NumCoeffs0)) {
+      if (Coeffs[J] != Coeffs0[J])
         return false;
     }
 
-    updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
   }
 
   SortedIndices.clear();
   SortedIndices = SortedIndicesDraft;
-  if (SPtrInfo) {
-    SPtrInfo->StrideSCEV = Stride0;
-    SPtrInfo->Ty = StridedLoadTy;
-  }
+  SPtrInfo.StrideSCEV = Stride0;
+  SPtrInfo.Ty = StridedLoadTy;
   return true;
 }
 
@@ -6937,20 +6932,21 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 // Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
-    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
     int64_t Diff, Value *Ptr0, Value *PtrN) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
   SortedOffsetsFromBase.resize(Sz);
-  for (unsigned i = 0; i < Sz; ++i) {
+  for (unsigned I : seq<int>(0, Sz)) {
     Value *Ptr =
-        SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
-    SortedOffsetsFromBase[i] =
+        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+    SortedOffsetsFromBase[I] =
         *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
   }
 
   // Find where the first group ends.
-  assert(SortedOffsetsFromBase.size() > 1);
+  assert(SortedOffsetsFromBase.size() > 1 &&
+         "Trying to generate strided load for less than 2 loads");
   int64_t StrideWithinGroup =
       SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
   unsigned GroupSize = 1;
@@ -6997,7 +6993,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
     if (CurrentGroupStartIdx != Sz)
       return false;
 
-    auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+    auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
                           int64_t StrideWithinGroup) -> bool {
       unsigned GroupEndIdx = StartIdx + 1;
       for (; GroupEndIdx != Sz; ++GroupEndIdx) {
@@ -7006,10 +7002,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
             StrideWithinGroup)
           break;
       }
-      return (GroupEndIdx - StartIdx == GroupSize0);
+      return GroupEndIdx - StartIdx == GroupSize0;
     };
-    for (unsigned i = 0; i < Sz; i += GroupSize) {
-      if (!checkGroup(i, GroupSize, StrideWithinGroup))
+    for (unsigned I = 0; I < Sz; I += GroupSize) {
+      if (!CheckGroup(I, GroupSize, StrideWithinGroup))
         return false;
     }
   }
@@ -7047,11 +7043,9 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
          has_single_bit(AbsoluteDiff))) &&
        AbsoluteDiff > VecSz) ||
       Diff == -(static_cast<int>(VecSz) - 1)) {
-    if (SPtrInfo) {
-      Type *StrideTy = DL->getIndexType(Ptr0->getType());
-      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
-      SPtrInfo->Ty = StridedLoadTy;
-    }
+    Type *StrideTy = DL->getIndexType(Ptr0->getType());
+    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+    SPtrInfo.Ty = StridedLoadTy;
     return true;
   }
   return false;
@@ -7099,7 +7093,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   if (!IsSorted) {
     if (Sz > MinProfitableStridedLoads &&
         analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
-                                 SPtrInfo))
+                                 *SPtrInfo))
       return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7134,7 +7128,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
     if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
-                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
+                                       Order, *SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -10004,7 +9998,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
+    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
   assert(S.getMainOp() &&
          "Expected instructions with same/alternate opcodes only.");
 
@@ -10106,7 +10100,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         });
       });
     };
-    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
+    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, &SPtrInfo)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::CompressVectorize:
@@ -11581,7 +11575,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
   SmallVector<Value *> PointerOps;
   StridedPtrInfo SPtrInfo;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
   if (State == TreeEntry::NeedToGather) {
     newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
     return;
@@ -19747,16 +19741,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         Type *StrideTy = DL->getIndexType(PO->getType());
         Value *StrideVal;
 
-        StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
         StridedLoadTy = SPtrInfo.Ty;
-        assert(StridedLoadTy);
+        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
         unsigned StridedLoadEC =
             StridedLoadTy->getElementCount().getKnownMinValue();
 
         Value *Stride = SPtrInfo.StrideVal;
         if (!Stride) {
           const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
-          assert(StrideSCEV);
+          assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
           SCEVExpander Expander(*SE, *DL, "strided-load-vec");
           Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                           &*Builder.GetInsertPoint());

>From 6338bd359c44dbb06fc627b472771de69a4df234 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:26:58 -0700
Subject: [PATCH 05/14] address review comments.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 594ded3365d89..bb4d6e41a2369 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2230,11 +2230,29 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
+  // Suppose we are given pointers of the form: %b + x * %s + y * %c
+  // where %c is constant. Check if the pointers can be rearranged as follows:
+  //  %b + 0 * %s + 0
+  //  %b + 0 * %s + 1
+  //  %b + 0 * %s + 2
+  //  ...
+  //  %b + 0 * %s + w
+  //
+  //  %b + 1 * %s + 0
+  //  %b + 1 * %s + 1
+  //  %b + 1 * %s + 2
+  //  ...
+  //  %b + 1 * %s + w
+  //  ...
+  //
+  //  If the pointers can be rearanged in the above pattern, it means that the
+  //  memory can be accessed with a strided loads of width `w` and stride `%s`.
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo &SPtrInfo) const;
 
+  // Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
@@ -6523,9 +6541,8 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     return false;
 
   SmallVector<int64_t> SortedOffsetsV;
-  for (auto [K, V] : OffsetToPointerOpIdxMap) {
+  for (auto [K, V] : OffsetToPointerOpIdxMap)
     SortedOffsetsV.push_back(K);
-  }
   sort(SortedOffsetsV);
   if (NumOffsets > 1) {
     int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
@@ -6582,17 +6599,16 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     const SCEV *StrideWithinGroup = calculateRtStride(
         PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
 
-    if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
+    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
       return false;
     if (Coeffs.size() != NumCoeffs0)
       return false;
     sort(Coeffs);
-    for (int J : seq<int>(0, NumCoeffs0)) {
-      if (Coeffs[J] != Coeffs0[J])
-        return false;
-    }
+    if (Coeffs != Coeffs0)
+      return false
 
-    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
+          UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
+                              I);
   }
 
   SortedIndices.clear();
@@ -6937,7 +6953,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
   SortedOffsetsFromBase.resize(Sz);
-  for (unsigned I : seq<int>(0, Sz)) {
+  for (unsigned I : seq<int>(Sz)) {
     Value *Ptr =
         SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
     SortedOffsetsFromBase[I] =

>From 6fc717e26ef9d4d5a6e86226712526f2071c3a1f Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:56:01 -0700
Subject: [PATCH 06/14] added cost estimation

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bb4d6e41a2369..085efad577ed5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6605,7 +6605,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
     sort(Coeffs);
     if (Coeffs != Coeffs0)
-      return false
+      return false;
 
           UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
                               I);
@@ -15042,11 +15042,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         }
         break;
       case TreeEntry::StridedVectorize: {
+        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
+        if (StridedLoadTy != VecTy)
+          VecLdCost +=
+              TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+                                    getCastContextHint(*E), CostKind);
+
         break;
       }
       case TreeEntry::CompressVectorize: {

>From 113396769c16783193b726fb40f0588925a19312 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 09:13:25 -0700
Subject: [PATCH 07/14] format

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 085efad577ed5..48b7d5642d7ee 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6607,8 +6607,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
     if (Coeffs != Coeffs0)
       return false;
 
-          UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
-                              I);
+    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
   }
 
   SortedIndices.clear();

>From c54d9a0baf4c39f9cb23396a57b6ed8f36cf1900 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:43:09 -0700
Subject: [PATCH 08/14] change references back to pointers because
 canVectorizeLoads can be called with nullptr

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 48b7d5642d7ee..342e940c49c9d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2250,13 +2250,13 @@ class BoUpSLP {
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
-                                StridedPtrInfo &SPtrInfo) const;
+                                StridedPtrInfo *SPtrInfo) const;
 
   // Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
-                                      StridedPtrInfo &SPtrInfo, int64_t Diff,
+                                      StridedPtrInfo *SPtrInfo, int64_t Diff,
                                       Value *Ptr0, Value *PtrN) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
@@ -6499,7 +6499,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
-                                       StridedPtrInfo &SPtrInfo) const {
+                                       StridedPtrInfo *SPtrInfo) const {
   // Group the pointers by constant offset.
   DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
@@ -6612,8 +6612,10 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
 
   SortedIndices.clear();
   SortedIndices = SortedIndicesDraft;
-  SPtrInfo.StrideSCEV = Stride0;
-  SPtrInfo.Ty = StridedLoadTy;
+  if (SPtrInfo) {
+    SPtrInfo->StrideSCEV = Stride0;
+    SPtrInfo->Ty = StridedLoadTy;
+  }
   return true;
 }
 
@@ -6947,7 +6949,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
 // Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
-    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
+    SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
     int64_t Diff, Value *Ptr0, Value *PtrN) const {
   const unsigned Sz = PointerOps.size();
   SmallVector<int64_t> SortedOffsetsFromBase;
@@ -7059,8 +7061,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
        AbsoluteDiff > VecSz) ||
       Diff == -(static_cast<int>(VecSz) - 1)) {
     Type *StrideTy = DL->getIndexType(Ptr0->getType());
-    SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
-    SPtrInfo.Ty = StridedLoadTy;
+    if (SPtrInfo) {
+      SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+      SPtrInfo->Ty = StridedLoadTy;
+    }
     return true;
   }
   return false;
@@ -7108,7 +7112,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   if (!IsSorted) {
     if (Sz > MinProfitableStridedLoads &&
         analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
-                                 *SPtrInfo))
+                                 SPtrInfo))
       return LoadsState::StridedVectorize;
 
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7143,7 +7147,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       return LoadsState::CompressVectorize;
     // Simple check if not a strided access - clear order.
     if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
-                                       Order, *SPtrInfo, *Diff, Ptr0, PtrN))
+                                       Order, SPtrInfo, *Diff, Ptr0, PtrN))
       return LoadsState::StridedVectorize;
   }
   if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||

>From 79d533af4f6a62db9dc59f050a0c3b8fa9164514 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:52:28 -0700
Subject: [PATCH 09/14] three slashes for comments, deleted wrong comment.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 99 ++++++++++---------
 1 file changed, 50 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 342e940c49c9d..ff5a370387e43 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1918,17 +1918,15 @@ class BoUpSLP {
   class ShuffleCostEstimator;
   class ShuffleInstructionBuilder;
 
-  // If we decide to generate strided load / store, this struct contains all the
-  // necessary info. It's fields are calculated by analyzeRtStrideCandidate and
-  // analyzeConstantStrideCandidate. Note that Stride can be given either as a
-  // SCEV or as a Value if it already exists.
-  // To get the stride in bytes, StrideVal (or value obtained from StrideSCEV)
-  // has to by multiplied by the size of element of FixedVectorType.
+  /// If we decide to generate strided load / store, this struct contains all
+  /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
+  /// and analyzeConstantStrideCandidate. Note that Stride can be given either
+  /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
+  /// StrideVal (or value obtained from StrideSCEV) has to be multiplied by the
+  /// size of element of FixedVectorType.
   struct StridedPtrInfo {
     Value *StrideVal = nullptr;
     const SCEV *StrideSCEV = nullptr;
-    // Represents the ammount which needs to be added to the base pointer of
-    // strided load.
     FixedVectorType *Ty = nullptr;
   };
   SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
@@ -2230,29 +2228,29 @@ class BoUpSLP {
   ///       may not be necessary.
   bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
-  // Suppose we are given pointers of the form: %b + x * %s + y * %c
-  // where %c is constant. Check if the pointers can be rearranged as follows:
-  //  %b + 0 * %s + 0
-  //  %b + 0 * %s + 1
-  //  %b + 0 * %s + 2
-  //  ...
-  //  %b + 0 * %s + w
-  //
-  //  %b + 1 * %s + 0
-  //  %b + 1 * %s + 1
-  //  %b + 1 * %s + 2
-  //  ...
-  //  %b + 1 * %s + w
-  //  ...
-  //
-  //  If the pointers can be rearanged in the above pattern, it means that the
-  //  memory can be accessed with a strided loads of width `w` and stride `%s`.
+  /// Suppose we are given pointers of the form: %b + x * %s + y * %c
+  /// where %c is constant. Check if the pointers can be rearranged as follows:
+  ///  %b + 0 * %s + 0
+  ///  %b + 0 * %s + 1
+  ///  %b + 0 * %s + 2
+  ///  ...
+  ///  %b + 0 * %s + (w - 1)
+  ///
+  ///  %b + 1 * %s + 0
+  ///  %b + 1 * %s + 1
+  ///  %b + 1 * %s + 2
+  ///  ...
+  ///  %b + 1 * %s + (w - 1)
+  ///  ...
+  ///
+  ///  If the pointers can be rearranged in the above pattern, it means that the
+  ///  memory can be accessed with strided loads of width `w` and stride `%s`.
   bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                 Align CommonAlignment,
                                 SmallVectorImpl<unsigned> &SortedIndices,
                                 StridedPtrInfo *SPtrInfo) const;
 
-  // Same as analyzeRtStrideCandidate, but for constant strides.
+  /// Same as analyzeRtStrideCandidate, but for constant strides.
   bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ElemTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
@@ -6479,23 +6477,23 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   return Stride;
 }
 
-// Suppose we are given pointers of the form: %b + x * %s + y * %c
-// where %c is constant. Check if the pointers can be rearranged as follows:
-//  %b + 0 * %s + 0
-//  %b + 0 * %s + 1
-//  %b + 0 * %s + 2
-//  ...
-//  %b + 0 * %s + w
-//
-//  %b + 1 * %s + 0
-//  %b + 1 * %s + 1
-//  %b + 1 * %s + 2
-//  ...
-//  %b + 1 * %s + w
-//  ...
-//
-//  If the pointers can be rearanged in the above pattern, it means that the
-//  memory can be accessed with a strided loads of width `w` and stride `%s`.
+/// Suppose we are given pointers of the form: %b + x * %s + y * %c
+/// where %c is constant. Check if the pointers can be rearranged as follows:
+///  %b + 0 * %s + 0
+///  %b + 0 * %s + 1
+///  %b + 0 * %s + 2
+///  ...
+///  %b + 0 * %s + (w - 1)
+///
+///  %b + 1 * %s + 0
+///  %b + 1 * %s + 1
+///  %b + 1 * %s + 2
+///  ...
+///  %b + 1 * %s + (w - 1)
+///  ...
+///
+///  If the pointers can be rearranged in the above pattern, it means that the
+///  memory can be accessed with strided loads of width `w` and stride `%s`.
 bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        Type *ElemTy, Align CommonAlignment,
                                        SmallVectorImpl<unsigned> &SortedIndices,
@@ -6946,7 +6944,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                               CompressMask, LoadVecTy);
 }
 
-// Same as analyzeRtStrideCandidate, but for constant strides.
+/// Same as analyzeRtStrideCandidate, but for constant strides.
 bool BoUpSLP::analyzeConstantStrideCandidate(
     ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
     SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
@@ -15051,12 +15049,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+            Instruction::Load, VecTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
-        if (StridedLoadTy != VecTy)
-          VecLdCost +=
-              TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
-                                    getCastContextHint(*E), CostKind);
+        //VecLdCost = TTI->getStridedMemoryOpCost(
+        //    Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+        //    /*VariableMask=*/false, CommonAlignment, CostKind);
+        //if (StridedLoadTy != VecTy)
+        //  VecLdCost +=
+        //      TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+        //                            getCastContextHint(*E), CostKind);
 
         break;
       }

>From 3fdd0b00c0129e0bf4108db1510ad68eb10a0d8c Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:10:11 -0700
Subject: [PATCH 10/14] revert a change made by mistake.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ff5a370387e43..b6000afb0db89 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6507,15 +6507,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
       return false;
 
     const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
-    if (!Add)
-      return false;
     int64_t Offset = 0;
-    for (int I : seq<int>(Add->getNumOperands())) {
-      auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
-      if (!SC)
-        continue;
-      Offset = SC->getAPInt().getSExtValue();
-      break;
+    if (Add) {
+      for (int I : seq<int>(Add->getNumOperands())) {
+        const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+        if (!SC)
+          continue;
+        Offset = SC->getAPInt().getSExtValue();
+        break;
+      }
     }
     OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
     OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);

>From 0a7115f26264e80e44a6920bf4576ca58b0c8202 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:13:57 -0700
Subject: [PATCH 11/14] Another DenseMap -> SmallDenseMap

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b6000afb0db89..3f49f7528227b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6499,7 +6499,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                        SmallVectorImpl<unsigned> &SortedIndices,
                                        StridedPtrInfo *SPtrInfo) const {
   // Group the pointers by constant offset.
-  DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
       OffsetToPointerOpIdxMap;
   for (auto [Idx, Ptr] : enumerate(PointerOps)) {
     const SCEV *PtrSCEV = SE->getSCEV(Ptr);

>From 16840691be002d482935a1fc7c9e216c64a34a04 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:21:53 -0700
Subject: [PATCH 12/14] put cost estimation back and make the order of type
 operands correct.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3f49f7528227b..e25bff8d40322 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15049,15 +15049,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getStridedMemoryOpCost(
-            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind);
-        //VecLdCost = TTI->getStridedMemoryOpCost(
-        //    Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
-        //    /*VariableMask=*/false, CommonAlignment, CostKind);
-        //if (StridedLoadTy != VecTy)
-        //  VecLdCost +=
-        //      TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
-        //                            getCastContextHint(*E), CostKind);
+        if (StridedLoadTy != VecTy)
+          VecLdCost +=
+              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+                                    getCastContextHint(*E), CostKind);
 
         break;
       }

>From c2492e6d9730e80ef0810dcdcdce40af2ef28f1b Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Sat, 30 Aug 2025 02:05:37 -0700
Subject: [PATCH 13/14] rebased on latest main

---
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 8c53bd9265302..5e300182657d5 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,11 +37,6 @@ static cl::opt<unsigned> SLPMaxVF(
         "exclusively by SLP vectorizer."),
     cl::Hidden);
 
-static cl::opt<bool> SLPPreferAltOpcVectorization(
-    "riscv-v-slp-prefer-alt-opc-vectorization",
-    cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
-    cl::Hidden);
-
 static cl::opt<unsigned>
     RVVMinTripCount("riscv-v-min-trip-count",
                     cl::desc("Set the lower bound of a trip count to decide on "
@@ -3045,7 +3040,3 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   }
   return Options;
 }
-
-bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
-  return SLPPreferAltOpcVectorization;
-}

>From 001c120d2cf04f2272881ad2cbea8fb4760e6bb8 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Sun, 31 Aug 2025 12:19:00 -0700
Subject: [PATCH 14/14] fixed RUN lines in x264 test

---
 llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
index 38cf1214081fa..33249a8e66657 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \
-; RUN: -riscv-v-slp-prefer-alt-opc-vectorization=true \
 ; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s
 ; Function Attrs: nounwind uwtable vscale_range(8,1024)
 define i32 @x264_pixel_satd_8x4(ptr %pix1, i32  %i_pix1, ptr  %pix2, i32  %i_pix2) {