[llvm] [SLPVectorizer] Widen strided loads. (PR #153074)
Mikhail Gudim via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 30 02:07:17 PDT 2025
https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/153074
From b2fea3b7fe582fbaf936beb727a5ca351e3dcfb9 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 8 Aug 2025 07:52:29 -0700
Subject: [PATCH 01/13] [SLPVectorizer] Widen strided loads.
Currently SLPVectorizer can generate strided loads only for this
pattern:
```
load %base + 0 * %stride
load %base + 1 * %stride
...
load %base + n * %stride
```
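For example, four i8 loads at offsets 0, %stride, 2 * %stride and 3 * %stride
from %base are currently emitted as a single strided load whose element type is
the scalar type. A hypothetical sketch with invented names (the intrinsic takes
the stride in bytes, which equals %stride here because the elements are i8):
```
%v = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 %base, i64 %stride, <4 x i1> splat (i1 true), i32 4)
```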
In this PR we extend it to this pattern:
```
; load w consecutive elements starting at %base
load %base + 0 * %stride + 0
load %base + 0 * %stride + 1
load %base + 0 * %stride + 2
...
load %base + 0 * %stride + (w - 1)
; load w consecutive elements starting at %base + 1 * %stride
load %base + 1 * %stride + 0
load %base + 1 * %stride + 1
load %base + 1 * %stride + 2
...
load %base + 1 * %stride + (w - 1)
...
; load w consecutive elements starting at %base + n * %stride
load %base + n * %stride + 0
load %base + n * %stride + 1
load %base + n * %stride + 2
...
load %base + n * %stride + (w - 1)
```
This works for both run-time and constant strides.
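For example, with four groups of w = 4 consecutive i8 elements, the sixteen
loads can be emitted as one strided load of four i32 elements (each i32 covers
one group of consecutive bytes), followed by a bitcast back to the original
element type. A sketch with invented names, mirroring the CHECK lines in the
tests below:
```
%wide = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %base, i64 %stride, <4 x i1> splat (i1 true), i32 4)
%vec = bitcast <4 x i32> %wide to <16 x i8>
```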
---
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 9 +
.../Transforms/Vectorize/SLPVectorizer.cpp | 538 +++++++++++++-----
.../SLPVectorizer/RISCV/x264-satd-8x4.ll | 483 ++++++++++++++++
3 files changed, 872 insertions(+), 158 deletions(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5e300182657d5..8c53bd9265302 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,6 +37,11 @@ static cl::opt<unsigned> SLPMaxVF(
"exclusively by SLP vectorizer."),
cl::Hidden);
+static cl::opt<bool> SLPPreferAltOpcVectorization(
+ "riscv-v-slp-prefer-alt-opc-vectorization",
+ cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
+ cl::Hidden);
+
static cl::opt<unsigned>
RVVMinTripCount("riscv-v-min-trip-count",
cl::desc("Set the lower bound of a trip count to decide on "
@@ -3040,3 +3045,7 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
}
return Options;
}
+
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+ return SLPPreferAltOpcVectorization;
+}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 80528775b30a4..22ed2faee9916 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1918,6 +1918,21 @@ class BoUpSLP {
class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
+ // If we decide to generate a strided load / store, this struct contains all
+ // the necessary info. Its fields are computed by analyzeRtStrideCandidate
+ // and analyzeConstantStrideCandidate. Note that the stride can be given
+ // either as a SCEV or, if it already exists, as a Value.
+ // To get the stride in bytes, StrideVal (or the value expanded from
+ // StrideSCEV) has to be multiplied by the element size of Ty.
+ struct StridedPtrInfo {
+ Value *StrideVal = nullptr;
+ const SCEV *StrideSCEV = nullptr;
+ // The FixedVectorType to use for the generated strided load. Its element
+ // width covers one group of consecutive scalar elements.
+ FixedVectorType *Ty = nullptr;
+ };
+ DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+
public:
/// Tracks the state we can represent the loads in the given sequence.
enum class LoadsState {
@@ -2078,6 +2093,7 @@ class BoUpSLP {
UserIgnoreList = nullptr;
PostponedGathers.clear();
ValueToGatherNodes.clear();
+ TreeEntryToStridedPtrInfoMap.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -2214,6 +2230,17 @@ class BoUpSLP {
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
+ bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
+ Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ StridedPtrInfo *SPtrInfo) const;
+
+ bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
+ Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ StridedPtrInfo *SPtrInfo, int64_t Diff,
+ Value *Ptr0, Value *PtrN) const;
+
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
/// \param VL list of loads.
@@ -2227,6 +2254,7 @@ class BoUpSLP {
LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
SmallVectorImpl<unsigned> &Order,
SmallVectorImpl<Value *> &PointerOps,
+ StridedPtrInfo *SPtrInfo = nullptr,
unsigned *BestVF = nullptr,
bool TryRecursiveCheck = true) const;
@@ -4471,11 +4499,10 @@ class BoUpSLP {
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
- TreeEntry::EntryState
- getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
- bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps);
+ TreeEntry::EntryState getScalarsVectorizationState(
+ const InstructionsState &S, ArrayRef<Value *> VL,
+ bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6324,18 +6351,12 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
});
}
-/// Checks if the provided list of pointers \p Pointers represents the strided
-/// pointers for type ElemTy. If they are not, std::nullopt is returned.
-/// Otherwise, if \p Inst is not specified, just initialized optional value is
-/// returned to show that the pointers represent strided pointers. If \p Inst
-/// specified, the runtime stride is materialized before the given \p Inst.
-/// \returns std::nullopt if the pointers are not pointers with the runtime
-/// stride, nullptr or actual stride value, otherwise.
-static std::optional<Value *>
-calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
- const DataLayout &DL, ScalarEvolution &SE,
- SmallVectorImpl<unsigned> &SortedIndices,
- Instruction *Inst = nullptr) {
+/// Returns a SCEV expression for the stride if PointerOps is a set of strided
+/// pointers, or nullptr otherwise.
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ SmallVectorImpl<int64_t> &Coeffs) {
SmallVector<const SCEV *> SCEVs;
const SCEV *PtrSCEVLowest = nullptr;
const SCEV *PtrSCEVHighest = nullptr;
@@ -6344,7 +6365,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
for (Value *Ptr : PointerOps) {
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
if (!PtrSCEV)
- return std::nullopt;
+ return nullptr;
SCEVs.push_back(PtrSCEV);
if (!PtrSCEVLowest && !PtrSCEVHighest) {
PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
@@ -6352,14 +6373,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
}
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Diff))
- return std::nullopt;
+ return nullptr;
if (Diff->isNonConstantNegative()) {
PtrSCEVLowest = PtrSCEV;
continue;
}
const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
if (isa<SCEVCouldNotCompute>(Diff1))
- return std::nullopt;
+ return nullptr;
if (Diff1->isNonConstantNegative()) {
PtrSCEVHighest = PtrSCEV;
continue;
@@ -6368,7 +6389,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
// Dist = PtrSCEVHighest - PtrSCEVLowest;
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
if (isa<SCEVCouldNotCompute>(Dist))
- return std::nullopt;
+ return nullptr;
int Size = DL.getTypeStoreSize(ElemTy);
auto TryGetStride = [&](const SCEV *Dist,
const SCEV *Multiplier) -> const SCEV * {
@@ -6389,10 +6410,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
Stride = TryGetStride(Dist, Sz);
if (!Stride)
- return std::nullopt;
+ return nullptr;
}
if (!Stride || isa<SCEVConstant>(Stride))
- return std::nullopt;
+ return nullptr;
// Iterate through all pointers and check if all distances are
// unique multiple of Stride.
using DistOrdPair = std::pair<int64_t, int>;
@@ -6406,42 +6427,184 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
const SCEV *Coeff = TryGetStride(Diff, Stride);
if (!Coeff)
- return std::nullopt;
+ return nullptr;
const auto *SC = dyn_cast<SCEVConstant>(Coeff);
if (!SC || isa<SCEVCouldNotCompute>(SC))
- return std::nullopt;
+ return nullptr;
+ Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue());
if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
SE.getMulExpr(Stride, SC)))
->isZero())
- return std::nullopt;
+ return nullptr;
Dist = SC->getAPInt().getZExtValue();
- }
+ } else
+ Coeffs.push_back(0);
// If the strides are not the same or repeated, we can't vectorize.
if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
- return std::nullopt;
+ return nullptr;
auto Res = Offsets.emplace(Dist, Cnt);
if (!Res.second)
- return std::nullopt;
+ return nullptr;
// Consecutive order if the inserted element is the last one.
IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
++Cnt;
}
if (Offsets.size() != SCEVs.size())
- return std::nullopt;
+ return nullptr;
SortedIndices.clear();
- if (!IsConsecutive) {
- // Fill SortedIndices array only if it is non-consecutive.
- SortedIndices.resize(PointerOps.size());
- Cnt = 0;
- for (const std::pair<int64_t, int> &Pair : Offsets) {
- SortedIndices[Cnt] = Pair.second;
- ++Cnt;
+ SortedIndices.resize(PointerOps.size());
+ Cnt = 0;
+ for (const std::pair<int64_t, int> &Pair : Offsets) {
+ SortedIndices[Cnt] = Pair.second;
+ ++Cnt;
+ }
+ return Stride;
+}
+
+// Suppose we are given pointers of the form: %b + x * %s + y * %c
+// where %c is constant. Check if the pointers can be rearranged as follows:
+// %b + 0 * %s + 0
+// %b + 0 * %s + 1
+// %b + 0 * %s + 2
+// ...
+// %b + 0 * %s + (w - 1)
+//
+// %b + 1 * %s + 0
+// %b + 1 * %s + 1
+// %b + 1 * %s + 2
+// ...
+// %b + 1 * %s + (w - 1)
+// ...
+//
+// If the pointers can be rearranged into the above pattern, the memory can be
+// accessed with strided loads of width `w` and stride `%s`.
+bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
+ Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices,
+ StridedPtrInfo *SPtrInfo) const {
+ // Group the pointers by constant offset.
+ DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+ OffsetToPointerOpIdxMap;
+ for (auto [Idx, Ptr] : enumerate(PointerOps)) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ if (!PtrSCEV)
+ return false;
+
+ const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+ int64_t Offset = 0;
+ if (Add) {
+ for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
+ if (!SC)
+ continue;
+ Offset = (int64_t)(SC->getAPInt().getLimitedValue());
+ break;
+ }
}
+ OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
+ OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
}
- if (!Inst)
- return nullptr;
- SCEVExpander Expander(SE, DL, "strided-load-vec");
- return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+ int NumOffsets = OffsetToPointerOpIdxMap.size();
+
+ const unsigned Sz = PointerOps.size();
+ unsigned VecSz = Sz;
+ Type *ScalarTy = ElemTy;
+ if (NumOffsets > 1) {
+ if (Sz % NumOffsets != 0)
+ return false;
+ VecSz = Sz / NumOffsets;
+ ScalarTy = Type::getIntNTy(SE->getContext(),
+ DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+ NumOffsets);
+ }
+ FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+ if (!TTI->isTypeLegal(StridedLoadTy) ||
+ !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+ return false;
+
+ SmallVector<int> SortedOffsetsV;
+ for (auto [K, V] : OffsetToPointerOpIdxMap) {
+ SortedOffsetsV.push_back(K);
+ }
+ llvm::sort(SortedOffsetsV);
+ if (NumOffsets > 1) {
+ int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+ if (CommonDiff != 1)
+ return false;
+ for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
+ if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+ return false;
+ }
+ }
+
+ int64_t LowestOffset = SortedOffsetsV[0];
+ SmallVector<Value *> &PointerOps0 =
+ OffsetToPointerOpIdxMap[LowestOffset].first;
+ SmallVector<unsigned> &IndicesInAllPointerOps0 =
+ OffsetToPointerOpIdxMap[LowestOffset].second;
+
+ SmallVector<int64_t> Coeffs0;
+ SmallVector<unsigned> SortedIndicesForOffset0;
+ const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE,
+ SortedIndicesForOffset0, Coeffs0);
+ if (!Stride0)
+ return false;
+ unsigned NumCoeffs0 = Coeffs0.size();
+ if (NumCoeffs0 * NumOffsets != Sz)
+ return false;
+ llvm::sort(Coeffs0);
+
+ SmallVector<unsigned> SortedIndicesDraft;
+ SortedIndicesDraft.resize(Sz);
+ auto updateSortedIndices =
+ [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
+ SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
+ int64_t OffsetNum) {
+ unsigned Num = 0;
+ for (unsigned Idx : SortedIndicesForOffset) {
+ SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
+ IndicesInAllPointerOps[Idx];
+ ++Num;
+ }
+ };
+
+ updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+
+ SmallVector<int64_t> Coeffs;
+ SmallVector<unsigned> SortedIndicesForOffset;
+ for (int i = 1; i < NumOffsets; ++i) {
+ Coeffs.clear();
+ SortedIndicesForOffset.clear();
+
+ int64_t Offset = SortedOffsetsV[i];
+ SmallVector<Value *> &PointerOpsForOffset =
+ OffsetToPointerOpIdxMap[Offset].first;
+ SmallVector<unsigned> &IndicesInAllPointerOps =
+ OffsetToPointerOpIdxMap[Offset].second;
+ const SCEV *StrideWithinGroup = calculateRtStride(
+ PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
+
+ if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+ return false;
+ }
+ if (Coeffs.size() != NumCoeffs0)
+ return false;
+ llvm::sort(Coeffs);
+ for (unsigned i = 0; i < NumCoeffs0; ++i) {
+ if (Coeffs[i] != Coeffs0[i])
+ return false;
+ }
+
+ updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+ }
+
+ SortedIndices.clear();
+ SortedIndices = SortedIndicesDraft;
+ if (SPtrInfo) {
+ SPtrInfo->StrideSCEV = Stride0;
+ SPtrInfo->Ty = StridedLoadTy;
+ }
+ return true;
}
static std::pair<InstructionCost, InstructionCost>
@@ -6771,77 +6934,133 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
CompressMask, LoadVecTy);
}
-/// Checks if strided loads can be generated out of \p VL loads with pointers \p
-/// PointerOps:
-/// 1. Target with strided load support is detected.
-/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
-/// potential stride <= MaxProfitableLoadStride and the potential stride is
-/// power-of-2 (to avoid perf regressions for the very small number of loads)
-/// and max distance > number of loads, or potential stride is -1.
-/// 3. The loads are ordered, or number of unordered loads <=
-/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
-/// to avoid extra costs for very expensive shuffles).
-/// 4. Any pointer operand is an instruction with the users outside of the
-/// current graph (for masked gathers extra extractelement instructions
-/// might be required).
-static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
- ArrayRef<unsigned> Order,
- const TargetTransformInfo &TTI, const DataLayout &DL,
- ScalarEvolution &SE,
- const bool IsAnyPointerUsedOutGraph,
- const int64_t Diff) {
- const size_t Sz = VL.size();
- const uint64_t AbsoluteDiff = std::abs(Diff);
- Type *ScalarTy = VL.front()->getType();
- auto *VecTy = getWidenedType(ScalarTy, Sz);
- if (IsAnyPointerUsedOutGraph ||
- (AbsoluteDiff > Sz &&
- (Sz > MinProfitableStridedLoads ||
- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
- AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
- Diff == -(static_cast<int64_t>(Sz) - 1)) {
- int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
- if (Diff != Stride * static_cast<int64_t>(Sz - 1))
+// Same as analyzeRtStrideCandidate, but for constant strides.
+bool BoUpSLP::analyzeConstantStrideCandidate(
+ ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
+ SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+ int64_t Diff, Value *Ptr0, Value *PtrN) const {
+ const unsigned Sz = PointerOps.size();
+ SmallVector<int64_t> SortedOffsetsFromBase;
+ SortedOffsetsFromBase.resize(Sz);
+ for (unsigned i = 0; i < Sz; ++i) {
+ Value *Ptr =
+ SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
+ SortedOffsetsFromBase[i] =
+ *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
+ }
+
+ // Find where the first group ends.
+ assert(SortedOffsetsFromBase.size() > 1 && "Expected at least two pointers");
+ int64_t StrideWithinGroup =
+ SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
+ unsigned GroupSize = 1;
+ for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) {
+ if (SortedOffsetsFromBase[GroupSize] -
+ SortedOffsetsFromBase[GroupSize - 1] !=
+ StrideWithinGroup)
+ break;
+ }
+ unsigned VecSz = Sz;
+ Type *ScalarTy = ElemTy;
+ int64_t StrideIntVal = StrideWithinGroup;
+ FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+
+ if (Sz != GroupSize) {
+ if (Sz % GroupSize != 0)
return false;
- Align Alignment =
- cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
- ->getAlign();
- if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
+ VecSz = Sz / GroupSize;
+
+ if (StrideWithinGroup != 1)
return false;
- Value *Ptr0;
- Value *PtrN;
- if (Order.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[Order.front()];
- PtrN = PointerOps[Order.back()];
- }
- // Iterate through all pointers and check if all distances are
- // unique multiple of Dist.
- SmallSet<int64_t, 4> Dists;
- for (Value *Ptr : PointerOps) {
- int64_t Dist = 0;
- if (Ptr == PtrN)
- Dist = Diff;
- else if (Ptr != Ptr0)
- Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
- // If the strides are not the same or repeated, we can't
- // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
+ ScalarTy = Type::getIntNTy(SE->getContext(),
+ DL->getTypeSizeInBits(ElemTy).getFixedValue() *
+ GroupSize);
+ StridedLoadTy = getWidenedType(ScalarTy, VecSz);
+ if (!TTI->isTypeLegal(StridedLoadTy) ||
+ !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+ return false;
+
+ unsigned PrevGroupStartIdx = 0;
+ unsigned CurrentGroupStartIdx = GroupSize;
+ int64_t StrideBetweenGroups =
+ SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
+ StrideIntVal = StrideBetweenGroups;
+ while (CurrentGroupStartIdx != Sz) {
+ if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
+ SortedOffsetsFromBase[PrevGroupStartIdx] !=
+ StrideBetweenGroups)
break;
+ PrevGroupStartIdx = CurrentGroupStartIdx;
+ CurrentGroupStartIdx += GroupSize;
+ }
+ if (CurrentGroupStartIdx != Sz)
+ return false;
+
+ auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+ int64_t StrideWithinGroup) -> bool {
+ unsigned GroupEndIdx = StartIdx + 1;
+ for (; GroupEndIdx != Sz; ++GroupEndIdx) {
+ if (SortedOffsetsFromBase[GroupEndIdx] -
+ SortedOffsetsFromBase[GroupEndIdx - 1] !=
+ StrideWithinGroup)
+ break;
+ }
+ return (GroupEndIdx - StartIdx == GroupSize0);
+ };
+ for (unsigned i = 0; i < Sz; i += GroupSize) {
+ if (!checkGroup(i, GroupSize, StrideWithinGroup))
+ return false;
}
- if (Dists.size() == Sz)
- return true;
+ }
+
+ // Try to generate strided load node if:
+ // 1. Target with strided load support is detected.
+ // 2. The number of loads is greater than MinProfitableStridedLoads,
+ // or the potential stride <= MaxProfitableLoadStride and the
+ // potential stride is power-of-2 (to avoid perf regressions for the very
+ // small number of loads) and max distance > number of loads, or potential
+ // stride is -1.
+ // 3. The loads are ordered, or number of unordered loads <=
+ // MaxProfitableUnorderedLoads, or loads are in reversed order.
+ // (this check is to avoid extra costs for very expensive shuffles).
+ // 4. Any pointer operand is an instruction with the users outside of the
+ // current graph (for masked gathers extra extractelement instructions
+ // might be required).
+
+ if (!TTI->isTypeLegal(StridedLoadTy) ||
+ !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
+ return false;
+
+ // Simple check if not a strided access - clear order.
+ bool IsPossibleStrided = Diff % (VecSz - 1) == 0;
+ auto IsAnyPointerUsedOutGraph =
+ IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
+ return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
+ return !isVectorized(U) && !MustGather.contains(U);
+ });
+ });
+ const uint64_t AbsoluteDiff = std::abs(Diff);
+ if (IsAnyPointerUsedOutGraph ||
+ ((VecSz > MinProfitableStridedLoads ||
+ (AbsoluteDiff <= MaxProfitableLoadStride * VecSz &&
+ has_single_bit(AbsoluteDiff))) &&
+ AbsoluteDiff > VecSz) ||
+ Diff == -(static_cast<int64_t>(VecSz) - 1)) {
+ if (SPtrInfo) {
+ Type *StrideTy = DL->getIndexType(Ptr0->getType());
+ SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+ SPtrInfo->Ty = StridedLoadTy;
+ }
+ return true;
}
return false;
}
-BoUpSLP::LoadsState
-BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
- SmallVectorImpl<unsigned> &Order,
- SmallVectorImpl<Value *> &PointerOps,
- unsigned *BestVF, bool TryRecursiveCheck) const {
+BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
+ ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo,
+ unsigned *BestVF, bool TryRecursiveCheck) const {
// Check that a vectorized load would load the same memory as a scalar
// load. For example, we don't want to vectorize loads that are smaller
// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
@@ -6878,11 +7097,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
auto *VecTy = getWidenedType(ScalarTy, Sz);
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
if (!IsSorted) {
- if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
- if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
- calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
- return LoadsState::StridedVectorize;
- }
+ if (Sz > MinProfitableStridedLoads &&
+ analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
+ SPtrInfo))
+ return LoadsState::StridedVectorize;
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
@@ -6915,17 +7133,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
}))
return LoadsState::CompressVectorize;
// Simple check if not a strided access - clear order.
- bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
- // Try to generate strided load node.
- auto IsAnyPointerUsedOutGraph =
- IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
- return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
- return !isVectorized(U) && !MustGather.contains(U);
- });
- });
- if (IsPossibleStrided &&
- isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
- IsAnyPointerUsedOutGraph, *Diff))
+ if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
+ Order, SPtrInfo, *Diff, Ptr0, PtrN))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7009,9 +7218,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
SmallVector<unsigned> Order;
SmallVector<Value *> PointerOps;
- LoadsState LS =
- canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
- /*TryRecursiveCheck=*/false);
+ LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
+ PointerOps, SPtrInfo, BestVF,
+ /*TryRecursiveCheck=*/false);
// Check that the sorted loads are consecutive.
if (LS == LoadsState::Gather) {
if (BestVF) {
@@ -9188,8 +9397,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Try to build vector load.
ArrayRef<Value *> Values(
reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
- LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
- PointerOps, &BestVF);
+ LoadsState LS =
+ canVectorizeLoads(Values, Slice.front(), CurrentOrder, PointerOps,
+ /*SPtrInfo=*/nullptr, &BestVF);
if (LS != LoadsState::Gather ||
(BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
if (LS == LoadsState::ScatterVectorize) {
@@ -9794,7 +10004,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps) {
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
assert(S.getMainOp() &&
"Expected instructions with same/alternate opcodes only.");
@@ -9896,7 +10106,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
});
});
};
- switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
+ switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
case LoadsState::CompressVectorize:
@@ -11355,8 +11565,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
OrdersType CurrentOrder;
SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
TreeEntry::EntryState State = getScalarsVectorizationState(
- S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+ S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
if (State == TreeEntry::NeedToGather) {
newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
return;
@@ -11516,6 +11727,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+ TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
TE->dump());
break;
@@ -13011,10 +13223,20 @@ void BoUpSLP::transformNodes() {
InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
Instruction::Load, VecTy, BaseLI->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
- if (StridedCost < OriginalVecCost)
+ if (StridedCost < OriginalVecCost) {
// Strided load is more profitable than consecutive load + reverse -
// transform the node to strided load.
+ // TODO: StrideTy =
+ // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+ StridedPtrInfo SPtrInfo;
+ Type *StrideTy = Type::getIntNTy(
+ SE->getContext(),
+ DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+ SPtrInfo.Ty = VecTy;
+ TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
E.State = TreeEntry::StridedVectorize;
+ }
}
break;
}
@@ -19472,6 +19694,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LoadInst *LI = cast<LoadInst>(VL0);
Instruction *NewLI;
+ FixedVectorType *StridedLoadTy = nullptr;
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
@@ -19509,40 +19732,37 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
PO = IsReverseOrder ? PtrN : Ptr0;
- std::optional<int64_t> Diff = getPointersDiff(
- VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
Type *StrideTy = DL->getIndexType(PO->getType());
Value *StrideVal;
- if (Diff) {
- int64_t Stride =
- *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
- StrideVal =
- ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
- DL->getTypeAllocSize(ScalarTy));
- } else {
- SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
- transform(E->Scalars, PointerOps.begin(), [](Value *V) {
- return cast<LoadInst>(V)->getPointerOperand();
- });
- OrdersType Order;
- std::optional<Value *> Stride =
- calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
- &*Builder.GetInsertPoint());
- Value *NewStride =
- Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
- StrideVal = Builder.CreateMul(
- NewStride,
- ConstantInt::get(
- StrideTy,
- (IsReverseOrder ? -1 : 1) *
- static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
- }
+
+ StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+ StridedLoadTy = SPtrInfo.Ty;
+ assert(StridedLoadTy && "Missing type for strided load");
+ unsigned StridedLoadEC =
+ StridedLoadTy->getElementCount().getKnownMinValue();
+
+ Value *Stride = SPtrInfo.StrideVal;
+ if (!Stride) {
+ const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+ assert(StrideSCEV && "Expected either StrideVal or StrideSCEV");
+ SCEVExpander Expander(*SE, *DL, "strided-load-vec");
+ Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+ &*Builder.GetInsertPoint());
+ }
+ Value *NewStride =
+ Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+ StrideVal = Builder.CreateMul(
+ NewStride, ConstantInt::get(
+ StrideTy, (IsReverseOrder ? -1 : 1) *
+ static_cast<int>(
+ DL->getTypeAllocSize(ScalarTy))));
Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
auto *Inst = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_load,
- {VecTy, PO->getType(), StrideTy},
- {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
- Builder.getInt32(E->Scalars.size())});
+ {StridedLoadTy, PO->getType(), StrideTy},
+ {PO, StrideVal,
+ Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+ Builder.getInt32(StridedLoadEC)});
Inst->addParamAttr(
/*ArgNo=*/0,
Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
@@ -19579,6 +19799,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
? NewLI
: ::propagateMetadata(NewLI, E->Scalars);
+ if (StridedLoadTy)
+ V = Builder.CreateBitOrPointerCast(V, VecTy);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
new file mode 100644
index 0000000000000..38cf1214081fa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll
@@ -0,0 +1,483 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \
+; RUN: -riscv-v-slp-prefer-alt-opc-vectorization=true \
+; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s
+; Function Attrs: nounwind uwtable vscale_range(8,1024)
+define i32 @x264_pixel_satd_8x4(ptr %pix1, i32 %i_pix1, ptr %pix2, i32 %i_pix2) {
+; CHECK-LABEL: define i32 @x264_pixel_satd_8x4(
+; CHECK-SAME: ptr [[PIX1:%.*]], i32 [[I_PIX1:%.*]], ptr [[PIX2:%.*]], i32 [[I_PIX2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64
+; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX1]], i64 [[TMP0]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX2]], i64 [[TMP4]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
+; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[IDX_EXT]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 [[TMP9]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[IDX_EXT63]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 [[TMP13]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP47:%.*]] = bitcast <4 x i32> [[TMP14]] to <16 x i8>
+; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
+; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]]
+; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16)
+; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]]
+; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP51]]
+; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 20, i32 21, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 29, i32 14, i32 15>
+; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT: [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]]
+; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP68:%.*]] = lshr <16 x i32> [[TMP67]], splat (i32 15)
+; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP68]], splat (i32 65537)
+; CHECK-NEXT: [[TMP70:%.*]] = mul nuw <16 x i32> [[TMP69]], splat (i32 65535)
+; CHECK-NEXT: [[TMP71:%.*]] = add <16 x i32> [[TMP70]], [[TMP67]]
+; CHECK-NEXT: [[TMP72:%.*]] = xor <16 x i32> [[TMP71]], [[TMP70]]
+; CHECK-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP72]])
+; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP73]], 65535
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP73]], 16
+; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
+; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
+; CHECK-NEXT: ret i32 [[SHR120]]
+;
+entry:
+ %idx.ext = sext i32 %i_pix1 to i64
+ %idx.ext63 = sext i32 %i_pix2 to i64
+ %0 = load i8, ptr %pix1, align 1
+ %conv = zext i8 %0 to i32
+ %1 = load i8, ptr %pix2, align 1
+ %conv2 = zext i8 %1 to i32
+ %sub = sub nsw i32 %conv, %conv2
+ %arrayidx3 = getelementptr inbounds nuw i8, ptr %pix1, i64 4
+ %2 = load i8, ptr %arrayidx3, align 1
+ %conv4 = zext i8 %2 to i32
+ %arrayidx5 = getelementptr inbounds nuw i8, ptr %pix2, i64 4
+ %3 = load i8, ptr %arrayidx5, align 1
+ %conv6 = zext i8 %3 to i32
+ %sub7 = sub nsw i32 %conv4, %conv6
+ %shl = shl nsw i32 %sub7, 16
+ %add = add nsw i32 %shl, %sub
+ %arrayidx8 = getelementptr inbounds nuw i8, ptr %pix1, i64 1
+ %4 = load i8, ptr %arrayidx8, align 1
+ %conv9 = zext i8 %4 to i32
+ %arrayidx10 = getelementptr inbounds nuw i8, ptr %pix2, i64 1
+ %5 = load i8, ptr %arrayidx10, align 1
+ %conv11 = zext i8 %5 to i32
+ %sub12 = sub nsw i32 %conv9, %conv11
+ %arrayidx13 = getelementptr inbounds nuw i8, ptr %pix1, i64 5
+ %6 = load i8, ptr %arrayidx13, align 1
+ %conv14 = zext i8 %6 to i32
+ %arrayidx15 = getelementptr inbounds nuw i8, ptr %pix2, i64 5
+ %7 = load i8, ptr %arrayidx15, align 1
+ %conv16 = zext i8 %7 to i32
+ %sub17 = sub nsw i32 %conv14, %conv16
+ %shl18 = shl nsw i32 %sub17, 16
+ %add19 = add nsw i32 %shl18, %sub12
+ %arrayidx20 = getelementptr inbounds nuw i8, ptr %pix1, i64 2
+ %8 = load i8, ptr %arrayidx20, align 1
+ %conv21 = zext i8 %8 to i32
+ %arrayidx22 = getelementptr inbounds nuw i8, ptr %pix2, i64 2
+ %9 = load i8, ptr %arrayidx22, align 1
+ %conv23 = zext i8 %9 to i32
+ %sub24 = sub nsw i32 %conv21, %conv23
+ %arrayidx25 = getelementptr inbounds nuw i8, ptr %pix1, i64 6
+ %10 = load i8, ptr %arrayidx25, align 1
+ %conv26 = zext i8 %10 to i32
+ %arrayidx27 = getelementptr inbounds nuw i8, ptr %pix2, i64 6
+ %11 = load i8, ptr %arrayidx27, align 1
+ %conv28 = zext i8 %11 to i32
+ %sub29 = sub nsw i32 %conv26, %conv28
+ %shl30 = shl nsw i32 %sub29, 16
+ %add31 = add nsw i32 %shl30, %sub24
+ %arrayidx32 = getelementptr inbounds nuw i8, ptr %pix1, i64 3
+ %12 = load i8, ptr %arrayidx32, align 1
+ %conv33 = zext i8 %12 to i32
+ %arrayidx34 = getelementptr inbounds nuw i8, ptr %pix2, i64 3
+ %13 = load i8, ptr %arrayidx34, align 1
+ %conv35 = zext i8 %13 to i32
+ %sub36 = sub nsw i32 %conv33, %conv35
+ %arrayidx37 = getelementptr inbounds nuw i8, ptr %pix1, i64 7
+ %14 = load i8, ptr %arrayidx37, align 1
+ %conv38 = zext i8 %14 to i32
+ %arrayidx39 = getelementptr inbounds nuw i8, ptr %pix2, i64 7
+ %15 = load i8, ptr %arrayidx39, align 1
+ %conv40 = zext i8 %15 to i32
+ %sub41 = sub nsw i32 %conv38, %conv40
+ %shl42 = shl nsw i32 %sub41, 16
+ %add43 = add nsw i32 %shl42, %sub36
+ %add44 = add nsw i32 %add19, %add
+ %sub45 = sub nsw i32 %add, %add19
+ %add46 = add nsw i32 %add43, %add31
+ %sub47 = sub nsw i32 %add31, %add43
+ %add48 = add nsw i32 %add46, %add44
+ %sub51 = sub nsw i32 %add44, %add46
+ %add55 = add nsw i32 %sub47, %sub45
+ %sub59 = sub nsw i32 %sub45, %sub47
+ %add.ptr = getelementptr inbounds i8, ptr %pix1, i64 %idx.ext
+ %add.ptr64 = getelementptr inbounds i8, ptr %pix2, i64 %idx.ext63
+ %16 = load i8, ptr %add.ptr, align 1
+ %conv.1 = zext i8 %16 to i32
+ %17 = load i8, ptr %add.ptr64, align 1
+ %conv2.1 = zext i8 %17 to i32
+ %sub.1 = sub nsw i32 %conv.1, %conv2.1
+ %arrayidx3.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 4
+ %18 = load i8, ptr %arrayidx3.1, align 1
+ %conv4.1 = zext i8 %18 to i32
+ %arrayidx5.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 4
+ %19 = load i8, ptr %arrayidx5.1, align 1
+ %conv6.1 = zext i8 %19 to i32
+ %sub7.1 = sub nsw i32 %conv4.1, %conv6.1
+ %shl.1 = shl nsw i32 %sub7.1, 16
+ %add.1 = add nsw i32 %shl.1, %sub.1
+ %arrayidx8.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 1
+ %20 = load i8, ptr %arrayidx8.1, align 1
+ %conv9.1 = zext i8 %20 to i32
+ %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 1
+ %21 = load i8, ptr %arrayidx10.1, align 1
+ %conv11.1 = zext i8 %21 to i32
+ %sub12.1 = sub nsw i32 %conv9.1, %conv11.1
+ %arrayidx13.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 5
+ %22 = load i8, ptr %arrayidx13.1, align 1
+ %conv14.1 = zext i8 %22 to i32
+ %arrayidx15.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 5
+ %23 = load i8, ptr %arrayidx15.1, align 1
+ %conv16.1 = zext i8 %23 to i32
+ %sub17.1 = sub nsw i32 %conv14.1, %conv16.1
+ %shl18.1 = shl nsw i32 %sub17.1, 16
+ %add19.1 = add nsw i32 %shl18.1, %sub12.1
+ %arrayidx20.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 2
+ %24 = load i8, ptr %arrayidx20.1, align 1
+ %conv21.1 = zext i8 %24 to i32
+ %arrayidx22.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 2
+ %25 = load i8, ptr %arrayidx22.1, align 1
+ %conv23.1 = zext i8 %25 to i32
+ %sub24.1 = sub nsw i32 %conv21.1, %conv23.1
+ %arrayidx25.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 6
+ %26 = load i8, ptr %arrayidx25.1, align 1
+ %conv26.1 = zext i8 %26 to i32
+ %arrayidx27.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 6
+ %27 = load i8, ptr %arrayidx27.1, align 1
+ %conv28.1 = zext i8 %27 to i32
+ %sub29.1 = sub nsw i32 %conv26.1, %conv28.1
+ %shl30.1 = shl nsw i32 %sub29.1, 16
+ %add31.1 = add nsw i32 %shl30.1, %sub24.1
+ %arrayidx32.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 3
+ %28 = load i8, ptr %arrayidx32.1, align 1
+ %conv33.1 = zext i8 %28 to i32
+ %arrayidx34.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 3
+ %29 = load i8, ptr %arrayidx34.1, align 1
+ %conv35.1 = zext i8 %29 to i32
+ %sub36.1 = sub nsw i32 %conv33.1, %conv35.1
+ %arrayidx37.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 7
+ %30 = load i8, ptr %arrayidx37.1, align 1
+ %conv38.1 = zext i8 %30 to i32
+ %arrayidx39.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 7
+ %31 = load i8, ptr %arrayidx39.1, align 1
+ %conv40.1 = zext i8 %31 to i32
+ %sub41.1 = sub nsw i32 %conv38.1, %conv40.1
+ %shl42.1 = shl nsw i32 %sub41.1, 16
+ %add43.1 = add nsw i32 %shl42.1, %sub36.1
+ %add44.1 = add nsw i32 %add19.1, %add.1
+ %sub45.1 = sub nsw i32 %add.1, %add19.1
+ %add46.1 = add nsw i32 %add43.1, %add31.1
+ %sub47.1 = sub nsw i32 %add31.1, %add43.1
+ %add48.1 = add nsw i32 %add46.1, %add44.1
+ %sub51.1 = sub nsw i32 %add44.1, %add46.1
+ %add55.1 = add nsw i32 %sub47.1, %sub45.1
+ %sub59.1 = sub nsw i32 %sub45.1, %sub47.1
+ %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
+ %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63
+ %32 = load i8, ptr %add.ptr.1, align 1
+ %conv.2 = zext i8 %32 to i32
+ %33 = load i8, ptr %add.ptr64.1, align 1
+ %conv2.2 = zext i8 %33 to i32
+ %sub.2 = sub nsw i32 %conv.2, %conv2.2
+ %arrayidx3.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 4
+ %34 = load i8, ptr %arrayidx3.2, align 1
+ %conv4.2 = zext i8 %34 to i32
+ %arrayidx5.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 4
+ %35 = load i8, ptr %arrayidx5.2, align 1
+ %conv6.2 = zext i8 %35 to i32
+ %sub7.2 = sub nsw i32 %conv4.2, %conv6.2
+ %shl.2 = shl nsw i32 %sub7.2, 16
+ %add.2 = add nsw i32 %shl.2, %sub.2
+ %arrayidx8.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 1
+ %36 = load i8, ptr %arrayidx8.2, align 1
+ %conv9.2 = zext i8 %36 to i32
+ %arrayidx10.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 1
+ %37 = load i8, ptr %arrayidx10.2, align 1
+ %conv11.2 = zext i8 %37 to i32
+ %sub12.2 = sub nsw i32 %conv9.2, %conv11.2
+ %arrayidx13.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 5
+ %38 = load i8, ptr %arrayidx13.2, align 1
+ %conv14.2 = zext i8 %38 to i32
+ %arrayidx15.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 5
+ %39 = load i8, ptr %arrayidx15.2, align 1
+ %conv16.2 = zext i8 %39 to i32
+ %sub17.2 = sub nsw i32 %conv14.2, %conv16.2
+ %shl18.2 = shl nsw i32 %sub17.2, 16
+ %add19.2 = add nsw i32 %shl18.2, %sub12.2
+ %arrayidx20.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 2
+ %40 = load i8, ptr %arrayidx20.2, align 1
+ %conv21.2 = zext i8 %40 to i32
+ %arrayidx22.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 2
+ %41 = load i8, ptr %arrayidx22.2, align 1
+ %conv23.2 = zext i8 %41 to i32
+ %sub24.2 = sub nsw i32 %conv21.2, %conv23.2
+ %arrayidx25.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 6
+ %42 = load i8, ptr %arrayidx25.2, align 1
+ %conv26.2 = zext i8 %42 to i32
+ %arrayidx27.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 6
+ %43 = load i8, ptr %arrayidx27.2, align 1
+ %conv28.2 = zext i8 %43 to i32
+ %sub29.2 = sub nsw i32 %conv26.2, %conv28.2
+ %shl30.2 = shl nsw i32 %sub29.2, 16
+ %add31.2 = add nsw i32 %shl30.2, %sub24.2
+ %arrayidx32.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 3
+ %44 = load i8, ptr %arrayidx32.2, align 1
+ %conv33.2 = zext i8 %44 to i32
+ %arrayidx34.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 3
+ %45 = load i8, ptr %arrayidx34.2, align 1
+ %conv35.2 = zext i8 %45 to i32
+ %sub36.2 = sub nsw i32 %conv33.2, %conv35.2
+ %arrayidx37.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 7
+ %46 = load i8, ptr %arrayidx37.2, align 1
+ %conv38.2 = zext i8 %46 to i32
+ %arrayidx39.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 7
+ %47 = load i8, ptr %arrayidx39.2, align 1
+ %conv40.2 = zext i8 %47 to i32
+ %sub41.2 = sub nsw i32 %conv38.2, %conv40.2
+ %shl42.2 = shl nsw i32 %sub41.2, 16
+ %add43.2 = add nsw i32 %shl42.2, %sub36.2
+ %add44.2 = add nsw i32 %add19.2, %add.2
+ %sub45.2 = sub nsw i32 %add.2, %add19.2
+ %add46.2 = add nsw i32 %add43.2, %add31.2
+ %sub47.2 = sub nsw i32 %add31.2, %add43.2
+ %add48.2 = add nsw i32 %add46.2, %add44.2
+ %sub51.2 = sub nsw i32 %add44.2, %add46.2
+ %add55.2 = add nsw i32 %sub47.2, %sub45.2
+ %sub59.2 = sub nsw i32 %sub45.2, %sub47.2
+ %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
+ %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63
+ %48 = load i8, ptr %add.ptr.2, align 1
+ %conv.3 = zext i8 %48 to i32
+ %49 = load i8, ptr %add.ptr64.2, align 1
+ %conv2.3 = zext i8 %49 to i32
+ %sub.3 = sub nsw i32 %conv.3, %conv2.3
+ %arrayidx3.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 4
+ %50 = load i8, ptr %arrayidx3.3, align 1
+ %conv4.3 = zext i8 %50 to i32
+ %arrayidx5.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 4
+ %51 = load i8, ptr %arrayidx5.3, align 1
+ %conv6.3 = zext i8 %51 to i32
+ %sub7.3 = sub nsw i32 %conv4.3, %conv6.3
+ %shl.3 = shl nsw i32 %sub7.3, 16
+ %add.3 = add nsw i32 %shl.3, %sub.3
+ %arrayidx8.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 1
+ %52 = load i8, ptr %arrayidx8.3, align 1
+ %conv9.3 = zext i8 %52 to i32
+ %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 1
+ %53 = load i8, ptr %arrayidx10.3, align 1
+ %conv11.3 = zext i8 %53 to i32
+ %sub12.3 = sub nsw i32 %conv9.3, %conv11.3
+ %arrayidx13.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 5
+ %54 = load i8, ptr %arrayidx13.3, align 1
+ %conv14.3 = zext i8 %54 to i32
+ %arrayidx15.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 5
+ %55 = load i8, ptr %arrayidx15.3, align 1
+ %conv16.3 = zext i8 %55 to i32
+ %sub17.3 = sub nsw i32 %conv14.3, %conv16.3
+ %shl18.3 = shl nsw i32 %sub17.3, 16
+ %add19.3 = add nsw i32 %shl18.3, %sub12.3
+ %arrayidx20.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 2
+ %56 = load i8, ptr %arrayidx20.3, align 1
+ %conv21.3 = zext i8 %56 to i32
+ %arrayidx22.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 2
+ %57 = load i8, ptr %arrayidx22.3, align 1
+ %conv23.3 = zext i8 %57 to i32
+ %sub24.3 = sub nsw i32 %conv21.3, %conv23.3
+ %arrayidx25.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 6
+ %58 = load i8, ptr %arrayidx25.3, align 1
+ %conv26.3 = zext i8 %58 to i32
+ %arrayidx27.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 6
+ %59 = load i8, ptr %arrayidx27.3, align 1
+ %conv28.3 = zext i8 %59 to i32
+ %sub29.3 = sub nsw i32 %conv26.3, %conv28.3
+ %shl30.3 = shl nsw i32 %sub29.3, 16
+ %add31.3 = add nsw i32 %shl30.3, %sub24.3
+ %arrayidx32.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 3
+ %60 = load i8, ptr %arrayidx32.3, align 1
+ %conv33.3 = zext i8 %60 to i32
+ %arrayidx34.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 3
+ %61 = load i8, ptr %arrayidx34.3, align 1
+ %conv35.3 = zext i8 %61 to i32
+ %sub36.3 = sub nsw i32 %conv33.3, %conv35.3
+ %arrayidx37.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 7
+ %62 = load i8, ptr %arrayidx37.3, align 1
+ %conv38.3 = zext i8 %62 to i32
+ %arrayidx39.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 7
+ %63 = load i8, ptr %arrayidx39.3, align 1
+ %conv40.3 = zext i8 %63 to i32
+ %sub41.3 = sub nsw i32 %conv38.3, %conv40.3
+ %shl42.3 = shl nsw i32 %sub41.3, 16
+ %add43.3 = add nsw i32 %shl42.3, %sub36.3
+ %add44.3 = add nsw i32 %add19.3, %add.3
+ %sub45.3 = sub nsw i32 %add.3, %add19.3
+ %add46.3 = add nsw i32 %add43.3, %add31.3
+ %sub47.3 = sub nsw i32 %add31.3, %add43.3
+ %add48.3 = add nsw i32 %add46.3, %add44.3
+ %sub51.3 = sub nsw i32 %add44.3, %add46.3
+ %add55.3 = add nsw i32 %sub47.3, %sub45.3
+ %sub59.3 = sub nsw i32 %sub45.3, %sub47.3
+ %add78 = add nsw i32 %add48.1, %add48
+ %sub86 = sub nsw i32 %add48, %add48.1
+ %add94 = add nsw i32 %add48.3, %add48.2
+ %sub102 = sub nsw i32 %add48.2, %add48.3
+ %add103 = add nsw i32 %add94, %add78
+ %sub104 = sub nsw i32 %add78, %add94
+ %add105 = add nsw i32 %sub102, %sub86
+ %sub106 = sub nsw i32 %sub86, %sub102
+ %shr.i = lshr i32 %add103, 15
+ %and.i = and i32 %shr.i, 65537
+ %mul.i = mul nuw i32 %and.i, 65535
+ %add.i = add i32 %mul.i, %add103
+ %xor.i = xor i32 %add.i, %mul.i
+ %shr.i169 = lshr i32 %add105, 15
+ %and.i170 = and i32 %shr.i169, 65537
+ %mul.i171 = mul nuw i32 %and.i170, 65535
+ %add.i172 = add i32 %mul.i171, %add105
+ %xor.i173 = xor i32 %add.i172, %mul.i171
+ %shr.i174 = lshr i32 %sub104, 15
+ %and.i175 = and i32 %shr.i174, 65537
+ %mul.i176 = mul nuw i32 %and.i175, 65535
+ %add.i177 = add i32 %mul.i176, %sub104
+ %xor.i178 = xor i32 %add.i177, %mul.i176
+ %shr.i179 = lshr i32 %sub106, 15
+ %and.i180 = and i32 %shr.i179, 65537
+ %mul.i181 = mul nuw i32 %and.i180, 65535
+ %add.i182 = add i32 %mul.i181, %sub106
+ %xor.i183 = xor i32 %add.i182, %mul.i181
+ %add110 = add i32 %xor.i173, %xor.i
+ %add112 = add i32 %add110, %xor.i178
+ %add113 = add i32 %add112, %xor.i183
+ %add78.1 = add nsw i32 %add55.1, %add55
+ %sub86.1 = sub nsw i32 %add55, %add55.1
+ %add94.1 = add nsw i32 %add55.3, %add55.2
+ %sub102.1 = sub nsw i32 %add55.2, %add55.3
+ %add103.1 = add nsw i32 %add94.1, %add78.1
+ %sub104.1 = sub nsw i32 %add78.1, %add94.1
+ %add105.1 = add nsw i32 %sub102.1, %sub86.1
+ %sub106.1 = sub nsw i32 %sub86.1, %sub102.1
+ %shr.i.1 = lshr i32 %add103.1, 15
+ %and.i.1 = and i32 %shr.i.1, 65537
+ %mul.i.1 = mul nuw i32 %and.i.1, 65535
+ %add.i.1 = add i32 %mul.i.1, %add103.1
+ %xor.i.1 = xor i32 %add.i.1, %mul.i.1
+ %shr.i169.1 = lshr i32 %add105.1, 15
+ %and.i170.1 = and i32 %shr.i169.1, 65537
+ %mul.i171.1 = mul nuw i32 %and.i170.1, 65535
+ %add.i172.1 = add i32 %mul.i171.1, %add105.1
+ %xor.i173.1 = xor i32 %add.i172.1, %mul.i171.1
+ %shr.i174.1 = lshr i32 %sub104.1, 15
+ %and.i175.1 = and i32 %shr.i174.1, 65537
+ %mul.i176.1 = mul nuw i32 %and.i175.1, 65535
+ %add.i177.1 = add i32 %mul.i176.1, %sub104.1
+ %xor.i178.1 = xor i32 %add.i177.1, %mul.i176.1
+ %shr.i179.1 = lshr i32 %sub106.1, 15
+ %and.i180.1 = and i32 %shr.i179.1, 65537
+ %mul.i181.1 = mul nuw i32 %and.i180.1, 65535
+ %add.i182.1 = add i32 %mul.i181.1, %sub106.1
+ %xor.i183.1 = xor i32 %add.i182.1, %mul.i181.1
+ %add108.1 = add i32 %xor.i173.1, %add113
+ %add110.1 = add i32 %add108.1, %xor.i.1
+ %add112.1 = add i32 %add110.1, %xor.i178.1
+ %add113.1 = add i32 %add112.1, %xor.i183.1
+ %add78.2 = add nsw i32 %sub51.1, %sub51
+ %sub86.2 = sub nsw i32 %sub51, %sub51.1
+ %add94.2 = add nsw i32 %sub51.3, %sub51.2
+ %sub102.2 = sub nsw i32 %sub51.2, %sub51.3
+ %add103.2 = add nsw i32 %add94.2, %add78.2
+ %sub104.2 = sub nsw i32 %add78.2, %add94.2
+ %add105.2 = add nsw i32 %sub102.2, %sub86.2
+ %sub106.2 = sub nsw i32 %sub86.2, %sub102.2
+ %shr.i.2 = lshr i32 %add103.2, 15
+ %and.i.2 = and i32 %shr.i.2, 65537
+ %mul.i.2 = mul nuw i32 %and.i.2, 65535
+ %add.i.2 = add i32 %mul.i.2, %add103.2
+ %xor.i.2 = xor i32 %add.i.2, %mul.i.2
+ %shr.i169.2 = lshr i32 %add105.2, 15
+ %and.i170.2 = and i32 %shr.i169.2, 65537
+ %mul.i171.2 = mul nuw i32 %and.i170.2, 65535
+ %add.i172.2 = add i32 %mul.i171.2, %add105.2
+ %xor.i173.2 = xor i32 %add.i172.2, %mul.i171.2
+ %shr.i174.2 = lshr i32 %sub104.2, 15
+ %and.i175.2 = and i32 %shr.i174.2, 65537
+ %mul.i176.2 = mul nuw i32 %and.i175.2, 65535
+ %add.i177.2 = add i32 %mul.i176.2, %sub104.2
+ %xor.i178.2 = xor i32 %add.i177.2, %mul.i176.2
+ %shr.i179.2 = lshr i32 %sub106.2, 15
+ %and.i180.2 = and i32 %shr.i179.2, 65537
+ %mul.i181.2 = mul nuw i32 %and.i180.2, 65535
+ %add.i182.2 = add i32 %mul.i181.2, %sub106.2
+ %xor.i183.2 = xor i32 %add.i182.2, %mul.i181.2
+ %add108.2 = add i32 %xor.i173.2, %add113.1
+ %add110.2 = add i32 %add108.2, %xor.i.2
+ %add112.2 = add i32 %add110.2, %xor.i178.2
+ %add113.2 = add i32 %add112.2, %xor.i183.2
+ %add78.3 = add nsw i32 %sub59.1, %sub59
+ %sub86.3 = sub nsw i32 %sub59, %sub59.1
+ %add94.3 = add nsw i32 %sub59.3, %sub59.2
+ %sub102.3 = sub nsw i32 %sub59.2, %sub59.3
+ %add103.3 = add nsw i32 %add94.3, %add78.3
+ %sub104.3 = sub nsw i32 %add78.3, %add94.3
+ %add105.3 = add nsw i32 %sub102.3, %sub86.3
+ %sub106.3 = sub nsw i32 %sub86.3, %sub102.3
+ %shr.i.3 = lshr i32 %add103.3, 15
+ %and.i.3 = and i32 %shr.i.3, 65537
+ %mul.i.3 = mul nuw i32 %and.i.3, 65535
+ %add.i.3 = add i32 %mul.i.3, %add103.3
+ %xor.i.3 = xor i32 %add.i.3, %mul.i.3
+ %shr.i169.3 = lshr i32 %add105.3, 15
+ %and.i170.3 = and i32 %shr.i169.3, 65537
+ %mul.i171.3 = mul nuw i32 %and.i170.3, 65535
+ %add.i172.3 = add i32 %mul.i171.3, %add105.3
+ %xor.i173.3 = xor i32 %add.i172.3, %mul.i171.3
+ %shr.i174.3 = lshr i32 %sub104.3, 15
+ %and.i175.3 = and i32 %shr.i174.3, 65537
+ %mul.i176.3 = mul nuw i32 %and.i175.3, 65535
+ %add.i177.3 = add i32 %mul.i176.3, %sub104.3
+ %xor.i178.3 = xor i32 %add.i177.3, %mul.i176.3
+ %shr.i179.3 = lshr i32 %sub106.3, 15
+ %and.i180.3 = and i32 %shr.i179.3, 65537
+ %mul.i181.3 = mul nuw i32 %and.i180.3, 65535
+ %add.i182.3 = add i32 %mul.i181.3, %sub106.3
+ %xor.i183.3 = xor i32 %add.i182.3, %mul.i181.3
+ %add108.3 = add i32 %xor.i173.3, %add113.2
+ %add110.3 = add i32 %add108.3, %xor.i.3
+ %add112.3 = add i32 %add110.3, %xor.i178.3
+ %add113.3 = add i32 %add112.3, %xor.i183.3
+ %conv118 = and i32 %add113.3, 65535
+ %shr = lshr i32 %add113.3, 16
+ %add119 = add nuw nsw i32 %conv118, %shr
+ %shr120 = lshr i32 %add119, 1
+ ret i32 %shr120
+}
From 6c9892685b718722ae830ad25ec693529547936d Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:21:30 -0700
Subject: [PATCH 02/13] updated basic-strided-loads.ll
---
.../RISCV/basic-strided-loads.ll | 20 +++----------------
1 file changed, 3 insertions(+), 17 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 645dbc49269f0..0135d3c01d9f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -630,25 +630,11 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
; CHECK-LABEL: define void @rt_stride_widen_no_reordering(
; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[OFFSET0:%.*]] = mul nsw i64 [[STRIDE]], 0
-; CHECK-NEXT: [[OFFSET4:%.*]] = mul nsw i64 [[STRIDE]], 1
-; CHECK-NEXT: [[OFFSET8:%.*]] = mul nsw i64 [[STRIDE]], 2
-; CHECK-NEXT: [[OFFSET12:%.*]] = mul nsw i64 [[STRIDE]], 3
; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET4]]
-; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]]
-; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]]
; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 16
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
; CHECK-NEXT: ret void
;
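The CHECK diff above shows the intended effect of the widening: four `<4 x i8>` loads at offsets 0, 1, 2 and 3 times the runtime stride, together with the shuffle tree that concatenated them, collapse into a single `llvm.experimental.vp.strided.load` of `<4 x i32>` that is bitcast back to `<16 x i8>`. Each group of four consecutive i8 elements becomes one i32 lane, and since the elements are bytes the byte stride of the new load is simply `mul i64 %stride, 1`. A schematic of the type arithmetic, with illustrative names:

```
// Schematic only: how a group width w folds narrow elements into wide lanes.
unsigned NumLoads = 16;  // total scalar i8 loads in the bundle
unsigned GroupSize = 4;  // w: consecutive elements per stride step
unsigned EltBits = 8;    // i8
unsigned WideBits = EltBits * GroupSize;   // 32 -> lane type i32
unsigned WideLanes = NumLoads / GroupSize; // 4  -> <4 x i32>
```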
>From 0f77913b328b550bd3d493a4a6d878a159abb9b1 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 10:41:51 -0700
Subject: [PATCH 03/13] did a "todo".
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 22ed2faee9916..8c84129cd46a2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13226,12 +13226,10 @@ void BoUpSLP::transformNodes() {
if (StridedCost < OriginalVecCost) {
// Strided load is more profitable than consecutive load + reverse -
// transform the node to strided load.
- // TODO: StrideTy =
- // DL->getIndexType(E.scalars.front()->getPointerOperand()->getType());
+ Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+ ->getPointerOperand()
+ ->getType());
StridedPtrInfo SPtrInfo;
- Type *StrideTy = Type::getIntNTy(
- SE->getContext(),
- DL->getTypeStoreSizeInBits(E.Scalars.front()->getType()));
SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
SPtrInfo.Ty = VecTy;
TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
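The TODO fix swaps a stride type derived from the loaded element's store size for DataLayout::getIndexType of the pointer type, i.e. the integer type used for address arithmetic in that address space. A minimal sketch, assuming a DataLayout `DL` and a `LoadInst *LI` in scope:

```
// Minimal sketch, assuming DL (DataLayout) and LI (LoadInst *) in scope.
Type *PtrTy = LI->getPointerOperand()->getType();
Type *StrideTy = DL.getIndexType(PtrTy);       // e.g. i64 for 64-bit RISC-V
Constant *One = ConstantInt::get(StrideTy, 1); // unit stride
```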
>From 8421a230fe6afd8dd834f9d1ac98304d9e86b9f0 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Tue, 12 Aug 2025 12:22:28 -0700
Subject: [PATCH 04/13] addressed review comments.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 112 +++++++++---------
1 file changed, 53 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8c84129cd46a2..7a1d8e1546a15 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1931,7 +1931,7 @@ class BoUpSLP {
// strided load.
FixedVectorType *Ty = nullptr;
};
- DenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
+ SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
public:
/// Tracks the state we can represent the loads in the given sequence.
@@ -2233,12 +2233,12 @@ class BoUpSLP {
bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
- StridedPtrInfo *SPtrInfo) const;
+ StridedPtrInfo &SPtrInfo) const;
bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
- StridedPtrInfo *SPtrInfo, int64_t Diff,
+ StridedPtrInfo &SPtrInfo, int64_t Diff,
Value *Ptr0, Value *PtrN) const;
/// Checks if the given array of loads can be represented as a vectorized,
@@ -4502,7 +4502,7 @@ class BoUpSLP {
TreeEntry::EntryState getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo);
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -6481,7 +6481,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
- StridedPtrInfo *SPtrInfo) const {
+ StridedPtrInfo &SPtrInfo) const {
// Group the pointers by constant offset.
DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
OffsetToPointerOpIdxMap;
@@ -6491,15 +6491,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
return false;
const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
+ if (!Add)
+ return false;
int64_t Offset = 0;
- if (Add) {
- for (int i = 0; i < (int)Add->getNumOperands(); ++i) {
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(i));
- if (!SC)
- continue;
- Offset = (int64_t)(SC->getAPInt().getLimitedValue());
- break;
- }
+ for (int I : seq<int>(Add->getNumOperands())) {
+ auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+ if (!SC)
+ continue;
+ Offset = SC->getAPInt().getSExtValue();
+ break;
}
OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
@@ -6522,17 +6522,17 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
!TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
return false;
- SmallVector<int> SortedOffsetsV;
+ SmallVector<int64_t> SortedOffsetsV;
for (auto [K, V] : OffsetToPointerOpIdxMap) {
SortedOffsetsV.push_back(K);
}
- llvm::sort(SortedOffsetsV);
+ sort(SortedOffsetsV);
if (NumOffsets > 1) {
- int CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
- if ((CommonDiff) != 1)
+ int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
+ if (CommonDiff != 1)
return false;
- for (int i = 1; i < (int)SortedOffsetsV.size() - 1; ++i) {
- if (SortedOffsetsV[i + 1] - SortedOffsetsV[i] != CommonDiff)
+ for (int I : seq<int>(1, SortedOffsetsV.size() - 1)) {
+ if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff)
return false;
}
}
@@ -6552,31 +6552,29 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
unsigned NumCoeffs0 = Coeffs0.size();
if (NumCoeffs0 * NumOffsets != Sz)
return false;
- llvm::sort(Coeffs0);
+ sort(Coeffs0);
SmallVector<unsigned> SortedIndicesDraft;
SortedIndicesDraft.resize(Sz);
- auto updateSortedIndices =
+ auto UpdateSortedIndices =
[&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
SmallVectorImpl<unsigned> &IndicesInAllPointerOps,
int64_t OffsetNum) {
- unsigned Num = 0;
- for (unsigned Idx : SortedIndicesForOffset) {
+ for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
IndicesInAllPointerOps[Idx];
- ++Num;
}
};
- updateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
+ UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
SmallVector<int64_t> Coeffs;
SmallVector<unsigned> SortedIndicesForOffset;
- for (int i = 1; i < NumOffsets; ++i) {
+ for (int I : seq<int>(1, NumOffsets)) {
Coeffs.clear();
SortedIndicesForOffset.clear();
- int64_t Offset = SortedOffsetsV[i];
+ int64_t Offset = SortedOffsetsV[I];
SmallVector<Value *> &PointerOpsForOffset =
OffsetToPointerOpIdxMap[Offset].first;
SmallVector<unsigned> &IndicesInAllPointerOps =
@@ -6584,26 +6582,23 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
const SCEV *StrideWithinGroup = calculateRtStride(
PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
- if ((!StrideWithinGroup) || StrideWithinGroup != Stride0) {
+ if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
return false;
- }
if (Coeffs.size() != NumCoeffs0)
return false;
- llvm::sort(Coeffs);
- for (unsigned i = 0; i < NumCoeffs0; ++i) {
- if (Coeffs[i] != Coeffs0[i])
+ sort(Coeffs);
+ for (int J : seq<int>(0, NumCoeffs0)) {
+ if (Coeffs[J] != Coeffs0[J])
return false;
}
- updateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, i);
+ UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
}
SortedIndices.clear();
SortedIndices = SortedIndicesDraft;
- if (SPtrInfo) {
- SPtrInfo->StrideSCEV = Stride0;
- SPtrInfo->Ty = StridedLoadTy;
- }
+ SPtrInfo.StrideSCEV = Stride0;
+ SPtrInfo.Ty = StridedLoadTy;
return true;
}
@@ -6937,20 +6932,21 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
// Same as analyzeRtStrideCandidate, but for constant strides.
bool BoUpSLP::analyzeConstantStrideCandidate(
ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
- SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
+ SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
int64_t Diff, Value *Ptr0, Value *PtrN) const {
const unsigned Sz = PointerOps.size();
SmallVector<int64_t> SortedOffsetsFromBase;
SortedOffsetsFromBase.resize(Sz);
- for (unsigned i = 0; i < Sz; ++i) {
+ for (unsigned I : seq<int>(0, Sz)) {
Value *Ptr =
- SortedIndices.empty() ? PointerOps[i] : PointerOps[SortedIndices[i]];
- SortedOffsetsFromBase[i] =
+ SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
+ SortedOffsetsFromBase[I] =
*getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE);
}
// Find where the first group ends.
- assert(SortedOffsetsFromBase.size() > 1);
+ assert(SortedOffsetsFromBase.size() > 1 &&
+ "Trying to generate strided load for less than 2 loads");
int64_t StrideWithinGroup =
SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
unsigned GroupSize = 1;
@@ -6997,7 +6993,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
if (CurrentGroupStartIdx != Sz)
return false;
- auto checkGroup = [&](unsigned StartIdx, unsigned GroupSize0,
+ auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0,
int64_t StrideWithinGroup) -> bool {
unsigned GroupEndIdx = StartIdx + 1;
for (; GroupEndIdx != Sz; ++GroupEndIdx) {
@@ -7006,10 +7002,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
StrideWithinGroup)
break;
}
- return (GroupEndIdx - StartIdx == GroupSize0);
+ return GroupEndIdx - StartIdx == GroupSize0;
};
- for (unsigned i = 0; i < Sz; i += GroupSize) {
- if (!checkGroup(i, GroupSize, StrideWithinGroup))
+ for (unsigned I = 0; I < Sz; I += GroupSize) {
+ if (!CheckGroup(I, GroupSize, StrideWithinGroup))
return false;
}
}
@@ -7047,11 +7043,9 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
has_single_bit(AbsoluteDiff))) &&
AbsoluteDiff > VecSz) ||
Diff == -(static_cast<int>(VecSz) - 1)) {
- if (SPtrInfo) {
- Type *StrideTy = DL->getIndexType(Ptr0->getType());
- SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
- SPtrInfo->Ty = StridedLoadTy;
- }
+ Type *StrideTy = DL->getIndexType(Ptr0->getType());
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+ SPtrInfo.Ty = StridedLoadTy;
return true;
}
return false;
@@ -7099,7 +7093,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
if (!IsSorted) {
if (Sz > MinProfitableStridedLoads &&
analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
- SPtrInfo))
+ *SPtrInfo))
return LoadsState::StridedVectorize;
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7134,7 +7128,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
return LoadsState::CompressVectorize;
// Simple check if not a strided access - clear order.
if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
- Order, SPtrInfo, *Diff, Ptr0, PtrN))
+ Order, *SPtrInfo, *Diff, Ptr0, PtrN))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -10004,7 +9998,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo *SPtrInfo) {
+ SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
assert(S.getMainOp() &&
"Expected instructions with same/alternate opcodes only.");
@@ -10106,7 +10100,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
});
});
};
- switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
+ switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, &SPtrInfo)) {
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
case LoadsState::CompressVectorize:
@@ -11567,7 +11561,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
SmallVector<Value *> PointerOps;
StridedPtrInfo SPtrInfo;
TreeEntry::EntryState State = getScalarsVectorizationState(
- S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, &SPtrInfo);
+ S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
if (State == TreeEntry::NeedToGather) {
newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
return;
@@ -19733,16 +19727,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Type *StrideTy = DL->getIndexType(PO->getType());
Value *StrideVal;
- StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap[E];
+ const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
StridedLoadTy = SPtrInfo.Ty;
- assert(StridedLoadTy);
+ assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
unsigned StridedLoadEC =
StridedLoadTy->getElementCount().getKnownMinValue();
Value *Stride = SPtrInfo.StrideVal;
if (!Stride) {
const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
- assert(StrideSCEV);
+ assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
SCEVExpander Expander(*SE, *DL, "strided-load-vec");
Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
&*Builder.GetInsertPoint());
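On the constant-stride path, the CheckGroup lambda introduced above verifies that the offsets from the base pointer split into equal-length runs sharing one within-run stride. A standalone sketch of the same check in plain C++, with illustrative data rather than the LLVM implementation (it assumes, as the real code guarantees, that the element count is a multiple of the group size):

```
#include <cstdio>
#include <vector>

// Offsets must split into runs of GroupSize elements whose neighbors all
// differ by StrideWithinGroup (1 for consecutive elements).
static bool checkGroups(const std::vector<long> &Offs, unsigned GroupSize,
                        long StrideWithinGroup) {
  for (unsigned Start = 0; Start < Offs.size(); Start += GroupSize)
    for (unsigned I = Start + 1; I < Start + GroupSize; ++I)
      if (Offs[I] - Offs[I - 1] != StrideWithinGroup)
        return false;
  return true;
}

int main() {
  // 3 groups of width 4 with a row stride of 100 bytes.
  std::vector<long> Offs = {0, 1, 2, 3, 100, 101, 102, 103,
                            200, 201, 202, 203};
  std::printf("%d\n", checkGroups(Offs, 4, 1)); // prints 1
}
```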
>From 6cadde26217b7388bd444e5f27a4943c46f810df Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:26:58 -0700
Subject: [PATCH 05/13] address review comments.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 34 ++++++++++++++-----
1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7a1d8e1546a15..38f1e7b4ce7f3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2230,11 +2230,29 @@ class BoUpSLP {
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
+ // Suppose we are given pointers of the form: %b + x * %s + y * %c
+ // where %c is constant. Check if the pointers can be rearranged as follows:
+ // %b + 0 * %s + 0
+ // %b + 0 * %s + 1
+ // %b + 0 * %s + 2
+ // ...
+ // %b + 0 * %s + (w - 1)
+ //
+ // %b + 1 * %s + 0
+ // %b + 1 * %s + 1
+ // %b + 1 * %s + 2
+ // ...
+ // %b + 1 * %s + (w - 1)
+ // ...
+ //
+ // If the pointers can be rearranged in the above pattern, it means that the
+ // memory can be accessed with strided loads of width `w` and stride `%s`.
bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
StridedPtrInfo &SPtrInfo) const;
+ // Same as analyzeRtStrideCandidate, but for constant strides.
bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
@@ -6523,9 +6541,8 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
return false;
SmallVector<int64_t> SortedOffsetsV;
- for (auto [K, V] : OffsetToPointerOpIdxMap) {
+ for (auto [K, V] : OffsetToPointerOpIdxMap)
SortedOffsetsV.push_back(K);
- }
sort(SortedOffsetsV);
if (NumOffsets > 1) {
int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0];
@@ -6582,17 +6599,16 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
const SCEV *StrideWithinGroup = calculateRtStride(
PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
- if ((!StrideWithinGroup) || StrideWithinGroup != Stride0)
+ if (!StrideWithinGroup || StrideWithinGroup != Stride0)
return false;
if (Coeffs.size() != NumCoeffs0)
return false;
sort(Coeffs);
- for (int J : seq<int>(0, NumCoeffs0)) {
- if (Coeffs[J] != Coeffs0[J])
- return false;
- }
+ if (Coeffs != Coeffs0)
+ return false
- UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
+ UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
+ I);
}
SortedIndices.clear();
@@ -6937,7 +6953,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
const unsigned Sz = PointerOps.size();
SmallVector<int64_t> SortedOffsetsFromBase;
SortedOffsetsFromBase.resize(Sz);
- for (unsigned I : seq<int>(0, Sz)) {
+ for (unsigned I : seq<int>(Sz)) {
Value *Ptr =
SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
SortedOffsetsFromBase[I] =
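The comment added above describes the shape analyzeRtStrideCandidate accepts; concretely, the pointers are bucketed by constant offset, each bucket is required to have the same runtime stride and the same sorted coefficient set, and the pointer with stride coefficient c and constant offset o ends up at position c * NumOffsets + o of the final order. A standalone sketch of that reordering in plain C++ (illustrative data, not the LLVM code):

```
#include <algorithm>
#include <cstdio>
#include <map>
#include <vector>

struct Ptr { long Coeff, Offset; unsigned IdxInInput; };

int main() {
  // Jumbled pointers %b + Coeff * %s + Offset for a width-3, 2-step pattern.
  std::vector<Ptr> Ptrs = {{1, 2, 0}, {0, 0, 1}, {1, 0, 2},
                           {0, 2, 3}, {1, 1, 4}, {0, 1, 5}};
  std::map<long, std::vector<Ptr>> Buckets; // constant offset -> pointers
  for (const Ptr &P : Ptrs)
    Buckets[P.Offset].push_back(P);
  const unsigned NumOffsets = Buckets.size();
  std::vector<unsigned> SortedIndices(Ptrs.size());
  unsigned O = 0;
  // Offsets ascend; the real code checks they form the run 0, 1, ..., w-1.
  for (auto &KV : Buckets) {
    std::vector<Ptr> &Vec = KV.second;
    std::sort(Vec.begin(), Vec.end(),
              [](const Ptr &A, const Ptr &B) { return A.Coeff < B.Coeff; });
    for (unsigned C = 0; C < Vec.size(); ++C)
      SortedIndices[C * NumOffsets + O] = Vec[C].IdxInInput;
    ++O;
  }
  for (unsigned I : SortedIndices)
    std::printf("%u ", I); // 1 5 3 2 4 0: width-3 runs per stride step
}
```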
>From cc912e8168eba7cc74178a20df7e374b70169b22 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 08:56:01 -0700
Subject: [PATCH 06/13] added cost estimation
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 38f1e7b4ce7f3..3b2da58739652 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6605,7 +6605,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
return false;
sort(Coeffs);
if (Coeffs != Coeffs0)
- return false
+ return false;
UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
I);
@@ -15028,11 +15028,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
break;
case TreeEntry::StridedVectorize: {
+ const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+ FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
+ assert(StridedLoadTy && "Missing StridedPtrInfo for tree entry.");
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getStridedMemoryOpCost(
- Instruction::Load, VecTy, LI0->getPointerOperand(),
+ Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind);
+ if (StridedLoadTy != VecTy)
+ VecLdCost +=
+ TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+ getCastContextHint(*E), CostKind);
+
break;
}
case TreeEntry::CompressVectorize: {
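With the widening in place, the node's cost is modeled as one strided load of the widened type plus, when that type differs from the tree's vector type, the bitcast back. Note that getCastInstrCost takes the destination type before the source type, so the operands as committed here are swapped; patch 12 below corrects that. The intended final shape of the computation:

```
// Intended final shape (combining patches 06 and 12).
VecLdCost = TTI->getStridedMemoryOpCost(
    Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
    /*VariableMask=*/false, CommonAlignment, CostKind);
if (StridedLoadTy != VecTy) // the widened form needs a bitcast back to VecTy
  VecLdCost += TTI->getCastInstrCost(Instruction::BitCast, VecTy,
                                     StridedLoadTy, getCastContextHint(*E),
                                     CostKind);
```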
>From 62677b07064ae30bb65a9f7f4083a25a29c4ac5a Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 09:13:25 -0700
Subject: [PATCH 07/13] format
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3b2da58739652..896d7f4f83140 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6607,8 +6607,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
if (Coeffs != Coeffs0)
return false;
- UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps,
- I);
+ UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I);
}
SortedIndices.clear();
>From e910560696eb00004cc73cb41ed932eeb49853b7 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:43:09 -0700
Subject: [PATCH 08/13] change references back to pointers because
canVectorizeLoads can be called with nullptr
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 24 +++++++++++--------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 896d7f4f83140..ab3aaffdcbff1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2250,13 +2250,13 @@ class BoUpSLP {
bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
- StridedPtrInfo &SPtrInfo) const;
+ StridedPtrInfo *SPtrInfo) const;
// Same as analyzeRtStrideCandidate, but for constant strides.
bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
- StridedPtrInfo &SPtrInfo, int64_t Diff,
+ StridedPtrInfo *SPtrInfo, int64_t Diff,
Value *Ptr0, Value *PtrN) const;
/// Checks if the given array of loads can be represented as a vectorized,
@@ -6499,7 +6499,7 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
- StridedPtrInfo &SPtrInfo) const {
+ StridedPtrInfo *SPtrInfo) const {
// Group the pointers by constant offset.
DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
OffsetToPointerOpIdxMap;
@@ -6612,8 +6612,10 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
SortedIndices.clear();
SortedIndices = SortedIndicesDraft;
- SPtrInfo.StrideSCEV = Stride0;
- SPtrInfo.Ty = StridedLoadTy;
+ if (SPtrInfo) {
+ SPtrInfo->StrideSCEV = Stride0;
+ SPtrInfo->Ty = StridedLoadTy;
+ }
return true;
}
@@ -6947,7 +6949,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
// Same as analyzeRtStrideCandidate, but for constant strides.
bool BoUpSLP::analyzeConstantStrideCandidate(
ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
- SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo &SPtrInfo,
+ SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
int64_t Diff, Value *Ptr0, Value *PtrN) const {
const unsigned Sz = PointerOps.size();
SmallVector<int64_t> SortedOffsetsFromBase;
@@ -7059,8 +7061,10 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
AbsoluteDiff > VecSz) ||
Diff == -(static_cast<int>(VecSz) - 1)) {
Type *StrideTy = DL->getIndexType(Ptr0->getType());
- SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
- SPtrInfo.Ty = StridedLoadTy;
+ if (SPtrInfo) {
+ SPtrInfo->StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
+ SPtrInfo->Ty = StridedLoadTy;
+ }
return true;
}
return false;
@@ -7108,7 +7112,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
if (!IsSorted) {
if (Sz > MinProfitableStridedLoads &&
analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
- *SPtrInfo))
+ SPtrInfo))
return LoadsState::StridedVectorize;
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
@@ -7143,7 +7147,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
return LoadsState::CompressVectorize;
// Simple check if not a strided access - clear order.
if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
- Order, *SPtrInfo, *Diff, Ptr0, PtrN))
+ Order, SPtrInfo, *Diff, Ptr0, PtrN))
return LoadsState::StridedVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
>From 44eb8f236c749ab958282d24f3eb9cd6fc549920 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 11:52:28 -0700
Subject: [PATCH 09/13] three slashes for comments, deleted wrong comment.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 99 ++++++++++---------
1 file changed, 50 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ab3aaffdcbff1..a36c66d793b26 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1918,17 +1918,15 @@ class BoUpSLP {
class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
- // If we decide to generate strided load / store, this struct contains all the
- // necessary info. It's fields are calculated by analyzeRtStrideCandidate and
- // analyzeConstantStrideCandidate. Note that Stride can be given either as a
- // SCEV or as a Value if it already exists.
- // To get the stride in bytes, StrideVal (or value obtained from StrideSCEV)
- // has to by multiplied by the size of element of FixedVectorType.
+ /// If we decide to generate strided load / store, this struct contains all
+ /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
+ /// and analyzeConstantStrideCandidate. Note that Stride can be given either
+ /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
+ /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
+ /// the element size of the FixedVectorType.
struct StridedPtrInfo {
Value *StrideVal = nullptr;
const SCEV *StrideSCEV = nullptr;
- // Represents the ammount which needs to be added to the base pointer of
- // strided load.
FixedVectorType *Ty = nullptr;
};
SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
@@ -2230,29 +2228,29 @@ class BoUpSLP {
/// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
- // Suppose we are given pointers of the form: %b + x * %s + y * %c
- // where %c is constant. Check if the pointers can be rearranged as follows:
- // %b + 0 * %s + 0
- // %b + 0 * %s + 1
- // %b + 0 * %s + 2
- // ...
- // %b + 0 * %s + (w - 1)
- //
- // %b + 1 * %s + 0
- // %b + 1 * %s + 1
- // %b + 1 * %s + 2
- // ...
- // %b + 1 * %s + (w - 1)
- // ...
- //
- // If the pointers can be rearranged in the above pattern, it means that the
- // memory can be accessed with strided loads of width `w` and stride `%s`.
+ /// Suppose we are given pointers of the form: %b + x * %s + y * %c
+ /// where %c is constant. Check if the pointers can be rearranged as follows:
+ /// %b + 0 * %s + 0
+ /// %b + 0 * %s + 1
+ /// %b + 0 * %s + 2
+ /// ...
+ /// %b + 0 * %s + (w - 1)
+ ///
+ /// %b + 1 * %s + 0
+ /// %b + 1 * %s + 1
+ /// %b + 1 * %s + 2
+ /// ...
+ /// %b + 1 * %s + (w - 1)
+ /// ...
+ ///
+ /// If the pointers can be rearranged in the above pattern, it means that the
+ /// memory can be accessed with strided loads of width `w` and stride `%s`.
bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy,
Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
StridedPtrInfo *SPtrInfo) const;
- // Same as analyzeRtStrideCandidate, but for constant strides.
+ /// Same as analyzeRtStrideCandidate, but for constant strides.
bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps,
Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
@@ -6479,23 +6477,23 @@ static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
return Stride;
}
-// Suppose we are given pointers of the form: %b + x * %s + y * %c
-// where %c is constant. Check if the pointers can be rearranged as follows:
-// %b + 0 * %s + 0
-// %b + 0 * %s + 1
-// %b + 0 * %s + 2
-// ...
-// %b + 0 * %s + w
-//
-// %b + 1 * %s + 0
-// %b + 1 * %s + 1
-// %b + 1 * %s + 2
-// ...
-// %b + 1 * %s + w
-// ...
-//
-// If the pointers can be rearanged in the above pattern, it means that the
-// memory can be accessed with a strided loads of width `w` and stride `%s`.
+/// Suppose we are given pointers of the form: %b + x * %s + y * %c
+/// where %c is constant. Check if the pointers can be rearranged as follows:
+/// %b + 0 * %s + 0
+/// %b + 0 * %s + 1
+/// %b + 0 * %s + 2
+/// ...
+ /// %b + 0 * %s + (w - 1)
+///
+/// %b + 1 * %s + 0
+/// %b + 1 * %s + 1
+/// %b + 1 * %s + 2
+/// ...
+ /// %b + 1 * %s + (w - 1)
+/// ...
+///
+ /// If the pointers can be rearranged in the above pattern, it means that the
+ /// memory can be accessed with strided loads of width `w` and stride `%s`.
bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices,
@@ -6946,7 +6944,7 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
CompressMask, LoadVecTy);
}
-// Same as analyzeRtStrideCandidate, but for constant strides.
+/// Same as analyzeRtStrideCandidate, but for constant strides.
bool BoUpSLP::analyzeConstantStrideCandidate(
ArrayRef<Value *> PointerOps, Type *ElemTy, Align CommonAlignment,
SmallVectorImpl<unsigned> &SortedIndices, StridedPtrInfo *SPtrInfo,
@@ -15037,12 +15035,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getStridedMemoryOpCost(
- Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+ Instruction::Load, VecTy, LI0->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind);
- if (StridedLoadTy != VecTy)
- VecLdCost +=
- TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
- getCastContextHint(*E), CostKind);
+ //VecLdCost = TTI->getStridedMemoryOpCost(
+ // Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
+ // /*VariableMask=*/false, CommonAlignment, CostKind);
+ //if (StridedLoadTy != VecTy)
+ // VecLdCost +=
+ // TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
+ // getCastContextHint(*E), CostKind);
break;
}
>From 7cf95271dcf0750e2ab58e4d5edcbdeca3a4b6bf Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:10:11 -0700
Subject: [PATCH 10/13] revert a change made by mistake.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a36c66d793b26..71465ef0dc51c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6507,15 +6507,15 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
return false;
const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
- if (!Add)
- return false;
int64_t Offset = 0;
- for (int I : seq<int>(Add->getNumOperands())) {
- auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
- if (!SC)
- continue;
- Offset = SC->getAPInt().getSExtValue();
- break;
+ if (Add) {
+ for (int I : seq<int>(Add->getNumOperands())) {
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
+ if (!SC)
+ continue;
+ Offset = SC->getAPInt().getSExtValue();
+ break;
+ }
}
OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
>From 9c9f03c257519059281fa3ce6b30a0629c03daaf Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:13:57 -0700
Subject: [PATCH 11/13] Another DenseMap -> SmallDenseMap
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 71465ef0dc51c..2718c5560a17d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6499,7 +6499,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
SmallVectorImpl<unsigned> &SortedIndices,
StridedPtrInfo *SPtrInfo) const {
// Group the pointers by constant offset.
- DenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
+ SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
OffsetToPointerOpIdxMap;
for (auto [Idx, Ptr] : enumerate(PointerOps)) {
const SCEV *PtrSCEV = SE->getSCEV(Ptr);
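Like the TreeEntryToStridedPtrInfoMap change in patch 04, this swaps DenseMap for SmallDenseMap, which keeps a small number of buckets inline (four by default) and only falls back to heap allocation once the map grows, a good fit for maps that usually stay tiny. A minimal usage sketch:

```
// Minimal sketch: the default template keeps 4 buckets inline, so a map
// this small never touches the heap.
llvm::SmallDenseMap<int64_t, unsigned> OffsetCount;
OffsetCount[0] += 1; // stored inline at this size
```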
>From 540f4fb51714e1fb88ba5b7b7d1af5b68e800c6c Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 13 Aug 2025 13:21:53 -0700
Subject: [PATCH 12/13] put cost estimation back and make the order of type
operands correct.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2718c5560a17d..b3c812600ef0f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15035,15 +15035,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Align CommonAlignment =
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
VecLdCost = TTI->getStridedMemoryOpCost(
- Instruction::Load, VecTy, LI0->getPointerOperand(),
+ Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
/*VariableMask=*/false, CommonAlignment, CostKind);
- //VecLdCost = TTI->getStridedMemoryOpCost(
- // Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
- // /*VariableMask=*/false, CommonAlignment, CostKind);
- //if (StridedLoadTy != VecTy)
- // VecLdCost +=
- // TTI->getCastInstrCost(Instruction::BitCast, StridedLoadTy, VecTy,
- // getCastContextHint(*E), CostKind);
+ if (StridedLoadTy != VecTy)
+ VecLdCost +=
+ TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
+ getCastContextHint(*E), CostKind);
break;
}
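The fix restores the operand order the hook expects: destination type first, then source type, so the call now prices a cast to VecTy from StridedLoadTy. Condensed from the TargetTransformInfo declaration:

```
// Condensed member declaration; defaults on CostKind omitted.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                 CastContextHint CCH, TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const;
```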
>From 42edb85c7fac53ac53b6cafe72525bcd43155d41 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Sat, 30 Aug 2025 02:05:37 -0700
Subject: [PATCH 13/13] rebased on latest main
---
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 8c53bd9265302..5e300182657d5 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,11 +37,6 @@ static cl::opt<unsigned> SLPMaxVF(
"exclusively by SLP vectorizer."),
cl::Hidden);
-static cl::opt<bool> SLPPreferAltOpcVectorization(
- "riscv-v-slp-prefer-alt-opc-vectorization",
- cl::desc("Controls preferAlternateOpcodeVectorization"), cl::init(false),
- cl::Hidden);
-
static cl::opt<unsigned>
RVVMinTripCount("riscv-v-min-trip-count",
cl::desc("Set the lower bound of a trip count to decide on "
@@ -3045,7 +3040,3 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
}
return Options;
}
-
-bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
- return SLPPreferAltOpcVectorization;
-}