[llvm] [LV] Convert gather loads with invariant stride into strided loads (PR #147297)

Fri Aug 22 02:54:40 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-vectorizers

Author: Mel Chen (Mel-Chen)

<details>
<summary>Changes</summary>

This patch detects stride memory accesses such as:
```
  void stride(int* a, int *b, int n) {
    for (int i = 0; i < n; i++)
      a[i * 5] = b[i * 5] + i;
  }
```
and converts widen non-consecutive loads (i.e. gathers) into strided loads when profitable and legal.
The transformation is implemented as part of VPlan and is inspired by existing logic in RISCVGatherScatterLowering. Some of the legality and analysis logic has been moved into VPlan to enable this conversion earlier during vectorization planning.

This enables more efficient code generation for targets like RISC-V that support strided loads natively.

---

Patch is 73.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147297.diff


14 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+17-9) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+83-12) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+4-2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+66-6) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+193-5) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+6) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (+2-1) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll (+3-1) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll (+12-10) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll (+12-6) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll (+5-19) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll (+46-85) 
- (modified) llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll (+2-2) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c4110582da1ef..528e6abc78aee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4073,7 +4073,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
                 [](const auto *R) { return Instruction::Select; })
             .Case<VPWidenStoreRecipe>(
                 [](const auto *R) { return Instruction::Store; })
-            .Case<VPWidenLoadRecipe>(
+            .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
                 [](const auto *R) { return Instruction::Load; })
             .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
                 [](const auto *R) { return Instruction::Call; })
@@ -4172,6 +4172,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPWidenPointerInductionSC:
       case VPDef::VPReductionPHISC:
       case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenStridedLoadSC:
       case VPDef::VPWidenLoadEVLSC:
       case VPDef::VPWidenLoadSC:
       case VPDef::VPWidenStoreEVLSC:
@@ -7732,7 +7733,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
           new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
                                        /*Stride*/ -1, Flags, I->getDebugLoc());
     } else {
-      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+      const DataLayout &DL = I->getDataLayout();
+      auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
+      VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
+      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
@@ -8832,19 +8836,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                                 *Plan))
     return nullptr;
 
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
   // Transform recipes to abstract recipes if it is legal and beneficial and
   // clamp the range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
-  if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+  if (!CM.foldTailWithEVL())
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
-  }
-
-  for (ElementCount VF : Range)
-    Plan->addVF(VF);
-  Plan->setName("Initial VPlan");
 
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
@@ -8853,6 +8852,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                            InterleaveGroups, RecipeBuilder,
                            CM.isScalarEpilogueAllowed());
 
+  // Convert memory recipes to strided access recipes if the strided access is
+  // legal and profitable.
+  VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+                           CostCtx, Range);
+
+  for (ElementCount VF : Range)
+    Plan->addVF(VF);
+  Plan->setName("Initial VPlan");
+
   // Replace VPValues for known constant strides guaranteed by predicate scalar
   // evolution.
   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6814dc5de6716..3c3af87cc0f6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -559,6 +559,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
+    case VPRecipeBase::VPWidenStridedLoadSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1746,10 +1747,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
 
 /// A recipe for handling GEP instructions.
 class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
-  bool isPointerLoopInvariant() const {
-    return getOperand(0)->isDefinedOutsideLoopRegions();
-  }
-
   bool isIndexLoopInvariant(unsigned I) const {
     return getOperand(I + 1)->isDefinedOutsideLoopRegions();
   }
@@ -1778,6 +1775,30 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
 
   VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
 
+  bool isPointerLoopInvariant() const {
+    return getOperand(0)->isDefinedOutsideLoopRegions();
+  }
+
+  std::optional<unsigned> getUniqueVariantIndex() const {
+    std::optional<unsigned> VarIdx;
+    for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
+      if (isIndexLoopInvariant(I))
+        continue;
+
+      if (VarIdx)
+        return std::nullopt;
+      VarIdx = I;
+    }
+    return VarIdx;
+  }
+
+  Type *getIndexedType(unsigned I) const {
+    auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+    Type *SourceElementType = GEP->getSourceElementType();
+    SmallVector<Value *, 4> Ops(GEP->idx_begin(), GEP->idx_begin() + I);
+    return GetElementPtrInst::getIndexedType(SourceElementType, Ops);
+  }
+
   /// Generate the gep nodes.
   void execute(VPTransformState &State) override;
 
@@ -1866,20 +1887,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
 #endif
 };
 
-/// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// A recipe to compute the pointers for widened memory accesses of IndexedTy,
+/// with the Stride expressed in units of IndexedTy.
 class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
-                              public VPUnrollPartAccessor<1> {
+                              public VPUnrollPartAccessor<2> {
   Type *IndexedTy;
 
 public:
-  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
-                        DebugLoc DL)
-      : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
-                            GEPFlags, DL),
+  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride,
+                        GEPNoWrapFlags GEPFlags, DebugLoc DL)
+      : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC,
+                            ArrayRef<VPValue *>({Ptr, Stride}), GEPFlags, DL),
         IndexedTy(IndexedTy) {}
 
   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
 
+  VPValue *getStride() const { return getOperand(1); }
+
   void execute(VPTransformState &State) override;
 
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -1897,7 +1921,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
   }
 
   VPVectorPointerRecipe *clone() override {
-    return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+    return new VPVectorPointerRecipe(getOperand(0), IndexedTy, getStride(),
                                      getGEPNoWrapFlags(), getDebugLoc());
   }
 
@@ -3040,7 +3064,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
-           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -3161,6 +3186,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   }
 };
 
+/// A recipe for strided load operations, using the base address, stride, and an
+/// optional mask. This recipe will generate an vp.strided.load intrinsic call
+/// to represent memory accesses with a fixed stride.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+                                        public VPValue {
+  VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+                           VPValue *VF, VPValue *Mask,
+                           const VPIRMetadata &Metadata, DebugLoc DL)
+      : VPWidenMemoryRecipe(
+            VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
+            /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
+        VPValue(this, &Load) {
+    setMask(Mask);
+  }
+
+  VPWidenStridedLoadRecipe *clone() override {
+    return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+                                        getStride(), getVF(), getMask(), *this,
+                                        getDebugLoc());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
+
+  /// Return the stride operand.
+  VPValue *getStride() const { return getOperand(1); }
+
+  /// Return the VF operand.
+  VPValue *getVF() const { return getOperand(2); }
+
+  /// Generate a strided load.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr() || Op == getStride() || Op == getVF();
+  }
+};
+
 /// A recipe for widening store operations, using the stored value, the address
 /// to store to and an optional mask.
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 6c9a78add6822..36760feb1cb14 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -187,8 +187,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
-         "Store recipes should not define any values");
+  assert(
+      (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          R)) &&
+      "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a92540f457c73..90c05b7b8076c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -81,6 +81,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
+  case VPWidenStridedLoadSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenPHISC:
@@ -104,6 +105,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
     return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
   case VPInstructionSC:
     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+  case VPWidenStridedLoadSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
     return true;
@@ -185,6 +187,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   }
   case VPInterleaveSC:
     return mayWriteToMemory();
+  case VPWidenStridedLoadSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenStoreEVLSC:
@@ -2512,13 +2515,22 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPVectorPointerRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   unsigned CurrentPart = getUnrollPart(*this);
-  Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
-                                /*IsUnitStride*/ true, CurrentPart, Builder);
+  Value *Stride = State.get(getStride(), /*IsScalar*/ true);
+
+  auto *StrideC = dyn_cast<ConstantInt>(Stride);
+  bool IsStrideOne = StrideC && StrideC->isOne();
+  bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne());
+  Type *IndexTy =
+      getGEPIndexTy(State.VF.isScalable(),
+                    /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder);
   Value *Ptr = State.get(getOperand(0), VPLane(0));
 
+  Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy);
   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
+  Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride);
+
   Value *ResultPtr =
-      Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
+      Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
 
   State.set(this, ResultPtr, /*IsScalar*/ true);
 }
@@ -3170,9 +3182,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                     ->getAddressSpace();
-  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
-                        ? Instruction::Load
-                        : Instruction::Store;
+  unsigned Opcode =
+      isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          this)
+          ? Instruction::Load
+          : Instruction::Store;
 
   if (!Consecutive) {
     // TODO: Using the original IR may not be accurate.
@@ -3182,6 +3196,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     Type *PtrTy = toVectorTy(Ptr->getType(), VF);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
+
+    if (isa<VPWidenStridedLoadRecipe>(this))
+      return Ctx.TTI.getStridedMemoryOpCost(
+          Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
+
     return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
                                              Ctx.CostKind) +
            Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
@@ -3331,6 +3350,47 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
+  Value *StrideInBytes = State.get(getStride(), /*IsScalar*/ true);
+  Value *Mask = nullptr;
+  if (VPValue *VPMask = getMask())
+    Mask = State.get(VPMask);
+  else
+    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
+                                               Builder.getInt32Ty());
+
+  auto *PtrTy = Addr->getType();
+  auto *StrideTy = StrideInBytes->getType();
+  CallInst *NewLI = Builder.CreateIntrinsic(
+      Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+      {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");
+  NewLI->addParamAttr(
+      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+  applyMetadata(*NewLI);
+  State.set(this, NewLI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+                                     VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN ";
+  printAsOperand(O, SlotTracker);
+  O << " = load ";
+  getAddr()->printAsOperand(O, SlotTracker);
+  O << ", stride = ";
+  getStride()->printAsOperand(O, SlotTracker);
+  O << ", runtimeVF = ";
+  getVF()->printAsOperand(O, SlotTracker);
+}
+#endif
+
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   VPValue *StoredVPValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 677d97e54d556..cf9b1c4199449 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2203,6 +2203,12 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
         VPValue *NewAddr = GetNewAddr(L->getAddr());
         return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
       })
+      .Case<VPWidenStridedLoadRecipe>([&](VPWidenStridedLoadRecipe *L) {
+        VPValue *NewMask = GetNewMask(L->getMask());
+        return new VPWidenStridedLoadRecipe(
+            *cast<LoadInst>(&L->getIngredient()), L->getAddr(), L->getStride(),
+            &EVL, NewMask, *L, L->getDebugLoc());
+      })
       .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
         VPValue *NewMask = GetNewMask(S->getMask());
         VPValue *NewAddr = GetNewAddr(S->getAddr());
@@ -2237,10 +2243,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
-  assert(all_of(Plan.getVF().users(),
-                IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
-                        VPWidenIntOrFpInductionRecipe>) &&
-         "User of VF that we can't transform to EVL.");
+  assert(
+      all_of(
+          Plan.getVF().users(),
+          IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
+                  VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>) &&
+      "User of VF that we can't transform to EVL.");
   Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
     return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
   });
@@ -2337,7 +2345,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
     assert(NumDefVal <= 1 &&
            "Only supports recipes with a single definition or without users.");
     EVLRecipe->insertBefore(CurRecipe);
-    if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
+    if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+            EVLRecipe)) {
       VPValue *CurVPV = CurRecipe->getVPSingleValue();
       CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
     }
@@ -3767,3 +3776,182 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(
       MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
   MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
 }
+
+static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
+  if (auto *WidenIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(CurIndex))
+    return {WidenIV, WidenIV->getStepValue()};
+
+  auto *WidenR = dyn_cast<VPWidenRecipe>(CurIndex);
+  if (!WidenR || !CurIndex->getUnderlyingValue())
+    return {nullptr, nullptr};
+
+  unsigned Opcode = WidenR->getOpcode();
+  // TODO: Support Instruction::Add and Instruction::Or.
+  if (Opcode != Instruction::Shl && Opcode != Instruction::Mul)
+    return {nullptr, nullptr};
+
+  // Match the pattern binop(variant, invariant), or binop(invariant, variant)
+  // if the binary operator is commutative.
+  bool IsLHSUniform = vputils::isSingleScalar(WidenR->getOperand(0));
+  if (IsLHSUniform == vputils::isSingleScalar(WidenR->getOperand(1)) ||
+      (IsLHSUniform && !Instruction::isCommutative(Opcode)))
+    return {nullptr, nullptr};
+  unsigned VarIdx = IsLHSUniform ? 1 : 0;
+
+  auto [Start, Stride] = matchStridedStart(WidenR->getOperand(VarIdx));
+  if (!Start)
+    return {nullptr, nullptr};
+
+  SmallVector<VPValue *> StartOps(WidenR->operands());
+  StartOps[VarIdx] = Start;
+  auto *StartR = new VPReplicateRecipe(WidenR->getUnderlyingInstr(), StartOps,
+                                       /*IsUniform*/ true);
+  StartR->insertBefore(WidenR);
+
+  unsigned InvIdx = VarIdx == 0 ? 1 : 0;
+  auto *StrideR =
+      new VPInstruction(Opcode, {Stride,...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/147297