[llvm] [VPlan] Implement interleaving as VPlan-to-VPlan transform. (PR #95842)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 17 13:32:51 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes:

This patch implements explicit interleaving as a VPlan-to-VPlan transform. In follow-up patches this will allow simplifying VPTransformState (no need to store unrolled parts) as well as recipe execution (no need to generate code for multiple parts in each recipe). It also allows for more general optimizations (e.g. avoiding code generation for recipes that are uniform across parts).
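
To illustrate the overall idea, here is a minimal, self-contained C++ sketch. It uses hypothetical stand-in types (`Recipe`, `Plan`, `interleaveByCloning`), not the actual VPlan classes or the `VPlanTransforms::interleave` implementation added by this patch; it only shows the shape of the transform: the plan is rewritten up front so that each cloned recipe generates code for exactly one unrolled part, and execution can then assume UF == 1.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Hypothetical stand-ins for a recipe and a plan.
struct Recipe {
  std::string Name;
  unsigned Part = 0; // which unrolled part this clone generates code for
  void execute() const { std::printf("%s (part %u)\n", Name.c_str(), Part); }
};

struct Plan {
  std::vector<Recipe> Recipes;
};

// Rewrite the plan so it contains UF explicit clones of each recipe; after
// this, execution no longer needs to loop over parts inside each recipe.
void interleaveByCloning(Plan &P, unsigned UF) {
  std::vector<Recipe> Unrolled;
  for (const Recipe &R : P.Recipes)
    for (unsigned Part = 0; Part != UF; ++Part)
      Unrolled.push_back({R.Name, Part});
  P.Recipes = std::move(Unrolled);
}

int main() {
  Plan P{{{"widen-load"}, {"widen-add"}, {"widen-store"}}};
  interleaveByCloning(P, /*UF=*/2);
  for (const Recipe &R : P.Recipes)
    R.execute(); // each clone now covers exactly one part
}
```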

In the initial implementation, a number of recipes still take the unrolled part as an additional, optional operand if their execution depends on it.
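
The sketch below shows that pattern with hypothetical types, not the VPlan API. In the actual diff the part is appended as a trailing constant operand (for example, `VPWidenPointerInductionRecipe::execute` reads it via `getOperand(4)` when `getNumOperands() == 5`); here a plain struct simply defaults to part 0 when no extra operand was added.

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-in for a recipe whose trailing operand may encode the part.
struct RecipeWithOptionalPart {
  std::vector<unsigned> Operands;  // operand values after interleaving
  unsigned NumOriginalOperands;    // operand count before the transform ran

  unsigned getUnrolledPart() const {
    // No part operand appended: the recipe generates code for part 0.
    if (Operands.size() == NumOriginalOperands)
      return 0;
    // Otherwise the trailing operand carries the constant part number.
    return Operands.back();
  }
};

int main() {
  RecipeWithOptionalPart Original{{10, 20}, 2}; // no part operand -> part 0
  RecipeWithOptionalPart Clone{{10, 20, 3}, 2}; // trailing operand -> part 3
  std::printf("%u %u\n", Original.getUnrolledPart(), Clone.getUnrolledPart());
}
```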

The computation of start/step values for scalable inductions changed slightly. Previously the step would be computed as a scalar and then splatted; now vscale gets splatted and multiplied by the step in a vector multiply.
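
As a rough illustration of why the result is unchanged, here is a plain C++ sketch (fixed-width arrays standing in for scalable vectors, a hypothetical `VScale` value, no LLVM IR): the two schemes compute the same values and differ only in whether the multiply by the step happens in scalar code before the splat or as a vector multiply afterwards.

```cpp
#include <array>
#include <cstdio>

constexpr unsigned VF = 4; // fixed element count, standing in for a scalable VF
using Vec = std::array<long, VF>;

static Vec splat(long X) {
  Vec V;
  V.fill(X);
  return V;
}

static Vec vmul(const Vec &A, const Vec &B) {
  Vec R;
  for (unsigned I = 0; I != VF; ++I)
    R[I] = A[I] * B[I];
  return R;
}

int main() {
  long Step = 3, VScale = 2; // hypothetical runtime vscale
  // Old scheme: multiply step by vscale*VF as scalars, then splat the result.
  Vec Old = splat(Step * (VScale * (long)VF));
  // New scheme: splat vscale*VF first, then multiply by the step as vectors.
  Vec New = vmul(splat(VScale * (long)VF), splat(Step));
  std::printf("old %ld, new %ld\n", Old[0], New[0]); // same value either way
}
```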

This has been split off from https://github.com/llvm/llvm-project/pull/94339, which also includes changes to simplify VPTransformState and recipes' ::execute.

The current version mostly leaves the existing ::execute implementations untouched and instead sets VPTransformState::UF to 1.

---

Patch is 161.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95842.diff


29 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (+9) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+31-15) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+12) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+49-1) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+108-49) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+418) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll (+4-1) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll (+6-6) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll (-3) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll (+6-7) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll (+5-3) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll (-3) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/interleaving.ll (+34-34) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/pr47437.ll (+24-24) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll (+12-21) 
- (modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll (+42-42) 
- (modified) llvm/test/Transforms/LoopVectorize/float-induction.ll (+13-15) 
- (modified) llvm/test/Transforms/LoopVectorize/induction.ll (+39-38) 
- (modified) llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll (-2) 
- (modified) llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll (+12-12) 
- (modified) llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll (+60-60) 
- (modified) llvm/test/Transforms/LoopVectorize/scalable-inductions.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll (+13-6) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c03c278fcebe7..61fc02cdf9b8b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -161,6 +161,15 @@ class VPBuilder {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
+
+  VPInstruction *createFPOp(unsigned Opcode,
+                            std::initializer_list<VPValue *> Operands,
+                            DebugLoc DL = {}, const Twine &Name = "",
+                            FastMathFlags FMFs = {}) {
+    auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
+    return tryInsertInstruction(Op);
+  }
+
   VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
                      const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 408083765ccb1..732499ba9a33a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3476,10 +3476,7 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
   // initial value for the recurrence when jumping to the scalar loop.
   VPValue *VPExtract = LO->getOperand(0);
   using namespace llvm::VPlanPatternMatch;
-  assert(match(VPExtract, m_VPInstruction<VPInstruction::ExtractFromEnd>(
-                              m_VPValue(), m_VPValue())) &&
-         "FOR LiveOut expects to use an extract from end.");
-  Value *ResumeScalarFOR = State.get(VPExtract, UF - 1, true);
+  Value *ResumeScalarFOR = State.get(VPExtract, 0, true);
 
   // Fix the initial value of the original recurrence in the scalar loop.
   PHINode *ScalarHeaderPhi = LO->getPhi();
@@ -7429,6 +7426,8 @@ LoopVectorizationPlanner::executePlan(
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
   (void)IsEpilogueVectorization;
 
+  VPlanTransforms::interleave(BestVPlan, BestUF,
+                              OrigLoop->getHeader()->getModule()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
@@ -9152,42 +9151,59 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   auto *IVR = getParent()->getPlan()->getCanonicalIV();
   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
+  unsigned CurrentPart = 0;
+  if (getNumOperands() == 5)
+    CurrentPart =
+        cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
   Type *PhiType = IndDesc.getStep()->getType();
 
   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
   Type *ScStValueType = ScalarStartValue->getType();
-  PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
-                                           CanonicalIV->getIterator());
+  PHINode *NewPointerPhi = nullptr;
 
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
-  NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  if (getNumOperands() == 5) {
+    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
+    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
+  } else {
+    NewPointerPhi =
+        PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  }
 
   // A pointer induction, performed by using a gep
   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
+  unsigned UF = getNumOperands() == 2
+                    ? 1
+                    : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                          ->getZExtValue();
 
   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
   Value *NumUnrolledElems =
-      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
-  Value *InductionGEP = GetElementPtrInst::Create(
-      State.Builder.getInt8Ty(), NewPointerPhi,
-      State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
-      InductionLoc);
+      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
   // Add induction update using an incorrect block temporarily. The phi node
   // will be fixed after VPlan execution. Note that at this point the latch
   // block cannot be used, as it does not exist yet.
   // TODO: Model increment value in VPlan, by turning the recipe into a
   // multi-def and a subclass of VPHeaderPHIRecipe.
-  NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+  if (getNumOperands() != 5) {
+    Value *InductionGEP = GetElementPtrInst::Create(
+        State.Builder.getInt8Ty(), NewPointerPhi,
+        State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
+        InductionLoc);
+
+    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+  }
 
   // Create UF many actual address geps that use the pointer
   // phi as base and a vectorized version of the step value
   // (<step*0, ..., step*N>) as offset.
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Type *VecPhiType = VectorType::get(PhiType, State.VF);
-    Value *StartOffsetScalar =
-        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
+    Value *StartOffsetScalar = State.Builder.CreateMul(
+        RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
     Value *StartOffset =
         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
     // Create a vector of consecutive numbers from zero to VF.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 348a2be5072b4..2e47704fe274c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -821,6 +821,10 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   // FIXME: Model VF * UF computation completely in VPlan.
   VFxUF.setUnderlyingValue(
       createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
+  if (VF.getNumUsers() > 0) {
+    VF.setUnderlyingValue(
+        createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
+  }
 
   // When vectorizing the epilogue loop, the canonical induction start value
   // needs to be changed from zero to the value after the main vector loop.
@@ -846,6 +850,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  State->UF = 1;
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
@@ -890,6 +895,9 @@ void VPlan::execute(VPTransformState *State) {
       // Move the last step to the end of the latch block. This ensures
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+      if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
+        Inc->setOperand(0, State->get(R.getOperand(3), 0));
+
       Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
       continue;
     }
@@ -1254,6 +1262,10 @@ void VPlanIngredient::print(raw_ostream &O) const {
 
 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
 
+bool VPValue::isDefinedOutsideVectorRegions() const {
+  return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent();
+}
+
 void VPValue::replaceAllUsesWith(VPValue *New) {
   replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5bb88e4a57dc3..706851f84de23 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -701,6 +701,8 @@ class VPLiveOut : public VPUser {
 
   PHINode *getPhi() const { return Phi; }
 
+  bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the VPLiveOut to \p O.
   void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
@@ -1330,6 +1332,9 @@ class VPInstruction : public VPRecipeWithIRFlags {
   /// Returns true if this VPInstruction produces a scalar value from a vector,
   /// e.g. by performing a reduction or extracting a lane.
   bool isVectorToScalar() const;
+
+  /// Return the interleave count from the VPInstruction's last argument.
+  unsigned getInterleaveCount() const;
 };
 
 /// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -1611,6 +1616,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
                                      isInBounds(), getDebugLoc());
   }
 
+  /// Return the current part for this vector pointer.
+  unsigned getPartForRecipe() const;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1951,6 +1959,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
 
   /// Returns true, if the phi is part of an in-loop reduction.
   bool isInLoop() const { return IsInLoop; }
+
+  /// Return the current part for this scalar step.
+  unsigned getPartForRecipe() const;
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -2593,6 +2604,9 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
   /// Generate the canonical scalar induction phi of the vector loop.
   void execute(VPTransformState &State) override;
 
+  /// Return the current part for this scalar step.
+  unsigned getPartForRecipe() const;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2637,7 +2651,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
   VPActiveLaneMaskPHIRecipe *clone() override {
-    return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
+    auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
+    R->addOperand(getOperand(1));
+    return R;
   }
 
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
@@ -2715,6 +2731,9 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
+  /// Return the current part for this scalar step.
+  unsigned getPartForRecipe() const;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2827,6 +2846,9 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
            "Op must be an operand of the recipe");
     return true;
   }
+
+  /// Return the current part for this scalar step.
+  unsigned getPartForRecipe() const;
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -3145,6 +3167,8 @@ class VPlan {
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPValue VFxUF;
 
+  VPValue VF;
+
   /// Holds a mapping between Values and their corresponding VPValue inside
   /// VPlan.
   Value2VPValueTy Value2VPValue;
@@ -3232,6 +3256,7 @@ class VPlan {
 
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
+  VPValue &getVF() { return VF; }
 
   void addVF(ElementCount VF) { VFs.insert(VF); }
 
@@ -3665,6 +3690,29 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return VPI->isVectorToScalar();
   return false;
 }
+
+/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
+/// if it is either defined outside the vector region or its operand is known to
+/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
+inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
+  if (auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe())) {
+    return VPI ==
+           VPI->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue();
+  }
+  if (isa<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe, VPExpandSCEVRecipe>(V))
+    return true;
+  if (isa<VPReplicateRecipe>(V) && cast<VPReplicateRecipe>(V)->isUniform() &&
+      (isa<LoadInst, StoreInst>(V->getUnderlyingValue())) &&
+      all_of(V->getDefiningRecipe()->operands(),
+             [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); }))
+    return true;
+
+  auto *C = dyn_cast_or_null<VPScalarCastRecipe>(V->getDefiningRecipe());
+  return C && (C->isDefinedOutsideVectorRegions() ||
+               isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
+               isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)));
+}
+
 } // end namespace vputils
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a3ff6395bb39e..50aae9f28c562 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -393,9 +393,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     if (Part != 0)
       return State.get(this, 0, /*IsScalar*/ true);
 
+    unsigned UF = getInterleaveCount();
     Value *ScalarTC = State.get(getOperand(0), {0, 0});
-    Value *Step =
-        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
+    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
     Value *Sub = Builder.CreateSub(ScalarTC, Step);
     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
@@ -428,14 +428,15 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
-    if (Part == 0)
-      return IV;
-
-    // The canonical IV is incremented by the vectorization factor (num of SIMD
-    // elements) times the unroll part.
-    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
-    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
-                             hasNoSignedWrap());
+    if (getNumOperands() == 2) {
+      // The canonical IV is incremented by the vectorization factor (num of
+      // SIMD elements) times the unroll part.
+      Value *Step = createStepForVF(Builder, IV->getType(), State.VF,
+                                    getInterleaveCount());
+      return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
+                               hasNoSignedWrap());
+    }
+    return IV;
   }
   case VPInstruction::BranchOnCond: {
     if (Part != 0)
@@ -483,8 +484,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return CondBr;
   }
   case VPInstruction::ComputeReductionResult: {
-    if (Part != 0)
-      return State.get(this, 0, /*IsScalar*/ true);
+    unsigned NumParts = getNumOperands() - 1;
 
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
     // and will be removed by breaking up the recipe further.
@@ -495,11 +495,10 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
 
     RecurKind RK = RdxDesc.getRecurrenceKind();
 
-    VPValue *LoopExitingDef = getOperand(1);
     Type *PhiTy = OrigPhi->getType();
-    VectorParts RdxParts(State.UF);
-    for (unsigned Part = 0; Part < State.UF; ++Part)
-      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
+    VectorParts RdxParts(NumParts);
+    for (unsigned Part = 0; Part != NumParts; ++Part)
+      RdxParts[Part] = State.get(getOperand(1 + Part), 0, PhiR->isInLoop());
 
     // If the vector reduction can be performed in a smaller type, we truncate
     // then extend the loop exit value to enable InstCombine to evaluate the
@@ -507,7 +506,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     // TODO: Handle this in truncateToMinBW.
     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
-      for (unsigned Part = 0; Part < State.UF; ++Part)
+      for (unsigned Part = 0; Part < NumParts; ++Part)
         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
     }
     // Reduce all of the unrolled parts into a single vector.
@@ -517,12 +516,12 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
       Op = Instruction::Or;
 
     if (PhiR->isOrdered()) {
-      ReducedPartRdx = RdxParts[State.UF - 1];
+      ReducedPartRdx = RdxParts[NumParts - 1];
     } else {
       // Floating-point operations should have some FMF to enable the reduction.
       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-      for (unsigned Part = 1; Part < State.UF; ++Part) {
+      for (unsigned Part = 1; Part < NumParts; ++Part) {
         Value *RdxPart = RdxParts[Part];
         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
           ReducedPartRdx = Builder.CreateBinOp(
@@ -603,6 +602,13 @@ bool VPInstruction::isVectorToScalar() const {
          getOpcode() == VPInstruction::ComputeReductionResult;
 }
 
+unsigned VPInstruction::getInterleaveCount() const {
+  return getNumOperands() == 1
+             ? 1
+             : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
+                   ->getZExtValue();
+}
+
 #if !defined(NDEBUG)
 bool VPInstruction::isFPMathOp() const {
   // Inspired by FPMathOperator::classof. Notable differences are that we don't
@@ -1215,24 +1221,32 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
     MulOp = Instruction::FMul;
   }
 
-  // Multiply the vectorization factor by the step using integer or
-  // floating-point arithmetic as appropriate.
-  Type *StepType = Step->getType();
-  Value *RuntimeVF;
-  if (Step->getType()->isFloatingPointTy())
-    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
-  else
-    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
-  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
-  // Create a vector splat to use in the induction update.
-  //
-  // FIXME: If the step is non-constant, we create the vector splat with
-  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
-  //        handle a constant vector splat.
-  Value *SplatVF = isa<Constant>(Mul)
-                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
-                       : Builder.CreateVectorSplat(State.VF, Mul);
+  Value *SplatVF;
+  if (getNumOperands() == 4) {
+    // Need to create stuff in PH.
+    SplatVF = State.get(getOperand(2), 0);
+  } else {
+
+    // Multiply the vectorization factor by the step using integer or
+    // floating-point arithmetic as appropriate.
+    Type *StepType = Step->getType();
+    Value *RuntimeVF;
+    if (Step->getType()->isFloatingPointTy())
+      RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
+    else
+      RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
+    Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
+
+    // Create a vector splat to use in the induction update.
+    //
+    // FIXME: If the step is non-constant, we create the vector splat with
+    //        IRBuilder. IRBuilder can constant-fold the multiply, but it
+    //        doesn't handle a constant vector splat.
+    SplatVF = isa<Constant>(Mul)
+                  ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+                  : Builder.CreateVectorSplat(State.VF, Mul);
+  }
+
   Builder.restoreIP(CurrIP);
 
   // We may need to add the step a number of times, depending on the unroll
@@ -1362,7 +1376,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
     EndLane = StartLane + 1;
   }
   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
-    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
+    Value *StartIdx0 =
+        cre...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/95842


More information about the llvm-commits mailing list