[llvm] [VPlan] Implement interleaving as VPlan-to-VPlan transform. (PR #95842)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 21 10:14:24 PDT 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/95842

From 0c3c2935e9da471b6d636242c2b8b1d90f7a8859 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 13 Jun 2024 21:43:24 +0100
Subject: [PATCH 01/15] [VPlan] Implement interleaving as VPlan-to-VPlan
 transform.

This patch implements explicit interleaving as a VPlan transform. In
follow-up patches this will allow simplifying VPTransformState (no
need to store unrolled parts) as well as recipe execution (no need
to generate code for multiple parts in each recipe). It also allows
for more general optimizations (e.g. avoiding code generation for
recipes that are uniform across parts).
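
One such case is sketched below (simplified from the transform in this
patch; InterleavedValues maps each original VPValue to its per-part
copies and IC is the interleave count): a value that is uniform across
parts is simply recorded for every part instead of having its defining
recipe cloned.

  // Uniform values are reused for all parts; no clones are created.
  if (vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
    for (unsigned I = 1; I != IC; ++I)
      InterleavedValues[SingleDef].push_back(SingleDef);
    continue;
  }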

In the initial implementation, a number of recipes still take the
unrolled part as an additional, optional operand, if their execution
depends on the unrolled part.
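
A rough sketch of how such recipes recover their part (VPFooRecipe and
NumBaseOperands are placeholders; the exact operand position varies
per recipe):

  // The transform appends the part as a constant live-in operand; a
  // recipe without the extra operand computes part 0.
  unsigned VPFooRecipe::getPartForRecipe() const {
    return getNumOperands() == NumBaseOperands
               ? 0
               : cast<ConstantInt>(
                     getOperand(NumBaseOperands)->getLiveInIRValue())
                     ->getZExtValue();
  }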

The computation of start/step values for scalable inductions changed
slightly. Previously the step was computed as a scalar and then
splatted; now vscale gets splatted and multiplied by the step in a
vector multiply.
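
In IRBuilder terms the change is roughly as follows (an illustrative
sketch only; Step, StepTy and VF stand for the induction's step value,
its type and the element count, and the integer case is shown):

  // Old: multiply the step by the runtime VF as a scalar, then splat.
  Value *RuntimeVF = getRuntimeVF(Builder, StepTy, VF); // vscale * MinVF
  Value *OldSplat =
      Builder.CreateVectorSplat(VF, Builder.CreateMul(Step, RuntimeVF));
  // New: splat the runtime VF and the step, then multiply as vectors.
  Value *NewSplat =
      Builder.CreateMul(Builder.CreateVectorSplat(VF, RuntimeVF),
                        Builder.CreateVectorSplat(VF, Step));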

This has been split off from
https://github.com/llvm/llvm-project/pull/94339, which also includes
changes to simplify VPTransformState and recipes' ::execute
implementations.

The current version mostly leaves the existing ::execute
implementations untouched and instead sets VPTransformState::UF to 1.
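
As a result, any remaining per-part loop in a recipe's ::execute runs
a single iteration; for example (illustrative, using VPInstruction's
generatePerPart hook):

  // VPlan::execute sets State.UF = 1, so this body runs once; the
  // other parts now exist as separate recipes in the plan.
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, generatePerPart(State, Part), Part);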
---
 .../Vectorize/LoopVectorizationPlanner.h      |   9 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |  83 ++++
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  12 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |  50 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 156 +++++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 418 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   2 +
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   2 +-
 .../AArch64/arbitrary-induction-step.ll       |   4 +-
 .../AArch64/sve-gather-scatter.ll             |   2 +-
 .../AArch64/sve-inductions-unusual-types.ll   |   4 +-
 .../LoopVectorize/AArch64/sve-widen-phi.ll    |  25 --
 .../RISCV/interleaved-accesses.ll             |  12 +-
 .../LoopVectorize/RISCV/uniform-load-store.ll |   3 -
 ...ectorize-force-tail-with-evl-interleave.ll |  36 +-
 .../X86/epilog-vectorization-inductions.ll    |   8 +-
 .../LoopVectorize/X86/interleaving.ll         |  80 ----
 .../Transforms/LoopVectorize/X86/pr47437.ll   |  48 +-
 .../LoopVectorize/X86/uniform_mem_op.ll       |  33 +-
 .../LoopVectorize/first-order-recurrence.ll   |  84 ++--
 .../LoopVectorize/float-induction.ll          |  28 +-
 .../Transforms/LoopVectorize/induction.ll     |  77 ++--
 .../interleave-and-scalarize-only.ll          |   2 -
 .../pr45679-fold-tail-by-masking.ll           |  24 +-
 .../LoopVectorize/reduction-inloop-uf4.ll     | 120 ++---
 .../LoopVectorize/scalable-inductions.ll      |   4 +-
 .../vplan-printing-before-execute.ll          |  19 +-
 27 files changed, 917 insertions(+), 428 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index f6271300539320..4de253b6cc475b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -161,6 +161,15 @@ class VPBuilder {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
+
+  VPInstruction *createFPOp(unsigned Opcode,
+                            std::initializer_list<VPValue *> Operands,
+                            DebugLoc DL = {}, const Twine &Name = "",
+                            FastMathFlags FMFs = {}) {
+    auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
+    return tryInsertInstruction(Op);
+  }
+
   VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
                      const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f1bb96a38cfaa9..f47c33d8a96941 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7392,6 +7392,8 @@ LoopVectorizationPlanner::executePlan(
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
   (void)IsEpilogueVectorization;
 
+  VPlanTransforms::interleave(BestVPlan, BestUF,
+                              OrigLoop->getHeader()->getModule()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
@@ -9228,6 +9230,87 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   VPlanTransforms::clearReductionWrapFlags(*Plan);
 }
 
+void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
+  assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
+         "Not a pointer induction according to InductionDescriptor!");
+  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
+         "Unexpected type.");
+  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
+         "Recipe should have been replaced");
+
+  auto *IVR = getParent()->getPlan()->getCanonicalIV();
+  PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
+  unsigned CurrentPart = 0;
+  if (getNumOperands() == 5)
+    CurrentPart =
+        cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
+  Type *PhiType = IndDesc.getStep()->getType();
+
+  // Build a pointer phi
+  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
+  Type *ScStValueType = ScalarStartValue->getType();
+  PHINode *NewPointerPhi = nullptr;
+
+  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+  if (getNumOperands() == 5) {
+    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
+    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
+  } else {
+    NewPointerPhi =
+        PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  }
+
+  // A pointer induction, performed by using a gep
+  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
+  unsigned UF = getNumOperands() == 2
+                    ? 1
+                    : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                          ->getZExtValue();
+
+  Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
+  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
+  Value *NumUnrolledElems =
+      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
+  // Add induction update using an incorrect block temporarily. The phi node
+  // will be fixed after VPlan execution. Note that at this point the latch
+  // block cannot be used, as it does not exist yet.
+  // TODO: Model increment value in VPlan, by turning the recipe into a
+  // multi-def and a subclass of VPHeaderPHIRecipe.
+  if (getNumOperands() != 5) {
+    Value *InductionGEP = GetElementPtrInst::Create(
+        State.Builder.getInt8Ty(), NewPointerPhi,
+        State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
+        InductionLoc);
+
+    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+  }
+
+  // Create UF many actual address geps that use the pointer
+  // phi as base and a vectorized version of the step value
+  // (<step*0, ..., step*N>) as offset.
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Type *VecPhiType = VectorType::get(PhiType, State.VF);
+    Value *StartOffsetScalar = State.Builder.CreateMul(
+        RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
+    Value *StartOffset =
+        State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
+    // Create a vector of consecutive numbers from zero to VF.
+    StartOffset = State.Builder.CreateAdd(
+        StartOffset, State.Builder.CreateStepVector(VecPhiType));
+
+    assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
+           "scalar step must be the same across all parts");
+    Value *GEP = State.Builder.CreateGEP(
+        State.Builder.getInt8Ty(), NewPointerPhi,
+        State.Builder.CreateMul(
+            StartOffset,
+            State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
+            "vector.gep"));
+    State.set(this, GEP, Part);
+  }
+}
+
 void VPDerivedIVRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 2a9e4e12190cdc..8933e5fc178307 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -931,6 +931,10 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   // FIXME: Model VF * UF computation completely in VPlan.
   VFxUF.setUnderlyingValue(
       createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
+  if (VF.getNumUsers() > 0) {
+    VF.setUnderlyingValue(
+        createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
+  }
 
   // When vectorizing the epilogue loop, the canonical induction start value
   // needs to be changed from zero to the value after the main vector loop.
@@ -974,6 +978,7 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  State->UF = 1;
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
@@ -1048,6 +1053,9 @@ void VPlan::execute(VPTransformState *State) {
       // Move the last step to the end of the latch block. This ensures
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+      if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
+        Inc->setOperand(0, State->get(R.getOperand(3), 0));
+
       Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
       continue;
     }
@@ -1418,6 +1426,10 @@ void VPlanIngredient::print(raw_ostream &O) const {
 
 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
 
+bool VPValue::isDefinedOutsideVectorRegions() const {
+  return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent();
+}
+
 void VPValue::replaceAllUsesWith(VPValue *New) {
   replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 016ad75c21ceb0..a79bdfdf098fdd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -727,6 +727,8 @@ class VPLiveOut : public VPUser {
 
   PHINode *getPhi() const { return Phi; }
 
+  bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the VPLiveOut to \p O.
   void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
@@ -1397,6 +1399,9 @@ class VPInstruction : public VPRecipeWithIRFlags {
   /// Returns true if this VPInstruction's operands are single scalars and the
   /// result is also a single scalar.
   bool isSingleScalar() const;
+
+  /// Return the interleave count from the last operand; 1 if not present.
+  unsigned getInterleaveCount() const;
 };
 
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
@@ -1686,6 +1691,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
                                      isInBounds(), getDebugLoc());
   }
 
+  /// Return the current part for this vector pointer.
+  unsigned getPartForRecipe() const;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2026,6 +2034,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
 
   /// Returns true, if the phi is part of an in-loop reduction.
   bool isInLoop() const { return IsInLoop; }
+
+  /// Return the current part for this reduction phi.
+  unsigned getPartForRecipe() const;
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -2736,6 +2747,9 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
   /// Generate the canonical scalar induction phi of the vector loop.
   void execute(VPTransformState &State) override;
 
+  /// Return the current part for this canonical IV.
+  unsigned getPartForRecipe() const;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2780,7 +2794,10 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
   VPActiveLaneMaskPHIRecipe *clone() override {
-    return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
+    auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
+    if (getNumOperands() == 2)
+      R->addOperand(getOperand(1));
+    return R;
   }
 
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
@@ -2858,6 +2874,9 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
+  /// Return the current part for this widened canonical IV.
+  unsigned getPartForRecipe() const;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2970,6 +2989,9 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
            "Op must be an operand of the recipe");
     return true;
   }
+
+  /// Return the current part for this scalar step.
+  unsigned getPartForRecipe() const;
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -3294,6 +3316,8 @@ class VPlan {
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPValue VFxUF;
 
+  VPValue VF;
+
   /// Holds a mapping between Values and their corresponding VPValue inside
   /// VPlan.
   Value2VPValueTy Value2VPValue;
@@ -3388,6 +3412,7 @@ class VPlan {
 
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
+  VPValue &getVF() { return VF; }
 
   void addVF(ElementCount VF) { VFs.insert(VF); }
 
@@ -3825,6 +3850,29 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
 
 /// Return true if \p V is a header mask in \p Plan.
 bool isHeaderMask(const VPValue *V, VPlan &Plan);
+
+/// Checks if \p V is uniform across all VFs and UFs. It is considered as such
+/// if it is either defined outside the vector region or its operands are known
+/// to be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
+inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
+  if (auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe())) {
+    return VPI ==
+           VPI->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue();
+  }
+  if (isa<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe, VPExpandSCEVRecipe>(V))
+    return true;
+  if (isa<VPReplicateRecipe>(V) && cast<VPReplicateRecipe>(V)->isUniform() &&
+      (isa<LoadInst, StoreInst>(V->getUnderlyingValue())) &&
+      all_of(V->getDefiningRecipe()->operands(),
+             [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); }))
+    return true;
+
+  auto *C = dyn_cast_or_null<VPScalarCastRecipe>(V->getDefiningRecipe());
+  return C && (C->isDefinedOutsideVectorRegions() ||
+               isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
+               isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)));
+}
+
 } // end namespace vputils
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 911b2fe9e9a1eb..63ec22faf5c76c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -453,9 +453,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     if (Part != 0)
       return State.get(this, 0, /*IsScalar*/ true);
 
+    unsigned UF = getInterleaveCount();
     Value *ScalarTC = State.get(getOperand(0), {0, 0});
-    Value *Step =
-        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
+    Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
     Value *Sub = Builder.CreateSub(ScalarTC, Step);
     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
@@ -488,14 +488,15 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
-    if (Part == 0)
-      return IV;
-
-    // The canonical IV is incremented by the vectorization factor (num of SIMD
-    // elements) times the unroll part.
-    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
-    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
-                             hasNoSignedWrap());
+    if (getNumOperands() == 2) {
+      // The canonical IV is incremented by the vectorization factor (num of
+      // SIMD elements) times the unroll part.
+      Value *Step = createStepForVF(Builder, IV->getType(), State.VF,
+                                    getInterleaveCount());
+      return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
+                               hasNoSignedWrap());
+    }
+    return IV;
   }
   case VPInstruction::BranchOnCond: {
     if (Part != 0)
@@ -543,8 +544,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return CondBr;
   }
   case VPInstruction::ComputeReductionResult: {
-    if (Part != 0)
-      return State.get(this, 0, /*IsScalar*/ true);
+    unsigned NumParts = getNumOperands() - 1;
 
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
     // and will be removed by breaking up the recipe further.
@@ -555,11 +555,10 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
 
     RecurKind RK = RdxDesc.getRecurrenceKind();
 
-    VPValue *LoopExitingDef = getOperand(1);
     Type *PhiTy = OrigPhi->getType();
-    VectorParts RdxParts(State.UF);
-    for (unsigned Part = 0; Part < State.UF; ++Part)
-      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
+    VectorParts RdxParts(NumParts);
+    for (unsigned Part = 0; Part != NumParts; ++Part)
+      RdxParts[Part] = State.get(getOperand(1 + Part), 0, PhiR->isInLoop());
 
     // If the vector reduction can be performed in a smaller type, we truncate
     // then extend the loop exit value to enable InstCombine to evaluate the
@@ -567,7 +566,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     // TODO: Handle this in truncateToMinBW.
     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
-      for (unsigned Part = 0; Part < State.UF; ++Part)
+      for (unsigned Part = 0; Part < NumParts; ++Part)
         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
     }
     // Reduce all of the unrolled parts into a single vector.
@@ -577,12 +576,12 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
       Op = Instruction::Or;
 
     if (PhiR->isOrdered()) {
-      ReducedPartRdx = RdxParts[State.UF - 1];
+      ReducedPartRdx = RdxParts[NumParts - 1];
     } else {
       // Floating-point operations should have some FMF to enable the reduction.
       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-      for (unsigned Part = 1; Part < State.UF; ++Part) {
+      for (unsigned Part = 1; Part < NumParts; ++Part) {
         Value *RdxPart = RdxParts[Part];
         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
           ReducedPartRdx = Builder.CreateBinOp(
@@ -688,6 +687,12 @@ bool VPInstruction::isSingleScalar() const {
   return getOpcode() == VPInstruction::ResumePhi;
 }
 
+unsigned VPInstruction::getInterleaveCount() const {
+  return getNumOperands() == 1
+             ? 1
+             : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
+                   ->getZExtValue();
+}
 #if !defined(NDEBUG)
 bool VPInstruction::isFPMathOp() const {
   // Inspired by FPMathOperator::classof. Notable differences are that we don't
@@ -1305,24 +1310,32 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
     MulOp = Instruction::FMul;
   }
 
-  // Multiply the vectorization factor by the step using integer or
-  // floating-point arithmetic as appropriate.
-  Type *StepType = Step->getType();
-  Value *RuntimeVF;
-  if (Step->getType()->isFloatingPointTy())
-    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
-  else
-    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
-  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
-  // Create a vector splat to use in the induction update.
-  //
-  // FIXME: If the step is non-constant, we create the vector splat with
-  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
-  //        handle a constant vector splat.
-  Value *SplatVF = isa<Constant>(Mul)
-                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
-                       : Builder.CreateVectorSplat(State.VF, Mul);
+  Value *SplatVF;
+  if (getNumOperands() == 4) {
+    // Reuse the scaled step created in the preheader during interleaving.
+    SplatVF = State.get(getOperand(2), 0);
+  } else {
+
+    // Multiply the vectorization factor by the step using integer or
+    // floating-point arithmetic as appropriate.
+    Type *StepType = Step->getType();
+    Value *RuntimeVF;
+    if (Step->getType()->isFloatingPointTy())
+      RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
+    else
+      RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
+    Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
+
+    // Create a vector splat to use in the induction update.
+    //
+    // FIXME: If the step is non-constant, we create the vector splat with
+    //        IRBuilder. IRBuilder can constant-fold the multiply, but it
+    //        doesn't handle a constant vector splat.
+    SplatVF = isa<Constant>(Mul)
+                  ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
+                  : Builder.CreateVectorSplat(State.VF, Mul);
+  }
+
   Builder.restoreIP(CurrIP);
 
   // We may need to add the step a number of times, depending on the unroll
@@ -1452,7 +1465,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
     EndLane = StartLane + 1;
   }
   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
-    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
+    Value *StartIdx0 =
+        createStepForVF(Builder, IntStepTy, State.VF, getPartForRecipe());
 
     if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
@@ -1485,6 +1499,13 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 }
 
+unsigned VPScalarIVStepsRecipe::getPartForRecipe() const {
+  return getNumOperands() == 2
+             ? 0
+             : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                   ->getZExtValue();
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
@@ -1586,6 +1607,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
+  unsigned CurrentPart = getPartForRecipe();
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // Calculate the pointer for the specific unroll-part.
     Value *PartPtr = nullptr;
@@ -1593,7 +1615,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
     // or query DataLayout for a more suitable index type otherwise.
     const DataLayout &DL =
         Builder.GetInsertBlock()->getDataLayout();
-    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+    Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0)
                         ? DL.getIndexType(IndexedTy->getPointerTo())
                         : Builder.getInt32Ty();
     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
@@ -1604,16 +1626,17 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
       // RunTimeVF =  VScale * VF.getKnownMinValue()
       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
-      // NumElt = -Part * RunTimeVF
+      // NumElt = -CurrentPart * RunTimeVF
       Value *NumElt = Builder.CreateMul(
-          ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
+          ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
       // LastLane = 1 - RunTimeVF
       Value *LastLane =
           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
     } else {
-      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+      Value *Increment =
+          createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
     }
 
@@ -1621,6 +1644,13 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   }
 }
 
+unsigned VPVectorPointerRecipe::getPartForRecipe() const {
+  return getNumOperands() == 1
+             ? 0
+             : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
+                   ->getZExtValue();
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
@@ -2564,6 +2594,14 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
   O << " = WIDEN-POINTER-INDUCTION ";
   getStartValue()->printAsOperand(O, SlotTracker);
   O << ", " << *IndDesc.getStep();
+  if (getNumOperands() == 5) {
+    O << ", ";
+    getOperand(2)->printAsOperand(O, SlotTracker);
+    O << ", ";
+    getOperand(3)->printAsOperand(O, SlotTracker);
+    O << ", ";
+    getOperand(4)->printAsOperand(O, SlotTracker);
+  }
 }
 #endif
 
@@ -2599,7 +2637,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
                       ? CanonicalIV
                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
-    Value *VStep = createStepForVF(Builder, STy, VF, Part);
+    Value *VStep = createStepForVF(Builder, STy, VF, getPartForRecipe());
     if (VF.isVector()) {
       VStep = Builder.CreateVectorSplat(VF, VStep);
       VStep =
@@ -2610,6 +2648,13 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
   }
 }
 
+unsigned VPWidenCanonicalIVRecipe::getPartForRecipe() const {
+  return getNumOperands() == 1
+             ? 0
+             : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
+                   ->getZExtValue();
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
@@ -2688,6 +2733,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
 
   Value *Iden = nullptr;
   RecurKind RK = RdxDesc.getRecurrenceKind();
+  unsigned CurrentPart = getPartForRecipe();
+
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
     // MinMax and AnyOf reductions have the start value as their identity.
@@ -2704,11 +2751,15 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
                                          RdxDesc.getFastMathFlags());
 
     if (!ScalarPHI) {
-      Iden = Builder.CreateVectorSplat(State.VF, Iden);
-      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
-      Builder.SetInsertPoint(VectorPH->getTerminator());
-      Constant *Zero = Builder.getInt32(0);
-      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+      Iden = Builder.CreateVectorSplat(State.VF, Iden);
+      // Only part 0 folds the start value into lane 0 of the splatted
+      // identity; other parts start from the identity value directly.
+      if (CurrentPart == 0) {
+        IRBuilderBase::InsertPointGuard IPBuilder(Builder);
+        Builder.SetInsertPoint(VectorPH->getTerminator());
+        Constant *Zero = Builder.getInt32(0);
+        StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+      }
     }
   }
 
@@ -2716,11 +2767,18 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
     Value *EntryPart = State.get(this, Part, IsInLoop);
     // Make sure to add the reduction start value only to the
     // first unroll part.
-    Value *StartVal = (Part == 0) ? StartV : Iden;
+    Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
   }
 }
 
+unsigned VPReductionPHIRecipe::getPartForRecipe() const {
+  return getNumOperands() == 2
+             ? 0
+             : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                   ->getZExtValue();
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 045f6c356669fa..7a998c726f3290 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1622,3 +1622,421 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
     }
   }
 }
+
+static VPValue *getInterleavedValue(
+    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues, VPValue *V,
+    unsigned Part) {
+  if (Part == 0)
+    return V;
+  if (V->isLiveIn())
+    return V;
+  return InterleavedValues[V][Part - 1];
+}
+
+static void interleaveReplicateRegion(
+    VPRegionBlock *VPR, VPlan &Plan, unsigned IC,
+    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues) {
+  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+  VPBlockBase *InsertPt = VPR;
+  for (unsigned I = 1; I != IC; ++I) {
+    auto *Copy = VPR->clone();
+    VPBlockUtils::insertBlockAfter(Copy, InsertPt);
+    InsertPt = Copy;
+
+    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+        RPOT(Copy->getEntry());
+    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+        RPOT2(VPR->getEntry());
+    for (const auto &[New, Old] :
+         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT),
+             VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT2))) {
+      if (New->getParent() != Copy)
+        break;
+      for (const auto &[CopyR, OrigR] : zip(*New, *Old)) {
+        for (unsigned Idx = 0; Idx != CopyR.getNumOperands(); ++Idx) {
+          CopyR.setOperand(Idx, getInterleavedValue(InterleavedValues,
+                                                    CopyR.getOperand(Idx), I));
+        }
+        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&CopyR)) {
+          ScalarIVSteps->addOperand(
+              Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+        }
+
+        unsigned Idx = 0;
+        for (VPValue *Res : OrigR.definedValues()) {
+          auto Ins = InterleavedValues.insert({Res, {}});
+          Ins.first->second.push_back(CopyR.getVPValue(Idx));
+          Idx++;
+        }
+      }
+    }
+  }
+}
+
+static void interleaveHeaderPHI(
+    VPRecipeBase &R, VPlan &Plan, unsigned IC,
+    VPBasicBlock::iterator &InsertPtForPhi,
+    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
+    VPTypeAnalysis &TypeInfo, SmallPtrSet<VPRecipeBase *, 8> &ToSkip,
+    SmallVector<SmallVector<VPHeaderPHIRecipe *>> &PhisToRemap) {
+  if (isa<VPFirstOrderRecurrencePHIRecipe>(&R))
+    return;
+
+  // Generate step vectors for each unrolled part.
+  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+    VPBasicBlock *PH =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
+    VPValue *Step = &Plan.getVF();
+    Type *IVTy = TypeInfo.inferScalarType(IV);
+    auto &ID = IV->getInductionDescriptor();
+    FastMathFlags FMFs;
+    if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+      FMFs = ID.getInductionBinOp()->getFastMathFlags();
+
+    if (TypeInfo.inferScalarType(Step) != IVTy) {
+      Instruction::CastOps CastOp;
+      if (IVTy->isFloatingPointTy())
+        CastOp = Instruction::UIToFP;
+      else
+        CastOp = Instruction::Trunc;
+      Step = new VPWidenCastRecipe(CastOp, Step, IV->getScalarType());
+      PH->appendRecipe(Step->getDefiningRecipe());
+      ToSkip.insert(Step->getDefiningRecipe());
+    }
+
+    auto *ConstScale =
+        IV->getOperand(1)->isLiveIn()
+            ? dyn_cast<ConstantInt>(IV->getOperand(1)->getLiveInIRValue())
+            : nullptr;
+    if (!ConstScale || ConstScale->getZExtValue() != 1) {
+      VPValue *Scale = IV->getOperand(1);
+      if (TypeInfo.inferScalarType(Scale) != IVTy) {
+        Scale = new VPWidenCastRecipe(Instruction::Trunc, Scale,
+                                      IV->getScalarType());
+        PH->appendRecipe(Scale->getDefiningRecipe());
+        ToSkip.insert(Scale->getDefiningRecipe());
+      }
+
+      VPBuilder Builder(PH);
+      VPInstruction *Mul;
+      if (IVTy->isFloatingPointTy())
+        Mul = Builder.createFPOp(Instruction::FMul, {Step, Scale},
+                                 R.getDebugLoc(), "", FMFs);
+      else
+        Mul = Builder.createNaryOp(Instruction::Mul, {Step, Scale},
+                                   R.getDebugLoc());
+      Step = Mul;
+      ToSkip.insert(Mul);
+    }
+    R.addOperand(Step);
+
+    for (unsigned I = 1; I != IC; ++I) {
+      VPBuilder Builder;
+      Builder.setInsertPoint(R.getParent(), InsertPtForPhi);
+      auto Ins = InterleavedValues.insert({IV, {}});
+      VPValue *Prev = getInterleavedValue(InterleavedValues, IV, I - 1);
+      VPInstruction *Add;
+      std::string Name = I > 1 ? "step.add." + std::to_string(I) : "step.add";
+
+      if (IVTy->isFloatingPointTy())
+        Add = Builder.createFPOp(ID.getInductionOpcode(),
+                                 {
+                                     Prev,
+                                     Step,
+                                 },
+                                 R.getDebugLoc(), Name, FMFs);
+      else
+        Add = Builder.createNaryOp(Instruction::Add,
+                                   {
+                                       Prev,
+                                       Step,
+                                   },
+                                   R.getDebugLoc(), Name);
+      ToSkip.insert(Add);
+      Ins.first->second.push_back(Add);
+      InsertPtForPhi = std::next(Add->getIterator());
+    }
+    R.addOperand(getInterleavedValue(InterleavedValues, IV, IC - 1));
+    return;
+  }
+
+  VPRecipeBase *InsertPt = &R;
+  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+  for (unsigned I = 1; I != IC; ++I) {
+    VPRecipeBase *Copy = R.clone();
+    Copy->insertAfter(InsertPt);
+    InsertPt = Copy;
+    unsigned Idx = 0;
+    for (VPValue *Res : R.definedValues()) {
+      auto Ins = InterleavedValues.insert({Res, {}});
+      Ins.first->second.push_back(Copy->getVPValue(Idx));
+      Idx++;
+    }
+    if (isa<VPWidenPointerInductionRecipe>(&R)) {
+      if (I == 1)
+        R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
+      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
+      Copy->addOperand(R.getVPSingleValue());
+      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+      continue;
+    }
+
+    if (auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+      if (RdxPhi->isOrdered()) {
+        Copy->eraseFromParent();
+        break;
+      }
+      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+    }
+
+    if (I == 1)
+      PhisToRemap.emplace_back();
+
+    auto *H = cast<VPHeaderPHIRecipe>(Copy);
+    PhisToRemap.back().push_back(H);
+  }
+}
+
+static void
+interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
+                 DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
+                 VPTypeAnalysis &TypeInfo) {
+  using namespace llvm::VPlanPatternMatch;
+  VPValue *Op1;
+  if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
+                    m_VPValue(), m_VPValue(Op1)))) {
+    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
+    for (unsigned I = 1; I != IC; ++I) {
+      R.addOperand(getInterleavedValue(InterleavedValues, Op1, I));
+      Ins.first->second.push_back(R.getVPSingleValue());
+    }
+    return;
+  }
+  VPValue *Op0;
+  if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(m_VPValue(Op0),
+                                                               m_VPValue()))) {
+    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
+    for (unsigned I = 1; I != IC; ++I) {
+      Ins.first->second.push_back(R.getVPSingleValue());
+    }
+
+    bool ScalarVFOnly = Plan.hasScalarVFOnly();
+    if (!ScalarVFOnly) {
+      R.setOperand(0, getInterleavedValue(InterleavedValues, Op0, IC - 1));
+      return;
+    }
+  }
+
+  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+  if (isa<VPInstruction>(&R) && cast<VPInstruction>(&R)->getOpcode() ==
+                                    VPInstruction::CalculateTripCountMinusVF) {
+    R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
+    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
+    for (unsigned I = 1; I != IC; ++I) {
+      Ins.first->second.push_back(R.getVPSingleValue());
+    }
+
+    return;
+  }
+
+  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+    if (VPI->getOpcode() == VPInstruction::BranchOnCount ||
+        VPI->getOpcode() == VPInstruction::BranchOnCond)
+      return;
+  }
+
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
+        RepR->getOperand(1)->isDefinedOutsideVectorRegions()) {
+      R.setOperand(
+          0, getInterleavedValue(InterleavedValues, R.getOperand(0), IC - 1));
+      return;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
+      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
+        auto Ins = InterleavedValues.insert({RepR, {}});
+        Ins.first->second.push_back(RepR);
+        return;
+      }
+    }
+  }
+
+  // TODO: Generalize for any uniform recipe.
+  if (auto *Cast = dyn_cast<VPWidenCastRecipe>(&R)) {
+    if (Cast->getOperand(0)->isLiveIn()) {
+      auto Ins = InterleavedValues.insert({Cast, {}});
+      Ins.first->second.push_back(Cast);
+      return;
+    }
+  }
+
+  if (isa<VPInstruction>(&R) &&
+      vputils::onlyFirstPartUsed(R.getVPSingleValue())) {
+    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
+    for (unsigned I = 1; I != IC; ++I) {
+      Ins.first->second.push_back(R.getVPSingleValue());
+    }
+
+    return;
+  }
+
+  VPRecipeBase *InsertPt = &R;
+  for (unsigned I = 1; I != IC; ++I) {
+    VPRecipeBase *Copy = R.clone();
+    Copy->insertAfter(InsertPt);
+    InsertPt = Copy;
+    unsigned Idx = 0;
+    for (VPValue *Res : R.definedValues()) {
+      auto Ins = InterleavedValues.insert({Res, {}});
+      Ins.first->second.push_back(Copy->getVPValue(Idx));
+      Idx++;
+    }
+
+    if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+      if (VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+        Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+      }
+      if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
+        Copy->setOperand(
+            0, getInterleavedValue(InterleavedValues, R.getOperand(1), I - 1));
+        Copy->setOperand(
+            1, getInterleavedValue(InterleavedValues, R.getOperand(1), I));
+        continue;
+      }
+    }
+    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
+      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
+      if (Phi->isOrdered()) {
+        auto Ins = InterleavedValues.insert({Phi, {}});
+        if (I == 1) {
+          Ins.first->second.clear();
+          Ins.first->second.push_back(Red);
+        }
+        Ins.first->second.push_back(Copy->getVPSingleValue());
+        Phi->setOperand(1, Copy->getVPSingleValue());
+      }
+    }
+    for (unsigned Idx = 0; Idx != Copy->getNumOperands(); ++Idx)
+      Copy->setOperand(Idx, getInterleavedValue(InterleavedValues,
+                                                Copy->getOperand(Idx), I));
+
+    // Add operand indicating the part to generate code for to recipes still
+    // requiring it.
+    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
+            VPVectorPointerRecipe>(Copy))
+      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+
+    if (isa<VPVectorPointerRecipe>(R))
+      Copy->setOperand(0, R.getOperand(0));
+  }
+}
+
+static void
+interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
+                DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
+                VPTypeAnalysis &TypeInfo,
+                SmallPtrSet<VPRecipeBase *, 8> &ToSkip,
+                SmallVector<SmallVector<VPHeaderPHIRecipe *>> &PhisToRemap) {
+  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
+  if (VPR) {
+    if (VPR->isReplicator())
+      interleaveReplicateRegion(VPR, Plan, IC, InterleavedValues);
+    else {
+      ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+          RPOT(VPR->getEntry());
+      for (VPBlockBase *VPB : RPOT) {
+        interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip,
+                        PhisToRemap);
+      }
+    }
+    return;
+  }
+
+  auto *VPBB = cast<VPBasicBlock>(VPB);
+  auto InsertPtForPhi = VPBB->getFirstNonPhi();
+  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+    if (ToSkip.contains(&R))
+      continue;
+
+    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
+    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
+      for (unsigned I = 1; I != IC; ++I) {
+        auto Ins = InterleavedValues.insert({SingleDef, {}});
+        Ins.first->second.push_back(SingleDef);
+      }
+      continue;
+    }
+
+    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
+      interleaveHeaderPHI(R, Plan, IC, InsertPtForPhi, InterleavedValues,
+                          TypeInfo, ToSkip, PhisToRemap);
+      continue;
+    }
+
+    interleaveRecipe(R, Plan, IC, InterleavedValues, TypeInfo);
+  }
+}
+
+void VPlanTransforms::interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx) {
+  assert(IC > 0 && "Interleave count must be at least 1");
+  if (IC == 1)
+    return;
+  DenseMap<VPValue *, SmallVector<VPValue *>> InterleavedValues;
+
+  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
+
+  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+  VPTypeAnalysis TypeInfo(CanIVIntTy, Ctx);
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  SmallVector<SmallVector<VPHeaderPHIRecipe *>> PhisToRemap;
+  interleaveBlock(Plan.getPreheader(), Plan, IC, InterleavedValues, TypeInfo,
+                  ToSkip, PhisToRemap);
+
+  for (VPBlockBase *VPB : RPOT) {
+    interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip,
+                    PhisToRemap);
+  }
+
+  for (auto &R : PhisToRemap) {
+    unsigned I = 1;
+    for (VPHeaderPHIRecipe *H : R) {
+      for (unsigned Idx = 0; Idx != H->getNumOperands(); ++Idx)
+        H->setOperand(
+            Idx, getInterleavedValue(InterleavedValues, H->getOperand(Idx), I));
+      I++;
+    }
+  }
+
+  for (VPRecipeBase &H :
+       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+    if (!isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
+      continue;
+    }
+    H.setOperand(
+        1, getInterleavedValue(InterleavedValues, H.getOperand(1), IC - 1));
+  }
+
+  using namespace llvm::VPlanPatternMatch;
+  bool ScalarVFOnly = Plan.hasScalarVFOnly();
+  for (const auto &[_, LO] : Plan.getLiveOuts()) {
+    VPValue *In = nullptr;
+    VPValue *Op0;
+    if (ScalarVFOnly &&
+        match(LO->getOperand(0), m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                                     m_VPValue(Op0), m_VPValue()))) {
+      VPInstruction *Extract =
+          cast<VPInstruction>(LO->getOperand(0)->getDefiningRecipe());
+      unsigned Offset =
+          cast<ConstantInt>(Extract->getOperand(1)->getLiveInIRValue())
+              ->getZExtValue();
+      In = getInterleavedValue(InterleavedValues, Op0, IC - Offset);
+      LO->setOperand(0, In);
+      Extract->getDefiningRecipe()->eraseFromParent();
+      continue;
+    }
+    In = getInterleavedValue(InterleavedValues, LO->getOperand(0), IC - 1);
+
+    LO->setOperand(0, In);
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c2..67d22ce46b6d92 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -106,6 +106,10 @@ struct VPlanTransforms {
   /// this transformation.
   /// \returns true if the transformation succeeds, or false if it doesn't.
   static bool tryAddExplicitVectorLength(VPlan &Plan);
+
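+  /// Explicitly interleave \p Plan by \p IC, creating recipes for all
+  /// interleaved parts up front rather than during VPlan execution.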
+  static void interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 452c977106a773..d9668a985c0907 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -183,7 +183,7 @@ class VPValue {
   /// Returns true if the VPValue is defined outside any vector regions, i.e. it
   /// is a live-in value.
   /// TODO: Also handle recipes defined in pre-header blocks.
-  bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); }
+  bool isDefinedOutsideVectorRegions() const;
 
   // Set \p Val as the underlying Value of this VPValue.
   void setUnderlyingValue(Value *Val) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
index 22aaa563daa5ad..b784c465f878e7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
@@ -103,10 +103,10 @@ for.end:                                          ; preds = %for.body
 
 ; CHECK-LABEL: @ptr_ind_plus2(
 ; CHECK: %[[V0:.*]] = load <8 x i32>
-; CHECK: %[[V1:.*]] = load <8 x i32>
 ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[V1:.*]] = load <8 x i32>
+; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK: mul nsw <4 x i32>
 ; CHECK: mul nsw <4 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
index 2a80a7affa4f8c..fd0dc3304f2166 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
@@ -294,9 +294,9 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 812af1a102083f..5dfda70bd41386 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -19,7 +19,7 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK:         [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i7> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i7> [[TMP8]], shufflevector (<vscale x 2 x i7> insertelement (<vscale x 2 x i7> poison, i7 1, i64 0), <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer)
@@ -93,7 +93,7 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK:         [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i3> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i3> [[TMP8]], shufflevector (<vscale x 2 x i3> insertelement (<vscale x 2 x i3> poison, i3 1, i64 0), <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 76084776b2b765..a9886b43df5517 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -37,31 +37,6 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 5
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP7]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC3]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP16]], 4
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[DOTIDX]]
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP14]], ptr [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[DOTIDX5:%.*]] = shl nuw nsw i64 [[TMP21]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[DOTIDX5]]
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP18]], ptr [[TMP20]], align 4
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP19]], ptr [[TMP22]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 87bc77cb7767ff..79c7e4b64c30bd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -1227,12 +1227,12 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]]
 ; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
 ; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
-; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; FIXED-NEXT:    [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
 ; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
@@ -1415,12 +1415,12 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
 ; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
 ; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8
-; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; FIXED-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
 ; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 0ecba2f304682c..2b48cdd890a5f7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -967,9 +967,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; FIXEDLEN:       vector.body:
 ; FIXEDLEN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXEDLEN-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; FIXEDLEN-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; FIXEDLEN-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; FIXEDLEN-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; FIXEDLEN-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; FIXEDLEN-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; FIXEDLEN-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 895c89b768acb3..d0c04e70de9167 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -28,14 +28,13 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; IF-EVL-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i64> [[TMP11]], zeroinitializer
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = mul <vscale x 4 x i64> [[TMP12]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
 ; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
-; IF-EVL-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; IF-EVL-NEXT:    [[TMP37:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP15]], i64 0
 ; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -110,35 +109,6 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-VP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; NO-VP-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
-; NO-VP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; NO-VP-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
-; NO-VP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP6]], i32 0
-; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0
-; NO-VP-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; NO-VP-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-VP-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
-; NO-VP-NEXT:    [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
-; NO-VP-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; NO-VP-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; NO-VP-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; NO-VP-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
-; NO-VP-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
-; NO-VP-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
-; NO-VP-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP17]], [[TMP16]]
-; NO-VP-NEXT:    [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
-; NO-VP-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
-; NO-VP-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
-; NO-VP-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; NO-VP-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
-; NO-VP-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
-; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
-; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
-; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-VP-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NO-VP-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; NO-VP:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index c1be67853bf7cc..ba94663178bf42 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -143,15 +143,17 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[L]], 64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[L]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = mul <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, [[TMP2]]
+
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, [[DOTSPLAT]]
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <16 x i16> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[TMP0]], 16
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP2]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer
+
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index 6ac1e446d13add..b02826643ac669 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -13,22 +13,6 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 ; SSE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SSE:       vector.body:
 ; SSE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SSE-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 1
-; SSE-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 8
-; SSE-NEXT:    [[DOTIDX:%.*]] = shl nsw i64 [[INDEX]], 3
-; SSE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX]]
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
-; SSE-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
-; SSE-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
-; SSE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; SSE-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16
-; SSE-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
 ; SSE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; SSE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -49,38 +33,6 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 ; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX1:       vector.body:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX1-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 1
-; AVX1-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 8
-; AVX1-NEXT:    [[TMP2:%.*]] = shl i64 [[INDEX]], 1
-; AVX1-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 16
-; AVX1-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
-; AVX1-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP4]], 24
-; AVX1-NEXT:    [[DOTIDX:%.*]] = shl nsw i64 [[INDEX]], 3
-; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX]]
-; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
-; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP5]]
-; AVX1-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
-; AVX1-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
-; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4
-; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
-; AVX1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
-; AVX1-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
-; AVX1-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
-; AVX1-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
-; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; AVX1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 16
-; AVX1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 32
-; AVX1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 48
-; AVX1-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP14]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP15]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP12]], ptr [[TMP16]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP13]], ptr [[TMP17]], align 4
@@ -103,38 +55,6 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 ; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX2:       vector.body:
 ; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX2-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 1
-; AVX2-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 16
-; AVX2-NEXT:    [[TMP2:%.*]] = shl i64 [[INDEX]], 1
-; AVX2-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 32
-; AVX2-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
-; AVX2-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP4]], 48
-; AVX2-NEXT:    [[DOTIDX:%.*]] = shl nsw i64 [[INDEX]], 3
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP5]]
-; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
-; AVX2-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
-; AVX2-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4
-; AVX2-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
-; AVX2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; AVX2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; AVX2-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; AVX2-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; AVX2-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
-; AVX2-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
-; AVX2-NEXT:    [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
-; AVX2-NEXT:    [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 32
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 64
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 96
-; AVX2-NEXT:    store <8 x i32> [[TMP10]], ptr [[TMP14]], align 4
 ; AVX2-NEXT:    store <8 x i32> [[TMP11]], ptr [[TMP15]], align 4
 ; AVX2-NEXT:    store <8 x i32> [[TMP12]], ptr [[TMP16]], align 4
 ; AVX2-NEXT:    store <8 x i32> [[TMP13]], ptr [[TMP17]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index 7cbf0ab0252061..7b16665a416d44 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -101,24 +101,24 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; SSE41-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP2]]
 ; SSE41-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP3]]
 ; SSE41-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
-; SSE41-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; SSE41-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2
-; SSE41-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2
 ; SSE41-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE41-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE41-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; SSE41-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2
+; SSE41-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; SSE41-NEXT:    [[TMP8:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
 ; SSE41-NEXT:    [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32>
 ; SSE41-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]]
 ; SSE41-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]]
 ; SSE41-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; SSE41-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
 ; SSE41-NEXT:    [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
-; SSE41-NEXT:    [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2
 ; SSE41-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE41-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE41-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; SSE41-NEXT:    [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2
+; SSE41-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE41-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; SSE41-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
 ; SSE41-NEXT:    [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
@@ -203,20 +203,20 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]]
 ; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]]
 ; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
-; AVX1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
-; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; AVX1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
 ; AVX1-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
-; AVX1-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2
-; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2
-; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
@@ -227,20 +227,20 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]]
 ; AVX1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]]
 ; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[TMP20]], i32 0
-; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0
-; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
-; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0
 ; AVX1-NEXT:    [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2
-; AVX1-NEXT:    [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2
-; AVX1-NEXT:    [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2
-; AVX1-NEXT:    [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0
+; AVX1-NEXT:    [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
index aaaea2f39c2c85..08a272283594c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -214,28 +214,19 @@ define void @uniform_store_varying_value(ptr align(4) %addr) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP0]], 5
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP0]], 6
-; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP0]], 7
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP0]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP0]], 9
-; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP0]], 10
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP0]], 11
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP0]], 12
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP0]], 13
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP0]], 14
-; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP0]], 15
-; CHECK-NEXT:    store i32 [[TMP16]], ptr [[ADDR:%.*]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 16
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 12
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 13
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP0]], 14
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP0]], 15
+; CHECK-NEXT:    store i32 [[TMP7]], ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 10b5aa64c180a0..1c59419bd3f7c7 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -2754,66 +2754,66 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if5:
+; UNROLL-NO-IC:       pred.udiv.if7:
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = udiv i32 219220132, [[TMP10]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
-; UNROLL-NO-IC:       pred.udiv.continue6:
+; UNROLL-NO-IC:       pred.udiv.continue8:
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF5]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if7:
+; UNROLL-NO-IC:       pred.udiv.if9:
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], -2
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = udiv i32 219220132, [[TMP15]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
-; UNROLL-NO-IC:       pred.udiv.continue8:
+; UNROLL-NO-IC:       pred.udiv.continue10:
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if9:
+; UNROLL-NO-IC:       pred.udiv.if11:
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP20]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
-; UNROLL-NO-IC:       pred.udiv.continue10:
+; UNROLL-NO-IC:       pred.udiv.continue12:
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if11:
+; UNROLL-NO-IC:       pred.udiv.if13:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], -4
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = udiv i32 219220132, [[TMP25]]
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
-; UNROLL-NO-IC:       pred.udiv.continue12:
+; UNROLL-NO-IC:       pred.udiv.continue14:
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP29]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if13:
+; UNROLL-NO-IC:       pred.udiv.if15:
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], -5
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = udiv i32 219220132, [[TMP30]]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
-; UNROLL-NO-IC:       pred.udiv.continue14:
+; UNROLL-NO-IC:       pred.udiv.continue16:
 ; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if15:
+; UNROLL-NO-IC:       pred.udiv.if17:
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = add i32 [[OFFSET_IDX]], -6
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = udiv i32 219220132, [[TMP35]]
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
-; UNROLL-NO-IC:       pred.udiv.continue16:
+; UNROLL-NO-IC:       pred.udiv.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
-; UNROLL-NO-IC:       pred.udiv.if17:
+; UNROLL-NO-IC:       pred.udiv.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
-; UNROLL-NO-IC:       pred.udiv.continue18:
+; UNROLL-NO-IC:       pred.udiv.continue20:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -3064,59 +3064,59 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if4:
+; UNROLL-NO-IC:       pred.udiv.if3:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = udiv i32 219220132, [[TMP3]]
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP17]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
-; UNROLL-NO-IC:       pred.udiv.continue5:
+; UNROLL-NO-IC:       pred.udiv.continue4:
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF4]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP20]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if6:
+; UNROLL-NO-IC:       pred.udiv.if5:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP21]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
-; UNROLL-NO-IC:       pred.udiv.continue7:
+; UNROLL-NO-IC:       pred.udiv.continue6:
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF6]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if8:
+; UNROLL-NO-IC:       pred.udiv.if7:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = udiv i32 219220132, [[TMP5]]
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
-; UNROLL-NO-IC:       pred.udiv.continue9:
+; UNROLL-NO-IC:       pred.udiv.continue8:
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if10:
+; UNROLL-NO-IC:       pred.udiv.if9:
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = udiv i32 219220132, [[TMP6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
-; UNROLL-NO-IC:       pred.udiv.continue11:
+; UNROLL-NO-IC:       pred.udiv.continue10:
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP30]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP32]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if12:
+; UNROLL-NO-IC:       pred.udiv.if11:
 ; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = udiv i32 219220132, [[TMP7]]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP33]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
-; UNROLL-NO-IC:       pred.udiv.continue13:
+; UNROLL-NO-IC:       pred.udiv.continue12:
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP34]], [[PRED_UDIV_IF12]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP36]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if14:
+; UNROLL-NO-IC:       pred.udiv.if13:
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = udiv i32 219220132, [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP37]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
-; UNROLL-NO-IC:       pred.udiv.continue15:
+; UNROLL-NO-IC:       pred.udiv.continue14:
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP38]], [[PRED_UDIV_IF14]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP40]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if16:
+; UNROLL-NO-IC:       pred.udiv.if15:
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP41]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
-; UNROLL-NO-IC:       pred.udiv.continue17:
+; UNROLL-NO-IC:       pred.udiv.continue16:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP42]], [[PRED_UDIV_IF16]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP27]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP27]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -3132,60 +3132,60 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC:       pred.store.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP51]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
-; UNROLL-NO-IC:       pred.store.if18:
+; UNROLL-NO-IC:       pred.store.if17:
 ; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = add i32 [[INDEX]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP52]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP53]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE19]]
-; UNROLL-NO-IC:       pred.store.continue19:
+; UNROLL-NO-IC:       pred.store.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
-; UNROLL-NO-IC:       pred.store.if20:
+; UNROLL-NO-IC:       pred.store.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP55]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP4]], ptr [[TMP56]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE21]]
-; UNROLL-NO-IC:       pred.store.continue21:
+; UNROLL-NO-IC:       pred.store.continue20:
 ; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; UNROLL-NO-IC:       pred.store.if22:
+; UNROLL-NO-IC:       pred.store.if21:
 ; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = add i32 [[INDEX]], 3
 ; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP58]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP5]], ptr [[TMP59]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; UNROLL-NO-IC:       pred.store.continue23:
+; UNROLL-NO-IC:       pred.store.continue22:
 ; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; UNROLL-NO-IC:       pred.store.if24:
+; UNROLL-NO-IC:       pred.store.if23:
 ; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = add i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP61]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP6]], ptr [[TMP62]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; UNROLL-NO-IC:       pred.store.continue25:
+; UNROLL-NO-IC:       pred.store.continue24:
 ; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP63]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; UNROLL-NO-IC:       pred.store.if26:
+; UNROLL-NO-IC:       pred.store.if25:
 ; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = add i32 [[INDEX]], 5
 ; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP64]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP7]], ptr [[TMP65]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; UNROLL-NO-IC:       pred.store.continue27:
+; UNROLL-NO-IC:       pred.store.continue26:
 ; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; UNROLL-NO-IC:       pred.store.if28:
+; UNROLL-NO-IC:       pred.store.if27:
 ; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = add i32 [[INDEX]], 6
 ; UNROLL-NO-IC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP67]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP8]], ptr [[TMP68]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; UNROLL-NO-IC:       pred.store.continue29:
+; UNROLL-NO-IC:       pred.store.continue28:
 ; UNROLL-NO-IC-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
-; UNROLL-NO-IC:       pred.store.if30:
+; UNROLL-NO-IC:       pred.store.if29:
 ; UNROLL-NO-IC-NEXT:    [[TMP70:%.*]] = add i32 [[INDEX]], 7
 ; UNROLL-NO-IC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP70]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP9]], ptr [[TMP71]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; UNROLL-NO-IC:       pred.store.continue31:
+; UNROLL-NO-IC:       pred.store.continue30:
 ; UNROLL-NO-IC-NEXT:    [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI3]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index bd658c31768a84..b9328a927b1fd7 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -87,15 +87,15 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N)
 ; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]]
+; VEC4_INTERL2-NEXT:    [[FPINC_INS:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0
+; VEC4_INTERL2-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[FPINC_INS]], <float 4.000000e+00, float poison, float poison, float poison>
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; VEC4_INTERL2-NEXT:    [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP2]]
-; VEC4_INTERL2-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 4.000000e+00
-; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
-; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL2:       vector.body:
 ; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -334,15 +334,15 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32
 ; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]]
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]]
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0
+; VEC4_INTERL2-NEXT:    [[MUL:%.*]] = fmul reassoc <4 x float> [[DOTSPLATINSERT2]], <float 4.000000e+00, float poison, float poison, float poison>
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[MUL]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[TMP2:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; VEC4_INTERL2-NEXT:    [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP2]]
-; VEC4_INTERL2-NEXT:    [[TMP3:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00
-; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
-; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL2:       vector.body:
 ; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -841,29 +841,27 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC4_INTERL2-NEXT:    [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]]
 ; VEC4_INTERL2-NEXT:    [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]]
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; VEC4_INTERL2-NEXT:    [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST]], <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; VEC4_INTERL2-NEXT:    [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]]
-; VEC4_INTERL2-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP0]], 4.000000e+00
-; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
-; VEC4_INTERL2-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> poison, <4 x i32> zeroinitializer
-; VEC4_INTERL2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
-; VEC4_INTERL2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL2:       vector.body:
 ; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VEC4_INTERL2-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 0x3FB99999A0000000, float 0xBFD99999A0000000, float 0xBFECCCCCC0000000, float 0xBFF6666660000000>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VEC4_INTERL2-NEXT:    [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VECTOR_BODY]] ]
-; VEC4_INTERL2-NEXT:    [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]]
+; VEC4_INTERL2-NEXT:    [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT5]]
 ; VEC4_INTERL2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
 ; VEC4_INTERL2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16
 ; VEC4_INTERL2-NEXT:    store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4
 ; VEC4_INTERL2-NEXT:    store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4
-; VEC4_INTERL2-NEXT:    [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]]
-; VEC4_INTERL2-NEXT:    [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT]]
+; VEC4_INTERL2-NEXT:    [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST]]
+; VEC4_INTERL2-NEXT:    [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST]]
 ; VEC4_INTERL2-NEXT:    [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
 ; VEC4_INTERL2-NEXT:    [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -2.500000e+00, float -2.500000e+00, float -2.500000e+00, float -2.500000e+00>
 ; VEC4_INTERL2-NEXT:    [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]]
@@ -878,7 +876,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC4_INTERL2-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP17]], align 4
 ; VEC4_INTERL2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; VEC4_INTERL2-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -4.000000e+00, float -4.000000e+00, float -4.000000e+00, float -4.000000e+00>
-; VEC4_INTERL2-NEXT:    [[VEC_IND_NEXT13]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT9]]
+; VEC4_INTERL2-NEXT:    [[VEC_IND_NEXT13]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT5]]
 ; VEC4_INTERL2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; VEC4_INTERL2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VEC4_INTERL2:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 45674acaae5387..36b5e31329bacc 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1174,14 +1174,14 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) {
 ; INTERLEAVE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
 ; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP6]], align 4
-; INTERLEAVE-NEXT:    [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x float> [[WIDE_VEC]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+; INTERLEAVE-NEXT:    [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <32 x float> [[WIDE_VEC2]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
 ; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
 ; INTERLEAVE-NEXT:    [[WIDE_VEC4:%.*]] = load <32 x float>, ptr [[TMP8]], align 4
-; INTERLEAVE-NEXT:    [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <32 x float> [[WIDE_VEC4]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+; INTERLEAVE-NEXT:    [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <32 x float> [[WIDE_VEC5]], <32 x float> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
 ; INTERLEAVE-NEXT:    [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_PHI]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; INTERLEAVE-NEXT:    [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
@@ -1487,8 +1487,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) {
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP8]], i32 1
 ; INTERLEAVE-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 8
-; INTERLEAVE-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; INTERLEAVE-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8
 ; INTERLEAVE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]]
@@ -5250,30 +5250,30 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; UNROLL-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i64 1
 ; UNROLL-NEXT:    br i1 [[TMP9]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]]
-; UNROLL:       pred.urem.if3:
+; UNROLL:       pred.urem.if2:
 ; UNROLL-NEXT:    [[TMP10:%.*]] = add i16 [[TMP1]], -19
 ; UNROLL-NEXT:    [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]]
 ; UNROLL-NEXT:    [[TMP12:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP11]], i64 1
 ; UNROLL-NEXT:    br label [[PRED_UREM_CONTINUE4]]
-; UNROLL:       pred.urem.continue4:
+; UNROLL:       pred.urem.continue3:
 ; UNROLL-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF3]] ]
 ; UNROLL-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0
 ; UNROLL-NEXT:    br i1 [[TMP14]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]]
-; UNROLL:       pred.urem.if5:
+; UNROLL:       pred.urem.if4:
 ; UNROLL-NEXT:    [[TMP15:%.*]] = add i16 [[TMP1]], -18
 ; UNROLL-NEXT:    [[TMP16:%.*]] = urem i16 [[B]], [[TMP15]]
 ; UNROLL-NEXT:    [[TMP17:%.*]] = insertelement <2 x i16> poison, i16 [[TMP16]], i64 0
 ; UNROLL-NEXT:    br label [[PRED_UREM_CONTINUE6]]
-; UNROLL:       pred.urem.continue6:
+; UNROLL:       pred.urem.continue5:
 ; UNROLL-NEXT:    [[TMP18:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP17]], [[PRED_UREM_IF5]] ]
 ; UNROLL-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1
 ; UNROLL-NEXT:    br i1 [[TMP19]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]]
-; UNROLL:       pred.urem.if7:
+; UNROLL:       pred.urem.if6:
 ; UNROLL-NEXT:    [[TMP20:%.*]] = add i16 [[TMP1]], -17
 ; UNROLL-NEXT:    [[TMP21:%.*]] = urem i16 [[B]], [[TMP20]]
 ; UNROLL-NEXT:    [[TMP22:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP21]], i64 1
 ; UNROLL-NEXT:    br label [[PRED_UREM_CONTINUE8]]
-; UNROLL:       pred.urem.continue8:
+; UNROLL:       pred.urem.continue7:
 ; UNROLL-NEXT:    [[TMP23:%.*]] = phi <2 x i16> [ [[TMP18]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP22]], [[PRED_UREM_IF7]] ]
 ; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[TMP13]], <2 x i16> zeroinitializer
 ; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP23]], <2 x i16> zeroinitializer
@@ -5330,30 +5330,30 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UREM_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP11]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]]
-; UNROLL-NO-IC:       pred.urem.if3:
+; UNROLL-NO-IC:       pred.urem.if2:
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = add i16 [[TMP1]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = urem i16 [[B]], [[TMP12]]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[TMP13]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UREM_CONTINUE4]]
-; UNROLL-NO-IC:       pred.urem.continue4:
+; UNROLL-NO-IC:       pred.urem.continue3:
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = phi <2 x i16> [ [[TMP10]], [[PRED_UREM_CONTINUE]] ], [ [[TMP14]], [[PRED_UREM_IF3]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]]
-; UNROLL-NO-IC:       pred.urem.if5:
+; UNROLL-NO-IC:       pred.urem.if4:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = add i16 [[TMP1]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = urem i16 [[B]], [[TMP17]]
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UREM_CONTINUE6]]
-; UNROLL-NO-IC:       pred.urem.continue6:
+; UNROLL-NO-IC:       pred.urem.continue5:
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP19]], [[PRED_UREM_IF5]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP21]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]]
-; UNROLL-NO-IC:       pred.urem.if7:
+; UNROLL-NO-IC:       pred.urem.if6:
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = add i16 [[TMP1]], 3
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = urem i16 [[B]], [[TMP22]]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP23]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UREM_CONTINUE8]]
-; UNROLL-NO-IC:       pred.urem.continue8:
+; UNROLL-NO-IC:       pred.urem.continue7:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = phi <2 x i16> [ [[TMP20]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP24]], [[PRED_UREM_IF7]] ]
 ; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP15]]
 ; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP25]]
@@ -5418,66 +5418,66 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; INTERLEAVE-NEXT:    [[TMP8:%.*]] = phi <4 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ]
 ; INTERLEAVE-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
 ; INTERLEAVE-NEXT:    br i1 [[TMP9]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]]
-; INTERLEAVE:       pred.urem.if3:
+; INTERLEAVE:       pred.urem.if2:
 ; INTERLEAVE-NEXT:    [[TMP10:%.*]] = add i16 [[TMP1]], -19
 ; INTERLEAVE-NEXT:    [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]]
 ; INTERLEAVE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP8]], i16 [[TMP11]], i64 1
 ; INTERLEAVE-NEXT:    br label [[PRED_UREM_CONTINUE4]]
-; INTERLEAVE:       pred.urem.continue4:
+; INTERLEAVE:       pred.urem.continue3:
 ; INTERLEAVE-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF3]] ]
 ; INTERLEAVE-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
 ; INTERLEAVE-NEXT:    br i1 [[TMP14]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]]
-; INTERLEAVE:       pred.urem.if5:
+; INTERLEAVE:       pred.urem.if4:
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = add i16 [[TMP1]], -18
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = urem i16 [[B]], [[TMP15]]
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP16]], i64 2
 ; INTERLEAVE-NEXT:    br label [[PRED_UREM_CONTINUE6]]
-; INTERLEAVE:       pred.urem.continue6:
+; INTERLEAVE:       pred.urem.continue5:
 ; INTERLEAVE-NEXT:    [[TMP18:%.*]] = phi <4 x i16> [ [[TMP13]], [[PRED_UREM_CONTINUE4]] ], [ [[TMP17]], [[PRED_UREM_IF5]] ]
 ; INTERLEAVE-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
 ; INTERLEAVE-NEXT:    br i1 [[TMP19]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8:%.*]]
-; INTERLEAVE:       pred.urem.if7:
+; INTERLEAVE:       pred.urem.if6:
 ; INTERLEAVE-NEXT:    [[TMP20:%.*]] = add i16 [[TMP1]], -17
 ; INTERLEAVE-NEXT:    [[TMP21:%.*]] = urem i16 [[B]], [[TMP20]]
 ; INTERLEAVE-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP21]], i64 3
 ; INTERLEAVE-NEXT:    br label [[PRED_UREM_CONTINUE8]]
-; INTERLEAVE:       pred.urem.continue8:
+; INTERLEAVE:       pred.urem.continue7:
 ; INTERLEAVE-NEXT:    [[TMP23:%.*]] = phi <4 x i16> [ [[TMP18]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP22]], [[PRED_UREM_IF7]] ]
 ; INTERLEAVE-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
 ; INTERLEAVE-NEXT:    br i1 [[TMP24]], label [[PRED_UREM_IF9:%.*]], label [[PRED_UREM_CONTINUE10:%.*]]
-; INTERLEAVE:       pred.urem.if9:
+; INTERLEAVE:       pred.urem.if8:
 ; INTERLEAVE-NEXT:    [[TMP25:%.*]] = add i16 [[TMP1]], -16
 ; INTERLEAVE-NEXT:    [[TMP26:%.*]] = urem i16 [[B]], [[TMP25]]
 ; INTERLEAVE-NEXT:    [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[TMP26]], i64 0
 ; INTERLEAVE-NEXT:    br label [[PRED_UREM_CONTINUE10]]
-; INTERLEAVE:       pred.urem.continue10:
+; INTERLEAVE:       pred.urem.continue9:
 ; INTERLEAVE-NEXT:    [[TMP28:%.*]] = phi <4 x i16> [ poison, [[PRED_UREM_CONTINUE8]] ], [ [[TMP27]], [[PRED_UREM_IF9]] ]
 ; INTERLEAVE-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
 ; INTERLEAVE-NEXT:    br i1 [[TMP29]], label [[PRED_UREM_IF11:%.*]], label [[PRED_UREM_CONTINUE12:%.*]]
-; INTERLEAVE:       pred.urem.if11:
+; INTERLEAVE:       pred.urem.if10:
 ; INTERLEAVE-NEXT:    [[TMP30:%.*]] = add i16 [[TMP1]], -15
 ; INTERLEAVE-NEXT:    [[TMP31:%.*]] = urem i16 [[B]], [[TMP30]]
 ; INTERLEAVE-NEXT:    [[TMP32:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP31]], i64 1
 ; INTERLEAVE-NEXT:    br label [[PRED_UREM_CONTINUE12]]
-; INTERLEAVE:       pred.urem.continue12:
+; INTERLEAVE:       pred.urem.continue11:
 ; INTERLEAVE-NEXT:    [[TMP33:%.*]] = phi <4 x i16> [ [[TMP28]], [[PRED_UREM_CONTINUE10]] ], [ [[TMP32]], [[PRED_UREM_IF11]] ]
 ; INTERLEAVE-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
 ; INTERLEAVE-NEXT:    br i1 [[TMP34]], label [[PRED_UREM_IF13:%.*]], label [[PRED_UREM_CONTINUE14:%.*]]
-; INTERLEAVE:       pred.urem.if13:
+; INTERLEAVE:       pred.urem.if12:
 ; INTERLEAVE-NEXT:    [[TMP35:%.*]] = add i16 [[TMP1]], -14
 ; INTERLEAVE-NEXT:    [[TMP36:%.*]] = urem i16 [[B]], [[TMP35]]
 ; INTERLEAVE-NEXT:    [[TMP37:%.*]] = insertelement <4 x i16> [[TMP33]], i16 [[TMP36]], i64 2
 ; INTERLEAVE-NEXT:    br label [[PRED_UREM_CONTINUE14]]
-; INTERLEAVE:       pred.urem.continue14:
+; INTERLEAVE:       pred.urem.continue13:
 ; INTERLEAVE-NEXT:    [[TMP38:%.*]] = phi <4 x i16> [ [[TMP33]], [[PRED_UREM_CONTINUE12]] ], [ [[TMP37]], [[PRED_UREM_IF13]] ]
 ; INTERLEAVE-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
 ; INTERLEAVE-NEXT:    br i1 [[TMP39]], label [[PRED_UREM_IF15:%.*]], label [[PRED_UREM_CONTINUE16]]
-; INTERLEAVE:       pred.urem.if15:
+; INTERLEAVE:       pred.urem.if14:
 ; INTERLEAVE-NEXT:    [[TMP40:%.*]] = add i16 [[TMP1]], -13
 ; INTERLEAVE-NEXT:    [[TMP41:%.*]] = urem i16 [[B]], [[TMP40]]
 ; INTERLEAVE-NEXT:    [[TMP42:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP41]], i64 3
 ; INTERLEAVE-NEXT:    br label [[PRED_UREM_CONTINUE16]]
-; INTERLEAVE:       pred.urem.continue16:
+; INTERLEAVE:       pred.urem.continue15:
 ; INTERLEAVE-NEXT:    [[TMP43:%.*]] = phi <4 x i16> [ [[TMP38]], [[PRED_UREM_CONTINUE14]] ], [ [[TMP42]], [[PRED_UREM_IF15]] ]
 ; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP23]], <4 x i16> zeroinitializer
 ; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP43]], <4 x i16> zeroinitializer
@@ -6379,12 +6379,12 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
 ; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; UNROLL-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
+; UNROLL-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
+; UNROLL-NEXT:    [[TMP16:%.*]] = shl <2 x i32> [[DOTSPLATINSERT2]], <i32 1, i32 0>
+; UNROLL-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
 ; UNROLL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    [[TMP15:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
-; UNROLL-NEXT:    [[TMP16:%.*]] = shl i32 [[STEP]], 1
-; UNROLL-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0
-; UNROLL-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -6457,13 +6457,13 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
+; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[DOTSPLAT3:%.*]] = mul <2 x i32> <i32 2, i32 2>, [[TMP18]]
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> zeroinitializer, [[TMP17]]
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = mul i32 [[STEP]], 2
-; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i64 0
-; UNROLL-NO-IC-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -6537,12 +6537,13 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -8
 ; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; INTERLEAVE-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
+; INTERLEAVE-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
+; INTERLEAVE-NEXT:    [[TMP16:%.*]] = shl <4 x i32> [[DOTSPLATINSERT2]], <i32 2, i32 0, i32 0, i32 0>
+; INTERLEAVE-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer
+
 ; INTERLEAVE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
 ; INTERLEAVE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = shl i32 [[STEP]], 2
-; INTERLEAVE-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP16]], i64 0
-; INTERLEAVE-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 2503520c0ff9d0..d80c5aed3ea2d4 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -230,7 +230,6 @@ define void @first_order_recurrence_using_induction(i32 %n, ptr %dst) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add i32 [[TMP3]], 0
 ; CHECK-NEXT:    [[INDUCTION1]] = add i32 [[TMP3]], 1
-; CHECK-NEXT:    store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    store i32 [[INDUCTION]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec
@@ -304,7 +303,6 @@ define void @scalarize_ptrtoint(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP7]], 10
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP9]] to ptr
-; CHECK-NEXT:    store ptr [[TMP10]], ptr %dst, align 8
 ; CHECK-NEXT:    store ptr [[TMP11]], ptr %dst, align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index 7c23b603b6e91a..dc3480fbb11a88 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -92,28 +92,28 @@ define void @pr45679(ptr %A) optsize {
 ; VF2UF2:       pred.store.continue:
 ; VF2UF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
 ; VF2UF2-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
-; VF2UF2:       pred.store.if2:
+; VF2UF2:       pred.store.if1:
 ; VF2UF2-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 1
 ; VF2UF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP6]]
 ; VF2UF2-NEXT:    store i32 13, ptr [[TMP7]], align 1
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
-; VF2UF2:       pred.store.continue3:
+; VF2UF2:       pred.store.continue2:
 ; VF2UF2-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; VF2UF2-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
-; VF2UF2:       pred.store.if4:
+; VF2UF2:       pred.store.if3:
 ; VF2UF2-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], 2
 ; VF2UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP9]]
 ; VF2UF2-NEXT:    store i32 13, ptr [[TMP10]], align 1
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE5]]
-; VF2UF2:       pred.store.continue5:
+; VF2UF2:       pred.store.continue4:
 ; VF2UF2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; VF2UF2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
-; VF2UF2:       pred.store.if6:
+; VF2UF2:       pred.store.if5:
 ; VF2UF2-NEXT:    [[TMP12:%.*]] = add i32 [[INDEX]], 3
 ; VF2UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP12]]
 ; VF2UF2-NEXT:    store i32 13, ptr [[TMP13]], align 1
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE7]]
-; VF2UF2:       pred.store.continue7:
+; VF2UF2:       pred.store.continue6:
 ; VF2UF2-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; VF2UF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
 ; VF2UF2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
@@ -293,31 +293,31 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
 ; VF2UF2:       pred.store.continue:
 ; VF2UF2-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
 ; VF2UF2-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
-; VF2UF2:       pred.store.if2:
+; VF2UF2:       pred.store.if1:
 ; VF2UF2-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
 ; VF2UF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
 ; VF2UF2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8
 ; VF2UF2-NEXT:    store i64 [[TMP10]], ptr [[B]], align 8
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
-; VF2UF2:       pred.store.continue3:
+; VF2UF2:       pred.store.continue2:
 ; VF2UF2-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; VF2UF2-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
-; VF2UF2:       pred.store.if4:
+; VF2UF2:       pred.store.if3:
 ; VF2UF2-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 2
 ; VF2UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
 ; VF2UF2-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP14]], align 8
 ; VF2UF2-NEXT:    store i64 [[TMP15]], ptr [[B]], align 8
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE5]]
-; VF2UF2:       pred.store.continue5:
+; VF2UF2:       pred.store.continue4:
 ; VF2UF2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; VF2UF2-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
-; VF2UF2:       pred.store.if6:
+; VF2UF2:       pred.store.if5:
 ; VF2UF2-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 3
 ; VF2UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP18]]
 ; VF2UF2-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP19]], align 8
 ; VF2UF2-NEXT:    store i64 [[TMP20]], ptr [[B]], align 8
 ; VF2UF2-NEXT:    br label [[PRED_STORE_CONTINUE7]]
-; VF2UF2:       pred.store.continue7:
+; VF2UF2:       pred.store.continue6:
 ; VF2UF2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; VF2UF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
 ; VF2UF2-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
index 306ec125dc2023..657860a0440d26 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -97,153 +97,153 @@ define i32 @predicated(ptr noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP10:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP12]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue5:
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; CHECK:       pred.load.if9:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP16:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP18]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
-; CHECK:       pred.load.continue10:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP19]], [[PRED_LOAD_IF9]] ]
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; CHECK:       pred.load.if11:
+; CHECK:       pred.load.if8:
 ; CHECK-NEXT:    [[TMP22:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
-; CHECK:       pred.load.continue12:
+; CHECK:       pred.load.continue9:
 ; CHECK-NEXT:    [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP25]], [[PRED_LOAD_IF11]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
-; CHECK:       pred.load.if13:
+; CHECK:       pred.load.if10:
 ; CHECK-NEXT:    [[TMP28:%.*]] = or disjoint i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4
 ; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> poison, i32 [[TMP30]], i64 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
-; CHECK:       pred.load.continue14:
+; CHECK:       pred.load.continue11:
 ; CHECK-NEXT:    [[TMP32:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP31]], [[PRED_LOAD_IF13]] ]
 ; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
-; CHECK:       pred.load.if15:
+; CHECK:       pred.load.if12:
 ; CHECK-NEXT:    [[TMP34:%.*]] = or disjoint i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP36]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
-; CHECK:       pred.load.continue16:
+; CHECK:       pred.load.continue13:
 ; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP37]], [[PRED_LOAD_IF15]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
-; CHECK:       pred.load.if17:
+; CHECK:       pred.load.if14:
 ; CHECK-NEXT:    [[TMP40:%.*]] = or disjoint i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP40]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4
 ; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP42]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
-; CHECK:       pred.load.continue18:
+; CHECK:       pred.load.continue15:
 ; CHECK-NEXT:    [[TMP44:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP43]], [[PRED_LOAD_IF17]] ]
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP45]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
-; CHECK:       pred.load.if19:
+; CHECK:       pred.load.if16:
 ; CHECK-NEXT:    [[TMP46:%.*]] = or disjoint i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP46]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4
 ; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP48]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
-; CHECK:       pred.load.continue20:
+; CHECK:       pred.load.continue17:
 ; CHECK-NEXT:    [[TMP50:%.*]] = phi <4 x i32> [ [[TMP44]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP49]], [[PRED_LOAD_IF19]] ]
 ; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP51]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
-; CHECK:       pred.load.if21:
+; CHECK:       pred.load.if18:
 ; CHECK-NEXT:    [[TMP52:%.*]] = or disjoint i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP52]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = load i32, ptr [[TMP53]], align 4
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> poison, i32 [[TMP54]], i64 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
-; CHECK:       pred.load.continue22:
+; CHECK:       pred.load.continue19:
 ; CHECK-NEXT:    [[TMP56:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP55]], [[PRED_LOAD_IF21]] ]
 ; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
-; CHECK:       pred.load.if23:
+; CHECK:       pred.load.if20:
 ; CHECK-NEXT:    [[TMP58:%.*]] = or disjoint i64 [[INDEX]], 9
 ; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP58]]
 ; CHECK-NEXT:    [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP60]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
-; CHECK:       pred.load.continue24:
+; CHECK:       pred.load.continue21:
 ; CHECK-NEXT:    [[TMP62:%.*]] = phi <4 x i32> [ [[TMP56]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP61]], [[PRED_LOAD_IF23]] ]
 ; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP63]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
-; CHECK:       pred.load.if25:
+; CHECK:       pred.load.if22:
 ; CHECK-NEXT:    [[TMP64:%.*]] = or disjoint i64 [[INDEX]], 10
 ; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP64]]
 ; CHECK-NEXT:    [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4
 ; CHECK-NEXT:    [[TMP67:%.*]] = insertelement <4 x i32> [[TMP62]], i32 [[TMP66]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
-; CHECK:       pred.load.continue26:
+; CHECK:       pred.load.continue23:
 ; CHECK-NEXT:    [[TMP68:%.*]] = phi <4 x i32> [ [[TMP62]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP67]], [[PRED_LOAD_IF25]] ]
 ; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP69]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
-; CHECK:       pred.load.if27:
+; CHECK:       pred.load.if24:
 ; CHECK-NEXT:    [[TMP70:%.*]] = or disjoint i64 [[INDEX]], 11
 ; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP70]]
 ; CHECK-NEXT:    [[TMP72:%.*]] = load i32, ptr [[TMP71]], align 4
 ; CHECK-NEXT:    [[TMP73:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP72]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
-; CHECK:       pred.load.continue28:
+; CHECK:       pred.load.continue25:
 ; CHECK-NEXT:    [[TMP74:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP73]], [[PRED_LOAD_IF27]] ]
 ; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP75]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
-; CHECK:       pred.load.if29:
+; CHECK:       pred.load.if26:
 ; CHECK-NEXT:    [[TMP76:%.*]] = or disjoint i64 [[INDEX]], 12
 ; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP76]]
 ; CHECK-NEXT:    [[TMP78:%.*]] = load i32, ptr [[TMP77]], align 4
 ; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i32> poison, i32 [[TMP78]], i64 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
-; CHECK:       pred.load.continue30:
+; CHECK:       pred.load.continue27:
 ; CHECK-NEXT:    [[TMP80:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP79]], [[PRED_LOAD_IF29]] ]
 ; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP81]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK:       pred.load.if31:
+; CHECK:       pred.load.if28:
 ; CHECK-NEXT:    [[TMP82:%.*]] = or disjoint i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP82]]
 ; CHECK-NEXT:    [[TMP84:%.*]] = load i32, ptr [[TMP83]], align 4
 ; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP84]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
-; CHECK:       pred.load.continue32:
+; CHECK:       pred.load.continue29:
 ; CHECK-NEXT:    [[TMP86:%.*]] = phi <4 x i32> [ [[TMP80]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP85]], [[PRED_LOAD_IF31]] ]
 ; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK:       pred.load.if33:
+; CHECK:       pred.load.if30:
 ; CHECK-NEXT:    [[TMP88:%.*]] = or disjoint i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP88]]
 ; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4
 ; CHECK-NEXT:    [[TMP91:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP90]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
-; CHECK:       pred.load.continue34:
+; CHECK:       pred.load.continue31:
 ; CHECK-NEXT:    [[TMP92:%.*]] = phi <4 x i32> [ [[TMP86]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP91]], [[PRED_LOAD_IF33]] ]
 ; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP93]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]]
-; CHECK:       pred.load.if35:
+; CHECK:       pred.load.if32:
 ; CHECK-NEXT:    [[TMP94:%.*]] = or disjoint i64 [[INDEX]], 15
 ; CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP94]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP96]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
-; CHECK:       pred.load.continue36:
+; CHECK:       pred.load.continue33:
 ; CHECK-NEXT:    [[TMP98:%.*]] = phi <4 x i32> [ [[TMP92]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP97]], [[PRED_LOAD_IF35]] ]
 ; CHECK-NEXT:    [[TMP99:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP26]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP100:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP99]])
@@ -339,153 +339,153 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; CHECK:       pred.load.if9:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP18:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP20]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
-; CHECK:       pred.load.continue10:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP16]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF9]] ]
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; CHECK:       pred.load.if11:
+; CHECK:       pred.load.if8:
 ; CHECK-NEXT:    [[TMP24:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP26]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
-; CHECK:       pred.load.continue12:
+; CHECK:       pred.load.continue9:
 ; CHECK-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP27]], [[PRED_LOAD_IF11]] ]
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP8]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
-; CHECK:       pred.load.if13:
+; CHECK:       pred.load.if10:
 ; CHECK-NEXT:    [[TMP30:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP30]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
 ; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP32]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
-; CHECK:       pred.load.continue14:
+; CHECK:       pred.load.continue11:
 ; CHECK-NEXT:    [[TMP34:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP33]], [[PRED_LOAD_IF13]] ]
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
-; CHECK:       pred.load.if15:
+; CHECK:       pred.load.if12:
 ; CHECK-NEXT:    [[TMP36:%.*]] = or disjoint i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP36]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i32> poison, i32 [[TMP38]], i64 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
-; CHECK:       pred.load.continue16:
+; CHECK:       pred.load.continue13:
 ; CHECK-NEXT:    [[TMP40:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP39]], [[PRED_LOAD_IF15]] ]
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP41]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
-; CHECK:       pred.load.if17:
+; CHECK:       pred.load.if14:
 ; CHECK-NEXT:    [[TMP42:%.*]] = or disjoint i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP44]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
-; CHECK:       pred.load.continue18:
+; CHECK:       pred.load.continue15:
 ; CHECK-NEXT:    [[TMP46:%.*]] = phi <4 x i32> [ [[TMP40]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP45]], [[PRED_LOAD_IF17]] ]
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
-; CHECK:       pred.load.if19:
+; CHECK:       pred.load.if16:
 ; CHECK-NEXT:    [[TMP48:%.*]] = or disjoint i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4
 ; CHECK-NEXT:    [[TMP51:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP50]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
-; CHECK:       pred.load.continue20:
+; CHECK:       pred.load.continue17:
 ; CHECK-NEXT:    [[TMP52:%.*]] = phi <4 x i32> [ [[TMP46]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP51]], [[PRED_LOAD_IF19]] ]
 ; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP53]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
-; CHECK:       pred.load.if21:
+; CHECK:       pred.load.if18:
 ; CHECK-NEXT:    [[TMP54:%.*]] = or disjoint i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP54]]
 ; CHECK-NEXT:    [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4
 ; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP56]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
-; CHECK:       pred.load.continue22:
+; CHECK:       pred.load.continue19:
 ; CHECK-NEXT:    [[TMP58:%.*]] = phi <4 x i32> [ [[TMP52]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP57]], [[PRED_LOAD_IF21]] ]
 ; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP59]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
-; CHECK:       pred.load.if23:
+; CHECK:       pred.load.if20:
 ; CHECK-NEXT:    [[TMP60:%.*]] = or disjoint i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP60]]
 ; CHECK-NEXT:    [[TMP62:%.*]] = load i32, ptr [[TMP61]], align 4
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i32> poison, i32 [[TMP62]], i64 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
-; CHECK:       pred.load.continue24:
+; CHECK:       pred.load.continue21:
 ; CHECK-NEXT:    [[TMP64:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE22]] ], [ [[TMP63]], [[PRED_LOAD_IF23]] ]
 ; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP65]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
-; CHECK:       pred.load.if25:
+; CHECK:       pred.load.if22:
 ; CHECK-NEXT:    [[TMP66:%.*]] = or disjoint i64 [[INDEX]], 9
 ; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP66]]
 ; CHECK-NEXT:    [[TMP68:%.*]] = load i32, ptr [[TMP67]], align 4
 ; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP68]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
-; CHECK:       pred.load.continue26:
+; CHECK:       pred.load.continue23:
 ; CHECK-NEXT:    [[TMP70:%.*]] = phi <4 x i32> [ [[TMP64]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP69]], [[PRED_LOAD_IF25]] ]
 ; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP71]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
-; CHECK:       pred.load.if27:
+; CHECK:       pred.load.if24:
 ; CHECK-NEXT:    [[TMP72:%.*]] = or disjoint i64 [[INDEX]], 10
 ; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP72]]
 ; CHECK-NEXT:    [[TMP74:%.*]] = load i32, ptr [[TMP73]], align 4
 ; CHECK-NEXT:    [[TMP75:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP74]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
-; CHECK:       pred.load.continue28:
+; CHECK:       pred.load.continue25:
 ; CHECK-NEXT:    [[TMP76:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP75]], [[PRED_LOAD_IF27]] ]
 ; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
-; CHECK:       pred.load.if29:
+; CHECK:       pred.load.if26:
 ; CHECK-NEXT:    [[TMP78:%.*]] = or disjoint i64 [[INDEX]], 11
 ; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP78]]
 ; CHECK-NEXT:    [[TMP80:%.*]] = load i32, ptr [[TMP79]], align 4
 ; CHECK-NEXT:    [[TMP81:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP80]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
-; CHECK:       pred.load.continue30:
+; CHECK:       pred.load.continue27:
 ; CHECK-NEXT:    [[TMP82:%.*]] = phi <4 x i32> [ [[TMP76]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP81]], [[PRED_LOAD_IF29]] ]
 ; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i1> [[TMP11]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP83]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK:       pred.load.if31:
+; CHECK:       pred.load.if28:
 ; CHECK-NEXT:    [[TMP84:%.*]] = or disjoint i64 [[INDEX]], 12
 ; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP84]]
 ; CHECK-NEXT:    [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4
 ; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i64 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
-; CHECK:       pred.load.continue32:
+; CHECK:       pred.load.continue29:
 ; CHECK-NEXT:    [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP87]], [[PRED_LOAD_IF31]] ]
 ; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <4 x i1> [[TMP11]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP89]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK:       pred.load.if33:
+; CHECK:       pred.load.if30:
 ; CHECK-NEXT:    [[TMP90:%.*]] = or disjoint i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP90]]
 ; CHECK-NEXT:    [[TMP92:%.*]] = load i32, ptr [[TMP91]], align 4
 ; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP92]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
-; CHECK:       pred.load.continue34:
+; CHECK:       pred.load.continue31:
 ; CHECK-NEXT:    [[TMP94:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP93]], [[PRED_LOAD_IF33]] ]
 ; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <4 x i1> [[TMP11]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP95]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK:       pred.load.if35:
+; CHECK:       pred.load.if32:
 ; CHECK-NEXT:    [[TMP96:%.*]] = or disjoint i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP96]]
 ; CHECK-NEXT:    [[TMP98:%.*]] = load i32, ptr [[TMP97]], align 4
 ; CHECK-NEXT:    [[TMP99:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP98]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
-; CHECK:       pred.load.continue36:
+; CHECK:       pred.load.continue33:
 ; CHECK-NEXT:    [[TMP100:%.*]] = phi <4 x i32> [ [[TMP94]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP99]], [[PRED_LOAD_IF35]] ]
 ; CHECK-NEXT:    [[TMP101:%.*]] = extractelement <4 x i1> [[TMP11]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP101]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38]]
-; CHECK:       pred.load.if37:
+; CHECK:       pred.load.if34:
 ; CHECK-NEXT:    [[TMP102:%.*]] = or disjoint i64 [[INDEX]], 15
 ; CHECK-NEXT:    [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP102]]
 ; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP103]], align 4
 ; CHECK-NEXT:    [[TMP105:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP104]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
-; CHECK:       pred.load.continue38:
+; CHECK:       pred.load.continue35:
 ; CHECK-NEXT:    [[TMP106:%.*]] = phi <4 x i32> [ [[TMP100]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP105]], [[PRED_LOAD_IF37]] ]
 ; CHECK-NEXT:    [[TMP107:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP34]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP108:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP107]])
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 030eb9e76b51a5..577a9cc446ea3c 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -21,9 +21,9 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -103,8 +103,8 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index e4984f52ee6ff9..d8955a8bd7c575 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -64,13 +64,20 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT:   vector.body:
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     EMIT vp<[[PADD:%.+]]> = ptradd ir<%A>, vp<[[STEPS]]>
-; CHECK-NEXT:     vp<[[VPTR:%.]]> = vector-pointer vp<[[PADD]]>
-; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR]]>
+; CHECK-NEXT:     vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<1>
+; CHECK-NEXT:     EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
+; CHECK-NEXT:     EMIT vp<[[PADD2:%.+]]> = ptradd ir<%A>, vp<[[STEPS2]]>
+; CHECK-NEXT:     vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:     vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT:     WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
 ; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
-; CHECK-NEXT:     vp<[[VPTR2:%.+]]> = vector-pointer vp<[[PADD]]>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR2]]>, ir<%add>
+; CHECK-NEXT:     WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
+; CHECK-NEXT:     vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:     vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:     WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT:     WIDEN store vp<[[VPTR4]]>, ir<%add>.1
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:     EMIT branch-on-cond ir<true>
 ; CHECK-NEXT:   No successors

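The next patch in the series reorganizes the interleaving helpers around a small `InterleaveState` class. As a rough, dependency-free C++ sketch of the bookkeeping involved — `Value` and `InterleaveMap` here are illustrative stand-ins, not LLVM types — the idea is: part 0 resolves to the original value, clones for parts 1..IC-1 are recorded per original value, uniform values are reused for every part, and each clone's operands are remapped to the copies of its part.

#include <unordered_map>
#include <vector>

// Illustrative stand-in for a VPValue: something with operands and a
// live-in (loop-invariant) flag.
struct Value {
  std::vector<Value *> Operands;
  bool IsLiveIn = false;
};

class InterleaveMap {
  // Maps an original value to its clones for parts 1..IC-1; part 0 is
  // always the original value itself.
  std::unordered_map<Value *, std::vector<Value *>> Copies;

public:
  Value *get(Value *V, unsigned Part) {
    // Part 0 and live-in values are never duplicated.
    if (Part == 0 || V->IsLiveIn)
      return V;
    return Copies[V][Part - 1];
  }

  void recordCopy(Value *Orig, Value *Copy) {
    Copies[Orig].push_back(Copy);
  }

  // Uniform values are reused for every part instead of being cloned.
  void recordUniform(Value *V, unsigned IC) {
    for (unsigned I = 1; I != IC; ++I)
      Copies[V].push_back(V);
  }

  // Rewrite a part-I clone's operands to the copies belonging to part I.
  void remapOperands(Value *R, unsigned I) {
    for (Value *&Op : R->Operands)
      Op = get(Op, I);
  }
};

This mirrors, in miniature, what `InterleaveState::getInterleavedValue`, `addUniform`, and `remapOperands` do on VPValues and recipes in the patch below.
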
>From cba8b59136b92253a6f65d9bc2c5e1e44776c45e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 20 Jun 2024 11:18:24 +0100
Subject: [PATCH 02/15] !fixup use pattern matching in a few more cases.

---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  17 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   1 -
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 252 ++++++++----------
 .../LoopVectorize/X86/masked-store-cost.ll    |   4 +-
 4 files changed, 116 insertions(+), 158 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a79bdfdf098fdd..5f060796b1584d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3855,22 +3855,23 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan);
 /// if it is either defined outside the vector region or its operand is known to
 /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
 inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
-  if (auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe())) {
-    return VPI ==
-           VPI->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue();
-  }
+  if (V->isLiveIn())
+    return true;
   if (isa<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe, VPExpandSCEVRecipe>(V))
     return true;
+  auto *R = cast<VPSingleDefRecipe>(V->getDefiningRecipe());
+  if (R == R->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue())
+    return true;
   if (isa<VPReplicateRecipe>(V) && cast<VPReplicateRecipe>(V)->isUniform() &&
       (isa<LoadInst, StoreInst>(V->getUnderlyingValue())) &&
       all_of(V->getDefiningRecipe()->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); }))
     return true;
 
-  auto *C = dyn_cast_or_null<VPScalarCastRecipe>(V->getDefiningRecipe());
-  return C && (C->isDefinedOutsideVectorRegions() ||
-               isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
-               isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)));
+  return isa<VPScalarCastRecipe, VPWidenCastRecipe>(R) &&
+         (R->isDefinedOutsideVectorRegions() || R->getOperand(0)->isLiveIn() ||
+          isa<VPDerivedIVRecipe>(R->getOperand(0)) ||
+          isa<VPCanonicalIVPHIRecipe>(R->getOperand(0)));
 }
 
 } // end namespace vputils
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 63ec22faf5c76c..81907dff591c9b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1315,7 +1315,6 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
     // Need to create stuff in PH.
     SplatVF = State.get(getOperand(2), 0);
   } else {
-
     // Multiply the vectorization factor by the step using integer or
     // floating-point arithmetic as appropriate.
     Type *StepType = Step->getType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7a998c726f3290..a9fe012110f393 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1623,19 +1623,48 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
   }
 }
 
-static VPValue *getInterleavedValue(
-    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues, VPValue *V,
-    unsigned IC) {
-  if (IC == 0)
-    return V;
-  if (V->isLiveIn())
-    return V;
-  return InterleavedValues[V][IC - 1];
-}
+namespace {
+class InterleaveState {
+  DenseMap<VPValue *, SmallVector<VPValue *>> InterleavedValues;
+
+public:
+  VPValue *getInterleavedValue(VPValue *V, unsigned IC) {
+    if (IC == 0)
+      return V;
+    if (V->isLiveIn())
+      return V;
+    return InterleavedValues[V][IC - 1];
+  }
+
+  void addInterleavedValues(VPRecipeBase *OrigR, VPRecipeBase *CopyR) {
+    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
+      auto Ins = InterleavedValues.insert({VPV, {}});
+      Ins.first->second.push_back(CopyR->getVPValue(Idx));
+    }
+  }
+
+  void addUniform(VPSingleDefRecipe *R, unsigned IC) {
+    auto Ins = InterleavedValues.insert({R, {}});
+    for (unsigned I = 1; I != IC; ++I)
+      Ins.first->second.push_back(R);
+  }
+
+  bool contains(VPValue *VPV) { return InterleavedValues.contains(VPV); }
+
+  DenseMap<VPValue *, SmallVector<VPValue *>> &getInterleavedValues() {
+    return InterleavedValues;
+  }
+
+  void remapOperands(VPRecipeBase *R, unsigned I) {
+    for (const auto &[Idx, Op] : enumerate(R->operands()))
+      R->setOperand(Idx, getInterleavedValue(Op, I));
+  }
+};
+} // namespace
 
-static void interleaveReplicateRegion(
-    VPRegionBlock *VPR, VPlan &Plan, unsigned IC,
-    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues) {
+static void interleaveReplicateRegion(VPRegionBlock *VPR, VPlan &Plan,
+                                      unsigned IC,
+                                      InterleaveState &InterleavedValues) {
   Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
   VPBlockBase *InsertPt = VPR;
   for (unsigned I = 1; I != IC; ++I) {
@@ -1650,35 +1679,24 @@ static void interleaveReplicateRegion(
     for (const auto &[New, Old] :
          zip(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT),
              VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT2))) {
-      if (New->getParent() != Copy)
-        break;
       for (const auto &[CopyR, OrigR] : zip(*New, *Old)) {
-        for (unsigned Idx = 0; Idx != CopyR.getNumOperands(); ++Idx) {
-          CopyR.setOperand(Idx, getInterleavedValue(InterleavedValues,
-                                                    CopyR.getOperand(Idx), I));
-        }
+        InterleavedValues.remapOperands(&CopyR, I);
         if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&CopyR)) {
           ScalarIVSteps->addOperand(
               Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
         }
 
-        unsigned Idx = 0;
-        for (VPValue *Res : OrigR.definedValues()) {
-          auto Ins = InterleavedValues.insert({Res, {}});
-          Ins.first->second.push_back(CopyR.getVPValue(Idx));
-          Idx++;
-        }
+        InterleavedValues.addInterleavedValues(&OrigR, &CopyR);
       }
     }
   }
 }
 
-static void interleaveHeaderPHI(
-    VPRecipeBase &R, VPlan &Plan, unsigned IC,
-    VPBasicBlock::iterator &InsertPtForPhi,
-    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
-    VPTypeAnalysis &TypeInfo, SmallPtrSet<VPRecipeBase *, 8> &ToSkip,
-    SmallVector<SmallVector<VPHeaderPHIRecipe *>> &PhisToRemap) {
+static void interleaveHeaderPHI(VPRecipeBase &R, VPlan &Plan, unsigned IC,
+                                VPBasicBlock::iterator &InsertPtForPhi,
+                                InterleaveState &InterleavedValues,
+                                VPTypeAnalysis &TypeInfo,
+                                SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
   if (isa<VPFirstOrderRecurrencePHIRecipe>(&R))
     return;
 
@@ -1733,8 +1751,7 @@ static void interleaveHeaderPHI(
     for (unsigned I = 1; I != IC; ++I) {
       VPBuilder Builder;
       Builder.setInsertPoint(R.getParent(), InsertPtForPhi);
-      auto Ins = InterleavedValues.insert({IV, {}});
-      VPValue *Prev = getInterleavedValue(InterleavedValues, IV, I - 1);
+      VPValue *Prev = InterleavedValues.getInterleavedValue(IV, I - 1);
       VPInstruction *Add;
       std::string Name = I > 1 ? "step.add." + std::to_string(I) : "step.add";
 
@@ -1753,10 +1770,10 @@ static void interleaveHeaderPHI(
                                    },
                                    R.getDebugLoc(), Name);
       ToSkip.insert(Add);
-      Ins.first->second.push_back(Add);
+      InterleavedValues.addInterleavedValues(IV, Add);
       InsertPtForPhi = std::next(Add->getIterator());
     }
-    R.addOperand(getInterleavedValue(InterleavedValues, IV, IC - 1));
+    R.addOperand(InterleavedValues.getInterleavedValue(IV, IC - 1));
     return;
   }
 
@@ -1766,12 +1783,7 @@ static void interleaveHeaderPHI(
     VPRecipeBase *Copy = R.clone();
     Copy->insertAfter(InsertPt);
     InsertPt = Copy;
-    unsigned Idx = 0;
-    for (VPValue *Res : R.definedValues()) {
-      auto Ins = InterleavedValues.insert({Res, {}});
-      Ins.first->second.push_back(Copy->getVPValue(Idx));
-      Idx++;
-    }
+    InterleavedValues.addInterleavedValues(&R, Copy);
     if (isa<VPWidenPointerInductionRecipe>(&R)) {
       if (I == 1)
         R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
@@ -1788,95 +1800,61 @@ static void interleaveHeaderPHI(
       }
       Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
     }
-
-    if (I == 1)
-      PhisToRemap.emplace_back();
-
-    auto *H = cast<VPHeaderPHIRecipe>(Copy);
-    PhisToRemap.back().push_back(H);
   }
 }
 
-static void
-interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
-                 DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
-                 VPTypeAnalysis &TypeInfo) {
+static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
+                             InterleaveState &InterleavedValues,
+                             VPTypeAnalysis &TypeInfo) {
   using namespace llvm::VPlanPatternMatch;
+  if (match(&R, m_BranchOnCond(m_VPValue())) ||
+      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+    return;
+
   VPValue *Op1;
   if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                     m_VPValue(), m_VPValue(Op1)))) {
-    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
-    for (unsigned I = 1; I != IC; ++I) {
-      R.addOperand(getInterleavedValue(InterleavedValues, Op1, I));
-      Ins.first->second.push_back(R.getVPSingleValue());
-    }
+    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
+    for (unsigned I = 1; I != IC; ++I)
+      R.addOperand(InterleavedValues.getInterleavedValue(Op1, I));
     return;
   }
   VPValue *Op0;
   if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(m_VPValue(Op0),
                                                                m_VPValue()))) {
-    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
-    for (unsigned I = 1; I != IC; ++I) {
-      Ins.first->second.push_back(R.getVPSingleValue());
-    }
-
+    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
     bool ScalarVFOnly = Plan.hasScalarVFOnly();
     if (!ScalarVFOnly) {
-      R.setOperand(0, getInterleavedValue(InterleavedValues, Op0, IC - 1));
+      InterleavedValues.remapOperands(&R, IC - 1);
       return;
     }
   }
 
   Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
-  if (isa<VPInstruction>(&R) && cast<VPInstruction>(&R)->getOpcode() ==
-                                    VPInstruction::CalculateTripCountMinusVF) {
+  if (match(&R, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
+                    m_VPValue()))) {
+    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
     R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
-    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
-    for (unsigned I = 1; I != IC; ++I) {
-      Ins.first->second.push_back(R.getVPSingleValue());
-    }
-
     return;
   }
 
-  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
-    if (VPI->getOpcode() == VPInstruction::BranchOnCount ||
-        VPI->getOpcode() == VPInstruction::BranchOnCond)
-      return;
-  }
-
   if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
     if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
         RepR->getOperand(1)->isDefinedOutsideVectorRegions()) {
-      R.setOperand(
-          0, getInterleavedValue(InterleavedValues, R.getOperand(0), IC - 1));
+      InterleavedValues.remapOperands(&R, IC - 1);
       return;
     }
     if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
       if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
-        auto Ins = InterleavedValues.insert({RepR, {}});
-        Ins.first->second.push_back(RepR);
+        InterleavedValues.addUniform(RepR, IC);
         return;
       }
     }
   }
 
-  // TODO: Generalize for any uniform recipe.
-  if (auto *Cast = dyn_cast<VPWidenCastRecipe>(&R)) {
-    if (Cast->getOperand(0)->isLiveIn()) {
-      auto Ins = InterleavedValues.insert({Cast, {}});
-      Ins.first->second.push_back(Cast);
-      return;
-    }
-  }
-
   if (isa<VPInstruction>(&R) &&
       vputils::onlyFirstPartUsed(R.getVPSingleValue())) {
-    auto Ins = InterleavedValues.insert({R.getVPSingleValue(), {}});
-    for (unsigned I = 1; I != IC; ++I) {
-      Ins.first->second.push_back(R.getVPSingleValue());
-    }
-
+    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
     return;
   }
 
@@ -1885,29 +1863,22 @@ interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
     VPRecipeBase *Copy = R.clone();
     Copy->insertAfter(InsertPt);
     InsertPt = Copy;
-    unsigned Idx = 0;
-    for (VPValue *Res : R.definedValues()) {
-      auto Ins = InterleavedValues.insert({Res, {}});
-      Ins.first->second.push_back(Copy->getVPValue(Idx));
-      Idx++;
-    }
+    InterleavedValues.addInterleavedValues(&R, Copy);
 
-    if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
-      if (VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
-        Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
-      }
-      if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
-        Copy->setOperand(
-            0, getInterleavedValue(InterleavedValues, R.getOperand(1), I - 1));
-        Copy->setOperand(
-            1, getInterleavedValue(InterleavedValues, R.getOperand(1), I));
-        continue;
-      }
+    if (match(&R, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
+                      m_VPValue())))
+      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+    VPValue *Op;
+    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
+                      m_VPValue(), m_VPValue(Op)))) {
+      Copy->setOperand(0, InterleavedValues.getInterleavedValue(Op, I - 1));
+      Copy->setOperand(1, InterleavedValues.getInterleavedValue(Op, I));
+      continue;
     }
     if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
       auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
       if (Phi->isOrdered()) {
-        auto Ins = InterleavedValues.insert({Phi, {}});
+        auto Ins = InterleavedValues.getInterleavedValues().insert({Phi, {}});
         if (I == 1) {
           Ins.first->second.clear();
           Ins.first->second.push_back(Red);
@@ -1916,9 +1887,7 @@ interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
         Phi->setOperand(1, Copy->getVPSingleValue());
       }
     }
-    for (unsigned Idx = 0; Idx != Copy->getNumOperands(); ++Idx)
-      Copy->setOperand(Idx, getInterleavedValue(InterleavedValues,
-                                                Copy->getOperand(Idx), I));
+    InterleavedValues.remapOperands(Copy, I);
 
    // Add an operand indicating the part to generate code for, to recipes
    // that still require it.
@@ -1931,12 +1900,10 @@ interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
   }
 }
 
-static void
-interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
-                DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
-                VPTypeAnalysis &TypeInfo,
-                SmallPtrSet<VPRecipeBase *, 8> &ToSkip,
-                SmallVector<SmallVector<VPHeaderPHIRecipe *>> &PhisToRemap) {
+static void interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
+                            InterleaveState &InterleavedValues,
+                            VPTypeAnalysis &TypeInfo,
+                            SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
   auto *VPR = dyn_cast<VPRegionBlock>(VPB);
   if (VPR) {
     if (VPR->isReplicator())
@@ -1945,8 +1912,7 @@ interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
       ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
           RPOT(VPR->getEntry());
       for (VPBlockBase *VPB : RPOT) {
-        interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip,
-                        PhisToRemap);
+        interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip);
       }
     }
     return;
@@ -1960,16 +1926,13 @@ interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
 
     auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
     if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
-      for (unsigned I = 1; I != IC; ++I) {
-        auto Ins = InterleavedValues.insert({SingleDef, {}});
-        Ins.first->second.push_back(SingleDef);
-      }
+      InterleavedValues.addUniform(SingleDef, IC);
       continue;
     }
 
     if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
       interleaveHeaderPHI(R, Plan, IC, InsertPtForPhi, InterleavedValues,
-                          TypeInfo, ToSkip, PhisToRemap);
+                          TypeInfo, ToSkip);
       continue;
     }
 
@@ -1981,40 +1944,35 @@ void VPlanTransforms::interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx) {
   assert(IC > 0);
   if (IC == 1)
     return;
-  DenseMap<VPValue *, SmallVector<VPValue *>> InterleavedValues;
+  InterleaveState InterleavedValues;
 
   SmallPtrSet<VPRecipeBase *, 8> ToSkip;
-
   Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
   VPTypeAnalysis TypeInfo(CanIVIntTy, Ctx);
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
-  SmallVector<SmallVector<VPHeaderPHIRecipe *>> PhisToRemap;
   interleaveBlock(Plan.getPreheader(), Plan, IC, InterleavedValues, TypeInfo,
-                  ToSkip, PhisToRemap);
+                  ToSkip);
 
   for (VPBlockBase *VPB : RPOT) {
-    interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip,
-                    PhisToRemap);
-  }
-
-  for (auto &R : PhisToRemap) {
-    unsigned I = 1;
-    for (VPHeaderPHIRecipe *H : R) {
-      for (unsigned Idx = 0; Idx != H->getNumOperands(); ++Idx)
-        H->setOperand(
-            Idx, getInterleavedValue(InterleavedValues, H->getOperand(Idx), I));
-      I++;
-    }
+    interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip);
   }
 
+  unsigned I = 1;
   for (VPRecipeBase &H :
        Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    if (!isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
+    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
+      H.setOperand(
+          1, InterleavedValues.getInterleavedValue(H.getOperand(1), IC - 1));
+      continue;
+    }
+    if (InterleavedValues.contains(H.getVPSingleValue()) ||
+        isa<VPWidenPointerInductionRecipe>(&H)) {
+      I = 1;
       continue;
     }
-    H.setOperand(
-        1, getInterleavedValue(InterleavedValues, H.getOperand(1), IC - 1));
+    InterleavedValues.remapOperands(&H, I);
+    I++;
   }
 
   using namespace llvm::VPlanPatternMatch;
@@ -2030,12 +1988,12 @@ void VPlanTransforms::interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx) {
       unsigned Offset =
           cast<ConstantInt>(Extract->getOperand(1)->getLiveInIRValue())
               ->getZExtValue();
-      In = getInterleavedValue(InterleavedValues, Op0, IC - Offset);
+      In = InterleavedValues.getInterleavedValue(Op0, IC - Offset);
       LO->setOperand(0, In);
       Extract->getDefiningRecipe()->eraseFromParent();
       continue;
     } else
-      In = getInterleavedValue(InterleavedValues, LO->getOperand(0), IC - 1);
+      In = InterleavedValues.getInterleavedValue(LO->getOperand(0), IC - 1);
 
     LO->setOperand(0, In);
   }
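
A note on the InterleaveState introduced in this file: it is essentially a map
from each original VPValue to its clones for parts 1..IC-1. Part 0 is always
answered by the original value, and values that are uniform across parts stand
in for themselves in every part. A minimal standalone model of that
bookkeeping (plain C++ with standard containers, not the VPlan types):

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

struct Value { std::string Name; };

class InterleaveModel {
  std::unordered_map<Value *, std::vector<Value *>> PerPart;

public:
  // Part 0 is always the original; part I > 0 is the (I-1)th recorded copy.
  Value *get(Value *V, unsigned Part) {
    if (Part == 0)
      return V;
    auto It = PerPart.find(V);
    assert(It != PerPart.end() && It->second.size() >= Part);
    return It->second[Part - 1];
  }

  void addCopy(Value *Orig, Value *Copy) { PerPart[Orig].push_back(Copy); }

  // A uniform value stands in for itself in every part.
  void addUniform(Value *V, unsigned IC) {
    for (unsigned I = 1; I != IC; ++I)
      PerPart[V].push_back(V);
  }
};

int main() {
  Value A{"a"}, A1{"a.1"}, U{"inv"};
  InterleaveModel M;
  M.addCopy(&A, &A1);
  M.addUniform(&U, 2);
  assert(M.get(&A, 0) == &A && M.get(&A, 1) == &A1);
  assert(M.get(&U, 1) == &U);
}
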
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll
index a53bd922631918..89c0cafde062af 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll
@@ -185,10 +185,10 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[WIDE_VEC4:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC4:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC4]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC]], <i32 123, i32 123, i32 123, i32 123>
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC5]], <i32 123, i32 123, i32 123, i32 123>

>From f3e47f53bc04032ee42b3c1eeec8a3d19a647f97 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 12 Aug 2024 12:55:40 +0100
Subject: [PATCH 03/15] !fixup rebase and fixup

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 81 ------------------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  3 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 40 ++++++---
 .../LoopVectorize/first-order-recurrence.ll   | 84 +++++++++----------
 4 files changed, 72 insertions(+), 136 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f47c33d8a96941..afcef4599cd640 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9230,87 +9230,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   VPlanTransforms::clearReductionWrapFlags(*Plan);
 }
 
-void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
-  assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
-         "Not a pointer induction according to InductionDescriptor!");
-  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
-         "Unexpected type.");
-  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
-         "Recipe should have been replaced");
-
-  auto *IVR = getParent()->getPlan()->getCanonicalIV();
-  PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
-  unsigned CurrentPart = 0;
-  if (getNumOperands() == 5)
-    CurrentPart =
-        cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
-  Type *PhiType = IndDesc.getStep()->getType();
-
-  // Build a pointer phi
-  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
-  Type *ScStValueType = ScalarStartValue->getType();
-  PHINode *NewPointerPhi = nullptr;
-
-  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
-  if (getNumOperands() == 5) {
-    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
-    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
-  } else {
-    NewPointerPhi =
-        PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
-    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
-  }
-
-  // A pointer induction, performed by using a gep
-  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
-  unsigned UF = getNumOperands() == 2
-                    ? 1
-                    : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
-                          ->getZExtValue();
-
-  Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
-  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
-  Value *NumUnrolledElems =
-      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
-  // Add induction update using an incorrect block temporarily. The phi node
-  // will be fixed after VPlan execution. Note that at this point the latch
-  // block cannot be used, as it does not exist yet.
-  // TODO: Model increment value in VPlan, by turning the recipe into a
-  // multi-def and a subclass of VPHeaderPHIRecipe.
-  if (getNumOperands() != 5) {
-    Value *InductionGEP = GetElementPtrInst::Create(
-        State.Builder.getInt8Ty(), NewPointerPhi,
-        State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
-        InductionLoc);
-
-    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
-  }
-
-  // Create UF many actual address geps that use the pointer
-  // phi as base and a vectorized version of the step value
-  // (<step*0, ..., step*N>) as offset.
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Type *VecPhiType = VectorType::get(PhiType, State.VF);
-    Value *StartOffsetScalar = State.Builder.CreateMul(
-        RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
-    Value *StartOffset =
-        State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
-    // Create a vector of consecutive numbers from zero to VF.
-    StartOffset = State.Builder.CreateAdd(
-        StartOffset, State.Builder.CreateStepVector(VecPhiType));
-
-    assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
-           "scalar step must be the same across all parts");
-    Value *GEP = State.Builder.CreateGEP(
-        State.Builder.getInt8Ty(), NewPointerPhi,
-        State.Builder.CreateMul(
-            StartOffset,
-            State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
-            "vector.gep"));
-    State.set(this, GEP, Part);
-  }
-}
-
 void VPDerivedIVRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5f060796b1584d..311d12cf4f1965 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1682,8 +1682,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
   bool onlyFirstPartUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-    assert(getNumOperands() == 1 && "must have a single operand");
-    return true;
+    return Op == getOperand(0);
   }
 
   VPVectorPointerRecipe *clone() override {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 81907dff591c9b..aa5f51a8578200 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -692,6 +692,7 @@ unsigned VPInstruction::getInterleaveCount() const {
              ? 1
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
                    ->getZExtValue();
+}
 
 #if !defined(NDEBUG)
 bool VPInstruction::isFPMathOp() const {
@@ -2531,42 +2532,59 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   auto *IVR = getParent()->getPlan()->getCanonicalIV();
   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
+  unsigned CurrentPart = 0;
+  if (getNumOperands() == 5)
+    CurrentPart =
+        cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
   Type *PhiType = IndDesc.getStep()->getType();
 
   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
   Type *ScStValueType = ScalarStartValue->getType();
-  PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
-                                           CanonicalIV->getIterator());
+  PHINode *NewPointerPhi = nullptr;
 
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
-  NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  if (getNumOperands() == 5) {
+    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
+    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
+  } else {
+    NewPointerPhi =
+        PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  }
 
   // A pointer induction, performed by using a gep
   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
+  unsigned UF = getNumOperands() == 2
+                    ? 1
+                    : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                          ->getZExtValue();
 
   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
   Value *NumUnrolledElems =
-      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
-  Value *InductionGEP = GetElementPtrInst::Create(
-      State.Builder.getInt8Ty(), NewPointerPhi,
-      State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
-      InductionLoc);
+      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
   // Add induction update using an incorrect block temporarily. The phi node
   // will be fixed after VPlan execution. Note that at this point the latch
   // block cannot be used, as it does not exist yet.
   // TODO: Model increment value in VPlan, by turning the recipe into a
   // multi-def and a subclass of VPHeaderPHIRecipe.
-  NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+  if (getNumOperands() != 5) {
+    Value *InductionGEP = GetElementPtrInst::Create(
+        State.Builder.getInt8Ty(), NewPointerPhi,
+        State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
+        InductionLoc);
+
+    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
+  }
 
   // Create UF many actual address geps that use the pointer
   // phi as base and a vectorized version of the step value
   // (<step*0, ..., step*N>) as offset.
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Type *VecPhiType = VectorType::get(PhiType, State.VF);
-    Value *StartOffsetScalar =
-        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
+    Value *StartOffsetScalar = State.Builder.CreateMul(
+        RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
     Value *StartOffset =
         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
     // Create a vector of consecutive numbers from zero to VF.
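
For intuition on the per-part address computation above: the offset vector is
splat(RuntimeVF * Part) plus the step vector <0, 1, ..., VF-1>, and each
lane's pointer is the phi value plus offset times step. A quick standalone
check of that arithmetic, assuming a fixed-width VF so that RuntimeVF == VF
(the names below are illustrative, not the recipe's actual variables):

#include <cstdio>

int main() {
  const unsigned VF = 4;   // vectorization factor (fixed-width here)
  const long Step = 8;     // pointer induction step in bytes
  const unsigned Part = 1; // the unrolled part this recipe models
  long Base = 0x1000;      // value of the pointer phi this iteration

  // StartOffset = splat(VF * Part) + <0, 1, ..., VF-1>;
  // lane address = Base + StartOffset[lane] * Step.
  for (unsigned Lane = 0; Lane != VF; ++Lane) {
    long Offset = (long)(VF * Part) + Lane;
    std::printf("lane %u -> %#lx\n", Lane, Base + Offset * Step);
  }
}
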
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 1c59419bd3f7c7..10b5aa64c180a0 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -2754,66 +2754,66 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if7:
+; UNROLL-NO-IC:       pred.udiv.if5:
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = udiv i32 219220132, [[TMP10]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
-; UNROLL-NO-IC:       pred.udiv.continue8:
+; UNROLL-NO-IC:       pred.udiv.continue6:
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF5]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if9:
+; UNROLL-NO-IC:       pred.udiv.if7:
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], -2
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = udiv i32 219220132, [[TMP15]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
-; UNROLL-NO-IC:       pred.udiv.continue10:
+; UNROLL-NO-IC:       pred.udiv.continue8:
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if11:
+; UNROLL-NO-IC:       pred.udiv.if9:
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP20]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
-; UNROLL-NO-IC:       pred.udiv.continue12:
+; UNROLL-NO-IC:       pred.udiv.continue10:
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if13:
+; UNROLL-NO-IC:       pred.udiv.if11:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], -4
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = udiv i32 219220132, [[TMP25]]
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
-; UNROLL-NO-IC:       pred.udiv.continue14:
+; UNROLL-NO-IC:       pred.udiv.continue12:
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP29]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if15:
+; UNROLL-NO-IC:       pred.udiv.if13:
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], -5
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = udiv i32 219220132, [[TMP30]]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
-; UNROLL-NO-IC:       pred.udiv.continue16:
+; UNROLL-NO-IC:       pred.udiv.continue14:
 ; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if17:
+; UNROLL-NO-IC:       pred.udiv.if15:
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = add i32 [[OFFSET_IDX]], -6
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = udiv i32 219220132, [[TMP35]]
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
-; UNROLL-NO-IC:       pred.udiv.continue18:
+; UNROLL-NO-IC:       pred.udiv.continue16:
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
-; UNROLL-NO-IC:       pred.udiv.if19:
+; UNROLL-NO-IC:       pred.udiv.if17:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
-; UNROLL-NO-IC:       pred.udiv.continue20:
+; UNROLL-NO-IC:       pred.udiv.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -3064,59 +3064,59 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if3:
+; UNROLL-NO-IC:       pred.udiv.if4:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = udiv i32 219220132, [[TMP3]]
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP17]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
-; UNROLL-NO-IC:       pred.udiv.continue4:
+; UNROLL-NO-IC:       pred.udiv.continue5:
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF4]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP20]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if5:
+; UNROLL-NO-IC:       pred.udiv.if6:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP21]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
-; UNROLL-NO-IC:       pred.udiv.continue6:
+; UNROLL-NO-IC:       pred.udiv.continue7:
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF6]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if7:
+; UNROLL-NO-IC:       pred.udiv.if8:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = udiv i32 219220132, [[TMP5]]
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
-; UNROLL-NO-IC:       pred.udiv.continue8:
+; UNROLL-NO-IC:       pred.udiv.continue9:
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if9:
+; UNROLL-NO-IC:       pred.udiv.if10:
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = udiv i32 219220132, [[TMP6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
-; UNROLL-NO-IC:       pred.udiv.continue10:
+; UNROLL-NO-IC:       pred.udiv.continue11:
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP30]], [[PRED_UDIV_IF10]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP32]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if11:
+; UNROLL-NO-IC:       pred.udiv.if12:
 ; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = udiv i32 219220132, [[TMP7]]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP33]], i32 1
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
-; UNROLL-NO-IC:       pred.udiv.continue12:
+; UNROLL-NO-IC:       pred.udiv.continue13:
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP34]], [[PRED_UDIV_IF12]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP36]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if13:
+; UNROLL-NO-IC:       pred.udiv.if14:
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = udiv i32 219220132, [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP37]], i32 2
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
-; UNROLL-NO-IC:       pred.udiv.continue14:
+; UNROLL-NO-IC:       pred.udiv.continue15:
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP38]], [[PRED_UDIV_IF14]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP40]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if15:
+; UNROLL-NO-IC:       pred.udiv.if16:
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP41]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
-; UNROLL-NO-IC:       pred.udiv.continue16:
+; UNROLL-NO-IC:       pred.udiv.continue17:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP42]], [[PRED_UDIV_IF16]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP27]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP27]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -3132,60 +3132,60 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC:       pred.store.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP51]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
-; UNROLL-NO-IC:       pred.store.if17:
+; UNROLL-NO-IC:       pred.store.if18:
 ; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = add i32 [[INDEX]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP52]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP53]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE19]]
-; UNROLL-NO-IC:       pred.store.continue18:
+; UNROLL-NO-IC:       pred.store.continue19:
 ; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
-; UNROLL-NO-IC:       pred.store.if19:
+; UNROLL-NO-IC:       pred.store.if20:
 ; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP55]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP4]], ptr [[TMP56]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE21]]
-; UNROLL-NO-IC:       pred.store.continue20:
+; UNROLL-NO-IC:       pred.store.continue21:
 ; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; UNROLL-NO-IC:       pred.store.if21:
+; UNROLL-NO-IC:       pred.store.if22:
 ; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = add i32 [[INDEX]], 3
 ; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP58]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP5]], ptr [[TMP59]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; UNROLL-NO-IC:       pred.store.continue22:
+; UNROLL-NO-IC:       pred.store.continue23:
 ; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; UNROLL-NO-IC:       pred.store.if23:
+; UNROLL-NO-IC:       pred.store.if24:
 ; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = add i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP61]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP6]], ptr [[TMP62]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; UNROLL-NO-IC:       pred.store.continue24:
+; UNROLL-NO-IC:       pred.store.continue25:
 ; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP63]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; UNROLL-NO-IC:       pred.store.if25:
+; UNROLL-NO-IC:       pred.store.if26:
 ; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = add i32 [[INDEX]], 5
 ; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP64]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP7]], ptr [[TMP65]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; UNROLL-NO-IC:       pred.store.continue26:
+; UNROLL-NO-IC:       pred.store.continue27:
 ; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; UNROLL-NO-IC:       pred.store.if27:
+; UNROLL-NO-IC:       pred.store.if28:
 ; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = add i32 [[INDEX]], 6
 ; UNROLL-NO-IC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP67]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP8]], ptr [[TMP68]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; UNROLL-NO-IC:       pred.store.continue28:
+; UNROLL-NO-IC:       pred.store.continue29:
 ; UNROLL-NO-IC-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
-; UNROLL-NO-IC:       pred.store.if29:
+; UNROLL-NO-IC:       pred.store.if30:
 ; UNROLL-NO-IC-NEXT:    [[TMP70:%.*]] = add i32 [[INDEX]], 7
 ; UNROLL-NO-IC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP70]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP9]], ptr [[TMP71]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; UNROLL-NO-IC:       pred.store.continue30:
+; UNROLL-NO-IC:       pred.store.continue31:
 ; UNROLL-NO-IC-NEXT:    [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI3]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8

>From 4abc317c150ba14f63596a105fc83d3aaaa50b18 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 14 Aug 2024 20:57:14 +0100
Subject: [PATCH 04/15] !fixup address latest comments, thanks!

---
 .../Vectorize/LoopVectorizationPlanner.h      |   4 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  22 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  58 +++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  11 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 315 ++++++++++--------
 .../Transforms/Vectorize/VPlanTransforms.h    |   2 +-
 7 files changed, 236 insertions(+), 180 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4de253b6cc475b..e5c05c262fac4c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -166,8 +166,8 @@ class VPBuilder {
                             std::initializer_list<VPValue *> Operands,
                             DebugLoc DL = {}, const Twine &Name = "",
                             FastMathFlags FMFs = {}) {
-    auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
-    return tryInsertInstruction(Op);
+    return tryInsertInstruction(
+        new VPInstruction(Opcode, Operands, FMFs, DL, Name));
   }
 
   VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index afcef4599cd640..a21dd70587944c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7392,8 +7392,8 @@ LoopVectorizationPlanner::executePlan(
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
   (void)IsEpilogueVectorization;
 
-  VPlanTransforms::interleave(BestVPlan, BestUF,
-                              OrigLoop->getHeader()->getModule()->getContext());
+  VPlanTransforms::interleaveByUF(
+      BestVPlan, BestUF, OrigLoop->getHeader()->getModule()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8933e5fc178307..edd549955ae365 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -572,8 +572,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
   return SplitBlock;
 }
 
-VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
-  VPRegionBlock *P = getParent();
+template <typename T> static T *getEnclosingLoopRegionImpl(T *P) {
   if (P && P->isReplicator()) {
     P = P->getParent();
     assert(!cast<VPRegionBlock>(P)->isReplicator() &&
@@ -582,6 +581,14 @@ VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
   return P;
 }
 
+const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
+  return getEnclosingLoopRegionImpl(getParent());
+}
+
+VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
+  return getEnclosingLoopRegionImpl(getParent());
+}
+
 static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
   if (VPBB->empty()) {
     assert(
@@ -933,7 +940,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
       createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
   if (VF.getNumUsers() > 0) {
     VF.setUnderlyingValue(
-        createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
+        getRuntimeVF(Builder, TripCountV->getType(), State.VF));
   }
 
   // When vectorizing the epilogue loop, the canonical induction start value
@@ -1053,10 +1060,12 @@ void VPlan::execute(VPTransformState *State) {
       // Move the last step to the end of the latch block. This ensures
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+      // When the VPlan has been unrolled, chain the steps of the unrolled
+      // parts together.
       if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
         Inc->setOperand(0, State->get(R.getOperand(3), 0));
-
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
       continue;
     }
 
@@ -1427,7 +1436,8 @@ void VPlanIngredient::print(raw_ostream &O) const {
 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
 
 bool VPValue::isDefinedOutsideVectorRegions() const {
-  return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent();
+  return !hasDefiningRecipe() ||
+         !getDefiningRecipe()->getParent()->getEnclosingLoopRegion();
 }
 
 void VPValue::replaceAllUsesWith(VPValue *New) {
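
The getEnclosingLoopRegion change above uses a standard C++ deduplication
idiom: implement the walk once as a function template over the possibly
const-qualified type, and forward both the const and non-const member
functions to it. A standalone sketch of the idiom (toy types, not the VPlan
classes):

#include <cassert>

struct Node {
  Node *Parent = nullptr;
  bool Replicator = false;
};

// Shared implementation; T is Node or const Node.
template <typename T> static T *enclosingImpl(T *P) {
  if (P && P->Replicator)
    P = P->Parent; // skip one replicator level
  return P;
}

struct Block {
  Node *Parent = nullptr;
  Node *enclosing() { return enclosingImpl(Parent); }
  const Node *enclosing() const { return enclosingImpl<const Node>(Parent); }
};

int main() {
  Node Outer, Rep{&Outer, true};
  Block B{&Rep};
  const Block &CB = B;
  assert(B.enclosing() == &Outer && CB.enclosing() == &Outer);
}
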
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 311d12cf4f1965..778e5635cf3a14 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -535,6 +535,7 @@ class VPBlockBase {
   VPBlocksTy &getSuccessors() { return Successors; }
 
   iterator_range<VPBlockBase **> successors() { return Successors; }
+  iterator_range<VPBlockBase **> predecessors() { return Predecessors; }
 
   const VPBlocksTy &getPredecessors() const { return Predecessors; }
   VPBlocksTy &getPredecessors() { return Predecessors; }
@@ -1400,7 +1401,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
   /// result is also a single scalar.
   bool isSingleScalar() const;
 
-  /// Return the interleave count from the VPInstruction's last argument.
+  /// Return the interleave count from the VPInstruction's last operand.
   unsigned getInterleaveCount() const;
 };
 
@@ -1690,7 +1691,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
                                      isInBounds(), getDebugLoc());
   }
 
-  /// Return the current part for this vector pointer.
+  /// Return the part associated with this vector pointer.
   unsigned getPartForRecipe() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2034,7 +2035,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   /// Returns true, if the phi is part of an in-loop reduction.
   bool isInLoop() const { return IsInLoop; }
 
-  /// Return the current part for this scalar step.
+  /// Return the part associated with this reduction phi.
   unsigned getPartForRecipe() const;
 };
 
@@ -2746,9 +2747,6 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
   /// Generate the canonical scalar induction phi of the vector loop.
   void execute(VPTransformState &State) override;
 
-  /// Return the current part for this scalar step.
-  unsigned getPartForRecipe() const;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2873,7 +2871,7 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
-  /// Return the current part for this scalar step.
+  /// Return the part associated with this widened IV.
   unsigned getPartForRecipe() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2989,7 +2987,7 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
     return true;
   }
 
-  /// Return the current part for this scalar step.
+  /// Return the part associated with this scalar step.
   unsigned getPartForRecipe() const;
 };
 
@@ -3093,6 +3091,7 @@ class VPBasicBlock : public VPBlockBase {
   VPBasicBlock *splitAt(iterator SplitAt);
 
   VPRegionBlock *getEnclosingLoopRegion();
+  const VPRegionBlock *getEnclosingLoopRegion() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
@@ -3315,6 +3314,7 @@ class VPlan {
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPValue VFxUF;
 
+  /// Represents the loop-invariant VF of the vector loop region.
   VPValue VF;
 
   /// Holds a mapping between Values and their corresponding VPValue inside
@@ -3620,6 +3620,19 @@ class VPBlockUtils {
     connectBlocks(BlockPtr, NewBlock);
   }
 
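+  /// Insert disconnected block \p NewBlock before \p BlockPtr. All
+  /// predecessors of \p BlockPtr are moved to \p NewBlock, \p NewBlock is
+  /// connected to \p BlockPtr as its single predecessor, and \p NewBlock's
+  /// parent is set to \p BlockPtr's parent.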
+  static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
+    assert(NewBlock->getSuccessors().empty() &&
+           NewBlock->getPredecessors().empty() &&
+           "Can't insert new block with predecessors or successors.");
+    NewBlock->setParent(BlockPtr->getParent());
+    SmallVector<VPBlockBase *> Preds(BlockPtr->predecessors());
+    for (VPBlockBase *Pred : Preds) {
+      disconnectBlocks(Pred, BlockPtr);
+      connectBlocks(Pred, NewBlock);
+    }
+    connectBlocks(NewBlock, BlockPtr);
+  }
+
   /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
   /// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p
   /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
@@ -3850,25 +3863,36 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
 /// Return true if \p V is a header mask in \p Plan.
 bool isHeaderMask(const VPValue *V, VPlan &Plan);
 
-/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
-/// if it is either defined outside the vector region or its operand is known to
-/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
+/// Checks if \p V is uniform across all VF lanes and UF parts. It is considered
+/// as such if it is either loop invariant (defined outside the vector region)
+/// or its operand is known to be uniform across all VFs and UFs (e.g.
+/// VPDerivedIV or VPCanonicalIVPHI).
 inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
-  if (V->isLiveIn())
+  // Loop invariants are uniform:
+  if (V->isDefinedOutsideVectorRegions())
     return true;
-  if (isa<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe, VPExpandSCEVRecipe>(V))
+
+  auto *R = V->getDefiningRecipe();
+  // Canonical IV chain is uniform:
+  auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
+  if (R == CanonicalIV || V == CanonicalIV->getBackedgeValue())
     return true;
-  auto *R = cast<VPSingleDefRecipe>(V->getDefiningRecipe());
-  if (R == R->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue())
+
+  // DerivedIV is uniform:
+  if (isa<VPDerivedIVRecipe>(R))
     return true;
+
+  // Loads and stores that are uniform across VF lanes are handled by
+  // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if all
+  // their operands are invariant:
   if (isa<VPReplicateRecipe>(V) && cast<VPReplicateRecipe>(V)->isUniform() &&
       (isa<LoadInst, StoreInst>(V->getUnderlyingValue())) &&
-      all_of(V->getDefiningRecipe()->operands(),
+      all_of(R->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); }))
     return true;
 
   return isa<VPScalarCastRecipe, VPWidenCastRecipe>(R) &&
-         (R->isDefinedOutsideVectorRegions() || R->getOperand(0)->isLiveIn() ||
+         (R->getOperand(0)->isLiveIn() ||
           isa<VPDerivedIVRecipe>(R->getOperand(0)) ||
           isa<VPCanonicalIVPHIRecipe>(R->getOperand(0)));
 }
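
The rewritten predicate above is a short-circuiting decision list: live-ins
and other loop invariants, the canonical-IV chain, derived IVs, uniform memory
replicates with invariant operands, and casts of such values. A standalone
sketch of the same early-return shape over toy types (simplified: the cast
clause below recurses, whereas the real check only accepts a few specific
operand kinds):

#include <algorithm>
#include <cassert>
#include <vector>

enum class Kind { CanonicalIV, DerivedIV, Cast, UniformMemOp, Other };

struct Val {
  Kind K = Kind::Other;
  bool OutsideLoop = false; // live-in / defined outside the loop region
  std::vector<Val *> Ops;
};

// Same short-circuiting shape as the predicate above: each clause is a
// conservative proof of uniformity across all lanes and parts.
bool isUniform(const Val *V) {
  if (V->OutsideLoop)
    return true; // loop invariants are trivially uniform
  if (V->K == Kind::CanonicalIV || V->K == Kind::DerivedIV)
    return true; // the scalar IV chain is uniform per part by construction
  if (V->K == Kind::UniformMemOp) // uniform replicate, invariant operands
    return std::all_of(V->Ops.begin(), V->Ops.end(),
                       [](const Val *Op) { return Op->OutsideLoop; });
  if (V->K == Kind::Cast) // cast of an already-uniform operand
    return !V->Ops.empty() && isUniform(V->Ops[0]);
  return false;
}

int main() {
  Val IV{Kind::CanonicalIV};
  Val C{Kind::Cast, false, {&IV}};
  assert(isUniform(&C));
}
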
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa5f51a8578200..cbe9f667a03ad7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -544,8 +544,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return CondBr;
   }
   case VPInstruction::ComputeReductionResult: {
-    unsigned NumParts = getNumOperands() - 1;
-
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
     // and will be removed by breaking up the recipe further.
     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
@@ -556,8 +554,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     RecurKind RK = RdxDesc.getRecurrenceKind();
 
     Type *PhiTy = OrigPhi->getType();
+    // The recipe's operands are the reduction phi, followed by one operand for
+    // each part of the reduction.
+    unsigned NumParts = getNumOperands() - 1;
     VectorParts RdxParts(NumParts);
-    for (unsigned Part = 0; Part != NumParts; ++Part)
+    for (unsigned Part = 0; Part < NumParts; ++Part)
       RdxParts[Part] = State.get(getOperand(1 + Part), 0, PhiR->isInLoop());
 
     // If the vector reduction can be performed in a smaller type, we truncate
@@ -688,6 +689,9 @@ bool VPInstruction::isSingleScalar() const {
 }
 
 unsigned VPInstruction::getInterleaveCount() const {
+  assert((getOpcode() == VPInstruction::CalculateTripCountMinusVF ||
+          getOpcode() == VPInstruction::CanonicalIVIncrementForPart) &&
+         "used with unexpected opcode");
   return getNumOperands() == 1
              ? 1
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
@@ -1313,7 +1317,6 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
 
   Value *SplatVF;
   if (getNumOperands() == 4) {
-    // Need to create stuff in PH.
     SplatVF = State.get(getOperand(2), 0);
   } else {
     // Multiply the vectorization factor by the step using integer or
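
ComputeReductionResult now takes one operand per unrolled part and folds them
into a single value before the final horizontal reduction. A standalone scalar
sketch of that per-part fold, assuming a plain integer add reduction; toy code,
not the recipe itself:

  #include <cstdio>
  #include <vector>

  int main() {
    // One partial reduction value per unrolled part (UF = 4).
    std::vector<int> RdxParts = {10, 20, 30, 40};

    // Fold all unrolled parts into a single value, mirroring the loop over
    // RdxParts above; the real code uses vector binops and the recurrence kind.
    int ReducedPartRdx = RdxParts[0];
    for (unsigned Part = 1; Part < RdxParts.size(); ++Part)
      ReducedPartRdx += RdxParts[Part];

    std::printf("%d\n", ReducedPartRdx); // prints 100
  }
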
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a9fe012110f393..945b395a1fd2e9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1629,16 +1629,19 @@ class InterleaveState {
 
 public:
   VPValue *getInterleavedValue(VPValue *V, unsigned IC) {
-    if (IC == 0)
-      return V;
-    if (V->isLiveIn())
+    if (IC == 0 || V->isLiveIn())
       return V;
+    assert(
+        (InterleavedValues.contains(V) && InterleavedValues[V].size() >= IC) &&
+        "accessed value does not exist");
     return InterleavedValues[V][IC - 1];
   }
 
-  void addInterleavedValues(VPRecipeBase *OrigR, VPRecipeBase *CopyR) {
+  void addInterleavedValues(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
+                            unsigned I) {
     for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
       auto Ins = InterleavedValues.insert({VPV, {}});
+      assert(Ins.first->second.size() == I - 1 && "earlier parts not set");
       Ins.first->second.push_back(CopyR->getVPValue(Idx));
     }
   }
@@ -1655,9 +1658,14 @@ class InterleaveState {
     return InterleavedValues;
   }
 
-  void remapOperands(VPRecipeBase *R, unsigned I) {
-    for (const auto &[Idx, Op] : enumerate(R->operands()))
-      R->setOperand(Idx, getInterleavedValue(Op, I));
+  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
+    auto *Op = R->getOperand(OpIdx);
+    R->setOperand(OpIdx, getInterleavedValue(Op, Part));
+  }
+
+  void remapOperands(VPRecipeBase *R, unsigned Part) {
+    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
+      R->setOperand(OpIdx, getInterleavedValue(Op, Part));
   }
 };
 } // namespace
@@ -1666,114 +1674,120 @@ static void interleaveReplicateRegion(VPRegionBlock *VPR, VPlan &Plan,
                                       unsigned IC,
                                       InterleaveState &InterleavedValues) {
   Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
-  VPBlockBase *InsertPt = VPR;
+  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
   for (unsigned I = 1; I != IC; ++I) {
     auto *Copy = VPR->clone();
-    VPBlockUtils::insertBlockAfter(Copy, InsertPt);
-    InsertPt = Copy;
-
-    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
-        RPOT(Copy->getEntry());
-    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
-        RPOT2(VPR->getEntry());
-    for (const auto &[New, Old] :
-         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT),
-             VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT2))) {
-      for (const auto &[CopyR, OrigR] : zip(*New, *Old)) {
-        InterleavedValues.remapOperands(&CopyR, I);
-        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&CopyR)) {
+    VPBlockUtils::insertBlockBefore(Copy, InsertPt);
+
+    auto PartI = vp_depth_first_shallow(Copy->getEntry());
+    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
+    for (const auto &[PartIVPBB, Part0VPBB] :
+         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
+             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
+      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
+        InterleavedValues.remapOperands(&PartIR, I);
+        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
           ScalarIVSteps->addOperand(
               Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
         }
 
-        InterleavedValues.addInterleavedValues(&OrigR, &CopyR);
+        InterleavedValues.addInterleavedValues(&Part0R, &PartIR, I);
       }
     }
   }
 }
 
+static void interleaveWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
+                                     VPlan &Plan, unsigned IC,
+                                     VPBasicBlock::iterator &InsertPtForPhi,
+                                     InterleaveState &InterleavedValues,
+                                     VPTypeAnalysis &TypeInfo,
+                                     SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
+  VPBasicBlock *PH = cast<VPBasicBlock>(
+      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
+  VPValue *Step = &Plan.getVF();
+  Type *IVTy = TypeInfo.inferScalarType(IV);
+  auto &ID = IV->getInductionDescriptor();
+  FastMathFlags FMFs;
+  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+    FMFs = ID.getInductionBinOp()->getFastMathFlags();
+
+  if (TypeInfo.inferScalarType(Step) != IVTy) {
+    Instruction::CastOps CastOp =
+        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
+    Step = new VPWidenCastRecipe(CastOp, Step, IV->getScalarType());
+    PH->appendRecipe(Step->getDefiningRecipe());
+    ToSkip.insert(Step->getDefiningRecipe());
+  }
+
+  auto *ConstScale =
+      IV->getOperand(1)->isLiveIn()
+          ? dyn_cast<ConstantInt>(IV->getOperand(1)->getLiveInIRValue())
+          : nullptr;
+  if (!ConstScale || ConstScale->getZExtValue() != 1) {
+    VPValue *Scale = IV->getOperand(1);
+    if (TypeInfo.inferScalarType(Scale) != IVTy) {
+      Scale =
+          new VPWidenCastRecipe(Instruction::Trunc, Scale, IV->getScalarType());
+      PH->appendRecipe(Scale->getDefiningRecipe());
+      ToSkip.insert(Scale->getDefiningRecipe());
+    }
+
+    VPBuilder Builder(PH);
+    VPInstruction *Mul;
+    if (IVTy->isFloatingPointTy())
+      Mul = Builder.createFPOp(Instruction::FMul, {Step, Scale},
+                               IV->getDebugLoc(), "", FMFs);
+    else
+      Mul = Builder.createNaryOp(Instruction::Mul, {Step, Scale},
+                                 IV->getDebugLoc());
+    Step = Mul;
+    ToSkip.insert(Mul);
+  }
+  IV->addOperand(Step);
+
+  for (unsigned I = 1; I != IC; ++I) {
+    VPBuilder Builder;
+    Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
+    VPValue *Prev = InterleavedValues.getInterleavedValue(IV, I - 1);
+    VPInstruction *Add;
+    std::string Name = I > 1 ? "step.add." + std::to_string(I) : "step.add";
+
+    if (IVTy->isFloatingPointTy())
+      Add = Builder.createFPOp(ID.getInductionOpcode(),
+                               {
+                                   Prev,
+                                   Step,
+                               },
+                               IV->getDebugLoc(), Name, FMFs);
+    else
+      Add = Builder.createNaryOp(Instruction::Add,
+                                 {
+                                     Prev,
+                                     Step,
+                                 },
+                                 IV->getDebugLoc(), Name);
+    ToSkip.insert(Add);
+    InterleavedValues.addInterleavedValues(IV, Add, I);
+    InsertPtForPhi = std::next(Add->getIterator());
+  }
+  IV->addOperand(InterleavedValues.getInterleavedValue(IV, IC - 1));
+}
+
 static void interleaveHeaderPHI(VPRecipeBase &R, VPlan &Plan, unsigned IC,
                                 VPBasicBlock::iterator &InsertPtForPhi,
                                 InterleaveState &InterleavedValues,
                                 VPTypeAnalysis &TypeInfo,
                                 SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
+  // First-order recurrences pass a single vector or scalar through their header
+  // phis, irrespective of interleaving.
   if (isa<VPFirstOrderRecurrencePHIRecipe>(&R))
     return;
 
   // Generate step vectors for each unrolled part.
   if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
-    VPBasicBlock *PH =
-        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
-    VPValue *Step = &Plan.getVF();
-    Type *IVTy = TypeInfo.inferScalarType(IV);
-    auto &ID = IV->getInductionDescriptor();
-    FastMathFlags FMFs;
-    if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
-      FMFs = ID.getInductionBinOp()->getFastMathFlags();
-
-    if (TypeInfo.inferScalarType(Step) != IVTy) {
-      Instruction::CastOps CastOp;
-      if (IVTy->isFloatingPointTy())
-        CastOp = Instruction::UIToFP;
-      else
-        CastOp = Instruction::Trunc;
-      Step = new VPWidenCastRecipe(CastOp, Step, IV->getScalarType());
-      PH->appendRecipe(Step->getDefiningRecipe());
-      ToSkip.insert(Step->getDefiningRecipe());
-    }
-
-    auto *ConstScale =
-        IV->getOperand(1)->isLiveIn()
-            ? dyn_cast<ConstantInt>(IV->getOperand(1)->getLiveInIRValue())
-            : nullptr;
-    if (!ConstScale || ConstScale->getZExtValue() != 1) {
-      VPValue *Scale = IV->getOperand(1);
-      if (TypeInfo.inferScalarType(Scale) != IVTy) {
-        Scale = new VPWidenCastRecipe(Instruction::Trunc, Scale,
-                                      IV->getScalarType());
-        PH->appendRecipe(Scale->getDefiningRecipe());
-        ToSkip.insert(Scale->getDefiningRecipe());
-      }
-
-      VPBuilder Builder(PH);
-      VPInstruction *Mul;
-      if (IVTy->isFloatingPointTy())
-        Mul = Builder.createFPOp(Instruction::FMul, {Step, Scale},
-                                 R.getDebugLoc(), "", FMFs);
-      else
-        Mul = Builder.createNaryOp(Instruction::Mul, {Step, Scale},
-                                   R.getDebugLoc());
-      Step = Mul;
-      ToSkip.insert(Mul);
-    }
-    R.addOperand(Step);
-
-    for (unsigned I = 1; I != IC; ++I) {
-      VPBuilder Builder;
-      Builder.setInsertPoint(R.getParent(), InsertPtForPhi);
-      VPValue *Prev = InterleavedValues.getInterleavedValue(IV, I - 1);
-      VPInstruction *Add;
-      std::string Name = I > 1 ? "step.add." + std::to_string(I) : "step.add";
-
-      if (IVTy->isFloatingPointTy())
-        Add = Builder.createFPOp(ID.getInductionOpcode(),
-                                 {
-                                     Prev,
-                                     Step,
-                                 },
-                                 R.getDebugLoc(), Name, FMFs);
-      else
-        Add = Builder.createNaryOp(Instruction::Add,
-                                   {
-                                       Prev,
-                                       Step,
-                                   },
-                                   R.getDebugLoc(), Name);
-      ToSkip.insert(Add);
-      InterleavedValues.addInterleavedValues(IV, Add);
-      InsertPtForPhi = std::next(Add->getIterator());
-    }
-    R.addOperand(InterleavedValues.getInterleavedValue(IV, IC - 1));
+    interleaveWidenInduction(IV, Plan, IC, InsertPtForPhi, InterleavedValues,
+                             TypeInfo, ToSkip);
     return;
   }
 
@@ -1783,7 +1797,7 @@ static void interleaveHeaderPHI(VPRecipeBase &R, VPlan &Plan, unsigned IC,
     VPRecipeBase *Copy = R.clone();
     Copy->insertAfter(InsertPt);
     InsertPt = Copy;
-    InterleavedValues.addInterleavedValues(&R, Copy);
+    InterleavedValues.addInterleavedValues(&R, Copy, I);
     if (isa<VPWidenPointerInductionRecipe>(&R)) {
       if (I == 1)
         R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
@@ -1803,6 +1817,7 @@ static void interleaveHeaderPHI(VPRecipeBase &R, VPlan &Plan, unsigned IC,
   }
 }
 
+/// Handle non-uniform, non-header-phi recipes.
 static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
                              InterleaveState &InterleavedValues,
                              VPTypeAnalysis &TypeInfo) {
@@ -1811,23 +1826,19 @@ static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
       match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
     return;
 
-  VPValue *Op1;
-  if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
-                    m_VPValue(), m_VPValue(Op1)))) {
-    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
-    for (unsigned I = 1; I != IC; ++I)
-      R.addOperand(InterleavedValues.getInterleavedValue(Op1, I));
-    return;
-  }
   VPValue *Op0;
   if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(m_VPValue(Op0),
                                                                m_VPValue()))) {
     InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
-    bool ScalarVFOnly = Plan.hasScalarVFOnly();
-    if (!ScalarVFOnly) {
+    if (Plan.hasScalarVFOnly()) {
+      unsigned Offset = cast<ConstantInt>(R.getOperand(1)->getLiveInIRValue())
+                            ->getZExtValue();
+      R.getVPSingleValue()->replaceAllUsesWith(
+          InterleavedValues.getInterleavedValue(Op0, IC - Offset));
+    } else {
       InterleavedValues.remapOperands(&R, IC - 1);
-      return;
     }
+    return;
   }
 
   Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
@@ -1863,7 +1874,7 @@ static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
     VPRecipeBase *Copy = R.clone();
     Copy->insertAfter(InsertPt);
     InsertPt = Copy;
-    InterleavedValues.addInterleavedValues(&R, Copy);
+    InterleavedValues.addInterleavedValues(&R, Copy, I);
 
     if (match(&R, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                       m_VPValue())))
@@ -1900,20 +1911,21 @@ static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
   }
 }
 
+using namespace llvm::VPlanPatternMatch;
 static void interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
                             InterleaveState &InterleavedValues,
                             VPTypeAnalysis &TypeInfo,
                             SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
   auto *VPR = dyn_cast<VPRegionBlock>(VPB);
   if (VPR) {
-    if (VPR->isReplicator())
+    if (VPR->isReplicator()) {
       interleaveReplicateRegion(VPR, Plan, IC, InterleavedValues);
-    else {
-      ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
-          RPOT(VPR->getEntry());
-      for (VPBlockBase *VPB : RPOT) {
-        interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip);
-      }
+      return;
+    }
+    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+        RPOT(VPR->getEntry());
+    for (VPBlockBase *VPB : RPOT) {
+      interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip);
     }
     return;
   }
@@ -1924,6 +1936,17 @@ static void interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
     if (ToSkip.contains(&R))
       continue;
 
+    // Add all VPValues for all parts to ComputeReductionResult which combines
+    // the parts to compute the final reduction value.
+    VPValue *Op1;
+    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
+                      m_VPValue(), m_VPValue(Op1)))) {
+      InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
+      for (unsigned I = 1; I != IC; ++I)
+        R.addOperand(InterleavedValues.getInterleavedValue(Op1, I));
+      continue;
+    }
+
     auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
     if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
       InterleavedValues.addUniform(SingleDef, IC);
@@ -1933,68 +1956,64 @@ static void interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
     if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
       interleaveHeaderPHI(R, Plan, IC, InsertPtForPhi, InterleavedValues,
                           TypeInfo, ToSkip);
-      continue;
+    } else {
+      interleaveRecipe(R, Plan, IC, InterleavedValues, TypeInfo);
     }
-
-    interleaveRecipe(R, Plan, IC, InterleavedValues, TypeInfo);
   }
 }
 
-void VPlanTransforms::interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx) {
-  assert(IC > 0);
+void VPlanTransforms::interleaveByUF(VPlan &Plan, unsigned IC,
+                                     LLVMContext &Ctx) {
+  assert(IC > 0 && "Interleave count must be positive");
   if (IC == 1)
     return;
+
+  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
+  // ..., UF-1.
   InterleaveState InterleavedValues;
 
+  // Interleaving may create recipes that should not be interleaved themselves.
+  // Those are tracked in ToSkip.
   SmallPtrSet<VPRecipeBase *, 8> ToSkip;
   Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
   VPTypeAnalysis TypeInfo(CanIVIntTy, Ctx);
-  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
-      Plan.getEntry());
   interleaveBlock(Plan.getPreheader(), Plan, IC, InterleavedValues, TypeInfo,
                   ToSkip);
 
-  for (VPBlockBase *VPB : RPOT) {
+  // Iterate over all blocks in the plan starting from Entry, and interleave
+  // recipes inside them. This includes the vector preheader and middle blocks,
+  // which may set up or post-process per-part values.
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBlockBase *VPB : RPOT)
     interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip);
-  }
 
-  unsigned I = 1;
+  unsigned Part = 1;
+  // Remap operands of cloned header phis to update backedge values. The header
+  // phis cloned during interleaving are placed just after the header phi for
+  // part 0. Reset Part to 1 when reaching the next part-0 header phi.
   for (VPRecipeBase &H :
        Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+    // The second operand of fixed-order recurrence phis, feeding the spliced
+    // value across the backedge, needs to be remapped to the last part of the
+    // spliced value.
     if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
-      H.setOperand(
-          1, InterleavedValues.getInterleavedValue(H.getOperand(1), IC - 1));
+      InterleavedValues.remapOperand(&H, 1, IC - 1);
       continue;
     }
     if (InterleavedValues.contains(H.getVPSingleValue()) ||
         isa<VPWidenPointerInductionRecipe>(&H)) {
-      I = 1;
+      Part = 1;
       continue;
     }
-    InterleavedValues.remapOperands(&H, I);
-    I++;
+    InterleavedValues.remapOperands(&H, Part);
+    Part++;
   }
 
-  using namespace llvm::VPlanPatternMatch;
-  bool ScalarVFOnly = Plan.hasScalarVFOnly();
+  // Remap the operand of each live-out to the last part.
   for (const auto &[_, LO] : Plan.getLiveOuts()) {
-    VPValue *In = nullptr;
-    VPValue *Op0;
-    if (ScalarVFOnly &&
-        match(LO->getOperand(0), m_VPInstruction<VPInstruction::ExtractFromEnd>(
-                                     m_VPValue(Op0), m_VPValue()))) {
-      VPInstruction *Extract =
-          cast<VPInstruction>(LO->getOperand(0)->getDefiningRecipe());
-      unsigned Offset =
-          cast<ConstantInt>(Extract->getOperand(1)->getLiveInIRValue())
-              ->getZExtValue();
-      In = InterleavedValues.getInterleavedValue(Op0, IC - Offset);
-      LO->setOperand(0, In);
-      Extract->getDefiningRecipe()->eraseFromParent();
-      continue;
-    } else
-      In = InterleavedValues.getInterleavedValue(LO->getOperand(0), IC - 1);
-
+    VPValue *In =
+        InterleavedValues.getInterleavedValue(LO->getOperand(0), IC - 1);
     LO->setOperand(0, In);
   }
 }
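
The transform's central bookkeeping is the map from each part-0 VPValue to its
clones for parts 1..IC-1. A minimal standalone sketch of that mapping, using
toy types and hypothetical names rather than the InterleaveState class itself:

  #include <cassert>
  #include <unordered_map>
  #include <vector>

  struct ToyValue {
    bool IsLiveIn = false;
  };

  // Simplified analogue of InterleaveState: part 0 keeps the original
  // values; parts 1..IC-1 map to their clones.
  class ToyPartMap {
    std::unordered_map<ToyValue *, std::vector<ToyValue *>> Parts;

  public:
    void addPart(ToyValue *Orig, ToyValue *Clone, unsigned Part) {
      auto &Copies = Parts[Orig];
      assert(Copies.size() == Part - 1 && "earlier parts not set");
      Copies.push_back(Clone);
    }

    // Live-ins and part 0 map to themselves.
    ToyValue *getValueForPart(ToyValue *V, unsigned Part) {
      if (Part == 0 || V->IsLiveIn)
        return V;
      return Parts.at(V)[Part - 1];
    }
  };

  int main() {
    ToyValue Orig, Copy1, Copy2;
    ToyPartMap Map;
    Map.addPart(&Orig, &Copy1, 1);
    Map.addPart(&Orig, &Copy2, 2);
    assert(Map.getValueForPart(&Orig, 0) == &Orig);
    assert(Map.getValueForPart(&Orig, 2) == &Copy2);
  }
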
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 67d22ce46b6d92..ef4a4d73cee5e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -107,7 +107,7 @@ struct VPlanTransforms {
   /// \returns true if the transformation succeeds, or false if it doesn't.
   static bool tryAddExplicitVectorLength(VPlan &Plan);
 
-  static void interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx);
+  static void interleaveByUF(VPlan &Plan, unsigned IC, LLVMContext &Ctx);
 };
 
 } // namespace llvm

>From 41b7cc93f10db43c3d9126e154d3646e5d2578da Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 5 Sep 2024 12:33:27 +0100
Subject: [PATCH 05/15] !fixup address comments, thanks

---
 .../Vectorize/LoopVectorizationPlanner.h      |   4 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |   6 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  12 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  81 ++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  94 +++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 363 +++++++++---------
 .../Transforms/Vectorize/VPlanTransforms.h    |   2 +-
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |  35 ++
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    |   7 +
 .../vplan-printing-before-execute.ll          |   5 +-
 10 files changed, 320 insertions(+), 289 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a30cfd17fa78fc..26e299d1790802 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -164,8 +164,8 @@ class VPBuilder {
 
   VPInstruction *createFPOp(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
-                            DebugLoc DL = {}, const Twine &Name = "",
-                            FastMathFlags FMFs = {}) {
+                            FastMathFlags FMFs = {}, DebugLoc DL = {},
+                            const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, FMFs, DL, Name));
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 00898aab018c7d..3b5045d4840611 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7524,8 +7524,10 @@ LoopVectorizationPlanner::executePlan(
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
   (void)IsEpilogueVectorization;
 
-  VPlanTransforms::interleaveByUF(
-      BestVPlan, BestUF, OrigLoop->getHeader()->getModule()->getContext());
+  // TODO: Move to VPlan transform stage once the transition to the VPlan-based
+  // cost model is complete for better cost estimates.
+  VPlanTransforms::unrollByUF(BestVPlan, BestUF,
+                              OrigLoop->getHeader()->getModule()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f5feb926665486..bd55863bea5afb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -577,7 +577,9 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
   return SplitBlock;
 }
 
-template <typename T> static T *getEnclosingLoopRegionImpl(T *P) {
+/// Return the enclosing loop region for region \p P. The templated version is
+/// used to support both const and non-const block arguments.
+template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
   if (P && P->isReplicator()) {
     P = P->getParent();
     assert(!cast<VPRegionBlock>(P)->isReplicator() &&
@@ -587,11 +589,11 @@ template <typename T> static T *getEnclosingLoopRegionImpl(T *P) {
 }
 
 const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
-  return getEnclosingLoopRegionImpl(getParent());
+  return getEnclosingLoopRegionForRegion(getParent());
 }
 
 VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
-  return getEnclosingLoopRegionImpl(getParent());
+  return getEnclosingLoopRegionForRegion(getParent());
 }
 
 static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
@@ -1069,8 +1071,8 @@ void VPlan::execute(VPTransformState *State) {
 
       // When the VPlan has been unrolled, chain together the steps of the
       // unrolled parts together.
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
-        Inc->setOperand(0, State->get(R.getOperand(3), 0));
+      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+        Inc->setOperand(0, State->get(IV->getUnrolledPart(), 0));
       continue;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0190831b4651ad..3181805dc787c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -728,6 +728,9 @@ class VPLiveOut : public VPUser {
 
   PHINode *getPhi() const { return Phi; }
 
+  /// Live-outs are marked as only using the first part during the transition
+  /// to unrolling directly on VPlan.
+  /// TODO: Remove after unroller transition.
   bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1404,9 +1407,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
   /// Returns true if this VPInstruction's operands are single scalars and the
   /// result is also a single scalar.
   bool isSingleScalar() const;
-
-  /// Return the interleave count from VPInstruction's last operand.
-  unsigned getInterleaveCount() const;
 };
 
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
@@ -1695,7 +1695,8 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
   bool onlyFirstPartUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-    return Op == getOperand(0);
+    assert(getNumOperands() <= 2 && "must have at most two operands");
+    return true;
   }
 
   VPVectorPointerRecipe *clone() override {
@@ -1703,8 +1704,8 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
                                      isInBounds(), getDebugLoc());
   }
 
-  /// Return the part associated with this vector pointer.
-  unsigned getPartForRecipe() const;
+  /// Return the unroll part for this vector pointer.
+  unsigned getUnrollPart() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -1870,6 +1871,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   Type *getScalarType() const {
     return Trunc ? Trunc->getType() : IV->getType();
   }
+
+  /// Returns the VPValue representing the value of this induction at the last
+  /// unrolled part, if one exists; otherwise returns the recipe itself.
+  VPValue *getUnrolledPart() {
+    return getNumOperands() == 4 ? getOperand(3) : this;
+  }
 };
 
 class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
@@ -1909,6 +1916,9 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
   /// Returns the induction descriptor for the recipe.
   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
 
+  /// Return the unroll part for this pointer induction phi.
+  unsigned getUnrollPart() const;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2047,8 +2057,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   /// Returns true, if the phi is part of an in-loop reduction.
   bool isInLoop() const { return IsInLoop; }
 
-  /// Return the part associated with this reduction phi.
-  unsigned getPartForRecipe() const;
+  /// Return the unroll part for this reduction phi.
+  unsigned getUnrollPart() const;
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -2814,7 +2824,8 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 
   VPActiveLaneMaskPHIRecipe *clone() override {
     auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
-    R->addOperand(getOperand(1));
+    if (getNumOperands() == 2)
+      R->addOperand(getOperand(1));
     return R;
   }
 
@@ -2893,8 +2904,8 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
-  /// Return the part associated with this widened IV.
-  unsigned getPartForRecipe() const;
+  /// Return the unroll part for this widened IV.
+  unsigned getUnrollPart() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -3009,8 +3020,8 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
     return true;
   }
 
-  /// Return the part associated with this scalar step
-  unsigned getPartForRecipe() const;
+  /// Return the unroll part for this scalar step.
+  unsigned getUnrollPart() const;
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -3457,6 +3468,10 @@ class VPlan {
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
+  unsigned getUF() const {
+    assert(UFs.size() == 1 && "Expected a single UF");
+    return UFs[0];
+  }
 
   void setUF(unsigned UF) {
     assert(hasUF(UF) && "Cannot set the UF not already in plan");
@@ -3642,13 +3657,16 @@ class VPBlockUtils {
     connectBlocks(BlockPtr, NewBlock);
   }
 
+  /// Insert disconnected block \p NewBlock before \p BlockPtr. First
+  /// disconnects all predecessors of \p BlockPtr and connects them to \p
+  /// NewBlock. Add \p NewBlock as predecessor of \p BlockPtr and \p BlockPtr as
+  /// successor of \p NewBlock.
   static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
     assert(NewBlock->getSuccessors().empty() &&
            NewBlock->getPredecessors().empty() &&
            "Can't insert new block with predecessors or successors.");
     NewBlock->setParent(BlockPtr->getParent());
-    SmallVector<VPBlockBase *> Preds(BlockPtr->predecessors());
-    for (VPBlockBase *Pred : Preds) {
+    for (VPBlockBase *Pred : to_vector(BlockPtr->predecessors())) {
       disconnectBlocks(Pred, BlockPtr);
       connectBlocks(Pred, NewBlock);
     }
@@ -3851,39 +3869,6 @@ class VPlanSlp {
 
 namespace vputils {
 
-/// Checks if \p V is uniform across all VF lanes and UF parts. It is considered
-/// as such if it is either loop invariant (defined outside the vector region)
-/// or its operand is known to be uniform across all VFs and UFs (e.g.
-/// VPDerivedIV or VPCanonicalIVPHI).
-inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
-  // Loop invariants are uniform:
-  if (V->isDefinedOutsideVectorRegions())
-    return true;
-
-  auto *R = V->getDefiningRecipe();
-  // Canonical IV chain is uniform:
-  auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
-  if (R == CanonicalIV || V == CanonicalIV->getBackedgeValue())
-    return true;
-
-  // DerivedIV is uniform:
-  if (isa<VPDerivedIVRecipe>(R))
-    return true;
-
-  // Loads and stores that are uniform across VF lanes are handled by
-  // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if all
-  // their operands are invariant:
-  if (isa<VPReplicateRecipe>(V) && cast<VPReplicateRecipe>(V)->isUniform() &&
-      (isa<LoadInst, StoreInst>(V->getUnderlyingValue())) &&
-      all_of(R->operands(),
-             [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); }))
-    return true;
-
-  return isa<VPScalarCastRecipe, VPWidenCastRecipe>(R) &&
-         (R->getOperand(0)->isLiveIn() ||
-          isa<VPDerivedIVRecipe>(R->getOperand(0)) ||
-          isa<VPCanonicalIVPHIRecipe>(R->getOperand(0)));
-}
 
 } // end namespace vputils
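
To make the unrolled widened-induction layout concrete: part 0 is the header
phi, and each later part adds the vector step VF * Step to the previous one.
A scalarized standalone sketch of the values the parts hold on the first
iteration, with a fixed VF and UF; toy code, not VPlan:

  #include <cstdio>
  #include <vector>

  int main() {
    const unsigned VF = 4, UF = 3;
    const int Start = 0, Step = 2;

    // Part 0 is the header phi <Start, Start+Step, ...>; parts 1..UF-1 are
    // formed by repeatedly adding the vector step VF * Step ("step.add").
    std::vector<std::vector<int>> Parts(UF, std::vector<int>(VF));
    for (unsigned L = 0; L != VF; ++L)
      Parts[0][L] = Start + int(L) * Step;
    for (unsigned P = 1; P != UF; ++P)
      for (unsigned L = 0; L != VF; ++L)
        Parts[P][L] = Parts[P - 1][L] + int(VF) * Step;

    for (unsigned P = 0; P != UF; ++P) {
      std::printf("part %u:", P);
      for (int V : Parts[P])
        std::printf(" %d", V);
      std::printf("\n");
    }
  }
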
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7cf3f5edb6b5dc..bab89b9a658956 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -454,7 +454,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     if (Part != 0)
       return State.get(this, 0, /*IsScalar*/ true);
 
-    unsigned UF = getInterleaveCount();
+    unsigned UF = getParent()->getPlan()->getUF();
     Value *ScalarTC = State.get(getOperand(0), {0, 0});
     Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
     Value *Sub = Builder.CreateSub(ScalarTC, Step);
@@ -488,16 +488,15 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
+    unsigned UF = getParent()->getPlan()->getUF();
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
-    if (getNumOperands() == 2) {
-      // The canonical IV is incremented by the vectorization factor (num of
-      // SIMD elements) times the unroll part.
-      Value *Step = createStepForVF(Builder, IV->getType(), State.VF,
-                                    getInterleaveCount());
-      return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
-                               hasNoSignedWrap());
-    }
-    return IV;
+    if (UF == 1)
+      return IV;
+    // The canonical IV is incremented by the vectorization factor (num of
+    // SIMD elements) times the unroll part.
+    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, UF);
+    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
+                             hasNoSignedWrap());
   }
   case VPInstruction::BranchOnCond: {
     if (Part != 0)
@@ -557,9 +556,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     Type *PhiTy = OrigPhi->getType();
     // The recipe's operands are the reduction phi, followed by one operand for
     // each part of the reduction.
-    unsigned NumParts = getNumOperands() - 1;
-    VectorParts RdxParts(NumParts);
-    for (unsigned Part = 0; Part < NumParts; ++Part)
+    unsigned UF = getNumOperands() - 1;
+    VectorParts RdxParts(UF);
+    for (unsigned Part = 0; Part < UF; ++Part)
       RdxParts[Part] = State.get(getOperand(1 + Part), 0, PhiR->isInLoop());
 
     // If the vector reduction can be performed in a smaller type, we truncate
@@ -568,7 +567,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     // TODO: Handle this in truncateToMinBW.
     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
-      for (unsigned Part = 0; Part < NumParts; ++Part)
+      for (unsigned Part = 0; Part < UF; ++Part)
         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
     }
     // Reduce all of the unrolled parts into a single vector.
@@ -578,12 +577,12 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
       Op = Instruction::Or;
 
     if (PhiR->isOrdered()) {
-      ReducedPartRdx = RdxParts[NumParts - 1];
+      ReducedPartRdx = RdxParts[UF - 1];
     } else {
       // Floating-point operations should have some FMF to enable the reduction.
       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-      for (unsigned Part = 1; Part < NumParts; ++Part) {
+      for (unsigned Part = 1; Part < UF; ++Part) {
         Value *RdxPart = RdxParts[Part];
         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
           ReducedPartRdx = Builder.CreateBinOp(
@@ -689,16 +688,6 @@ bool VPInstruction::isSingleScalar() const {
   return getOpcode() == VPInstruction::ResumePhi;
 }
 
-unsigned VPInstruction::getInterleaveCount() const {
-  assert((getOpcode() == VPInstruction::CalculateTripCountMinusVF ||
-          getOpcode() == VPInstruction::CanonicalIVIncrementForPart) &&
-         "used with unexpected opcode");
-  return getNumOperands() == 1
-             ? 1
-             : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
-                   ->getZExtValue();
-}
-
 #if !defined(NDEBUG)
 bool VPInstruction::isFPMathOp() const {
   // Inspired by FPMathOperator::classof. Notable differences are that we don't
@@ -1439,6 +1428,8 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
 
   Value *SplatVF;
   if (getNumOperands() == 4) {
+    // The recipe has been unrolled. In that case, fetch the splat value for the
+    // induction increment.
     SplatVF = State.get(getOperand(2), 0);
   } else {
     // Multiply the vectorization factor by the step using integer or
@@ -1580,7 +1571,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 
   unsigned StartPart = 0;
-  unsigned EndPart = State.UF;
+  unsigned EndPart = 1;
   unsigned StartLane = 0;
   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
   if (State.Instance) {
@@ -1589,9 +1580,10 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
     StartLane = State.Instance->Lane.getKnownLane();
     EndLane = StartLane + 1;
   }
+  // TODO: Remove loop after VPlan-based unroller lands.
   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
     Value *StartIdx0 =
-        createStepForVF(Builder, IntStepTy, State.VF, getPartForRecipe());
+        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart());
 
     if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
@@ -1624,7 +1616,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPScalarIVStepsRecipe::getPartForRecipe() const {
+unsigned VPScalarIVStepsRecipe::getUnrollPart() const {
   return getNumOperands() == 2
              ? 0
              : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
@@ -1732,7 +1724,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
-  unsigned CurrentPart = getPartForRecipe();
+  unsigned CurrentPart = getUnrollPart();
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // Calculate the pointer for the specific unroll-part.
     Value *PartPtr = nullptr;
@@ -1769,7 +1761,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPVectorPointerRecipe::getPartForRecipe() const {
+unsigned VPVectorPointerRecipe::getUnrollPart() const {
   return getNumOperands() == 1
              ? 0
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
@@ -2845,20 +2837,18 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   auto *IVR = getParent()->getPlan()->getCanonicalIV();
   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
-  unsigned CurrentPart = 0;
-  if (getNumOperands() == 5)
-    CurrentPart =
-        cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
-  Type *PhiType = IndDesc.getStep()->getType();
+  unsigned CurrentPart = getUnrollPart();
 
   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
   Type *ScStValueType = ScalarStartValue->getType();
-  PHINode *NewPointerPhi = nullptr;
 
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
-  if (getNumOperands() == 5) {
-    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
+  PHINode *NewPointerPhi = nullptr;
+  if (CurrentPart != 0) {
+    // The recipe has been unrolled. In that case, fetch the single pointer phi
+    // shared among all unrolled parts of the recipe.
+    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(2), 0));
     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
   } else {
     NewPointerPhi =
@@ -2868,12 +2858,9 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   // A pointer induction, performed by using a gep
   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
-  unsigned UF = getNumOperands() == 2
-                    ? 1
-                    : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
-                          ->getZExtValue();
-
+  unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1;
   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
+  Type *PhiType = IndDesc.getStep()->getType();
   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
   Value *NumUnrolledElems =
       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
@@ -2882,7 +2869,9 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   // block cannot be used, as it does not exist yet.
   // TODO: Model increment value in VPlan, by turning the recipe into a
   // multi-def and a subclass of VPHeaderPHIRecipe.
-  if (getNumOperands() != 5) {
+  if (getNumOperands() != 4) {
+    // The recipe represents the first part of the pointer induction. Create the
+    // GEP to increment the phi.
     Value *InductionGEP = GetElementPtrInst::Create(
         State.Builder.getInt8Ty(), NewPointerPhi,
         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
@@ -2916,6 +2905,13 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   }
 }
 
+unsigned VPWidenPointerInductionRecipe::getUnrollPart() const {
+  return getNumOperands() == 4
+             ? cast<ConstantInt>(getOperand(3)->getLiveInIRValue())
+                   ->getZExtValue()
+             : 0;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
@@ -2967,7 +2963,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
                       ? CanonicalIV
                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
-    Value *VStep = createStepForVF(Builder, STy, VF, getPartForRecipe());
+    Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart());
     if (VF.isVector()) {
       VStep = Builder.CreateVectorSplat(VF, VStep);
       VStep =
@@ -2978,7 +2974,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPWidenCanonicalIVRecipe::getPartForRecipe() const {
+unsigned VPWidenCanonicalIVRecipe::getUnrollPart() const {
   return getNumOperands() == 1
              ? 0
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
@@ -3063,7 +3059,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
 
   Value *Iden = nullptr;
   RecurKind RK = RdxDesc.getRecurrenceKind();
-  unsigned CurrentPart = getPartForRecipe();
+  unsigned CurrentPart = getUnrollPart();
 
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
@@ -3102,7 +3098,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPReductionPHIRecipe::getPartForRecipe() const {
+unsigned VPReductionPHIRecipe::getUnrollPart() const {
   return getNumOperands() == 2
              ? 0
              : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
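
The getUnrollPart accessors above all follow the same convention: a part-0
recipe keeps its original operand list, while unrolled copies carry one extra
constant operand holding the part index. A standalone sketch of that encoding
with toy types, not the real recipes:

  #include <cassert>
  #include <vector>

  // Toy recipe: the trailing operand, when present, encodes the unroll part.
  struct ToyRecipe {
    std::vector<unsigned> Operands; // constants only, for the sketch
    unsigned NumBaseOperands;       // operand count before unrolling

    unsigned getUnrollPart() const {
      return Operands.size() == NumBaseOperands ? 0 : Operands.back();
    }
  };

  int main() {
    ToyRecipe Part0{{7, 3}, 2};    // original recipe, no part operand
    ToyRecipe Part2{{7, 3, 2}, 2}; // clone for part 2
    assert(Part0.getUnrollPart() == 0);
    assert(Part2.getUnrollPart() == 2);
  }
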
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 454cc980506b90..0c9d5f752398ed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1569,58 +1569,85 @@ void VPlanTransforms::createInterleaveGroups(
 }
 
 namespace {
-class InterleaveState {
-  DenseMap<VPValue *, SmallVector<VPValue *>> InterleavedValues;
+
+/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
+/// UF. It also holds copies of VPValues for the UF-1 cloned unroll parts, to
+/// facilitate the unrolling transformation; the original VPValues are retained
+/// for part zero.
+class UnrollState {
+  /// Plan to unroll.
+  VPlan &Plan;
+  /// Unroll factor to unroll by.
+  const unsigned UF;
+  /// Analysis for types.
+  VPTypeAnalysis TypeInfo;
+
+  /// Unrolling may create recipes that should not be unrolled themselves.
+  /// Those are tracked in ToSkip.
+  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
+
+  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
+  // ..., UF-1.
+  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
+
+  void unrollReplicateRegion(VPRegionBlock *VPR);
+  void unrollRecipe(VPRecipeBase &R);
+  void unrollHeaderPHI(VPRecipeBase &R, VPBasicBlock::iterator InsertPtForPhi);
+  void unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
+                            VPBasicBlock::iterator InsertPtForPhi);
+
+  VPValue *getConstantVPV(unsigned Part) {
+    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+  }
 
 public:
-  VPValue *getInterleavedValue(VPValue *V, unsigned IC) {
-    if (IC == 0 || V->isLiveIn())
+  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
+      : Plan(Plan), UF(UF),
+        TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx) {}
+
+  VPValue *getValueForPart(VPValue *V, unsigned Part) {
+    if (Part == 0 || V->isLiveIn())
       return V;
-    assert(
-        (InterleavedValues.contains(V) && InterleavedValues[V].size() >= IC) &&
-        "accessed value does not exist");
-    return InterleavedValues[V][IC - 1];
+    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
+           "accessed value does not exist");
+    return VPV2Parts[V][Part - 1];
   }
 
-  void addInterleavedValues(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
-                            unsigned I) {
+  void addVPV2Parts(VPRecipeBase *OrigR, VPRecipeBase *CopyR, unsigned Part) {
     for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
-      auto Ins = InterleavedValues.insert({VPV, {}});
-      assert(Ins.first->second.size() == I - 1 && "earlier parts not set");
+      auto Ins = VPV2Parts.insert({VPV, {}});
+      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
       Ins.first->second.push_back(CopyR->getVPValue(Idx));
     }
   }
 
-  void addUniform(VPSingleDefRecipe *R, unsigned IC) {
-    auto Ins = InterleavedValues.insert({R, {}});
-    for (unsigned I = 1; I != IC; ++I)
+  void addUniform(VPSingleDefRecipe *R) {
+    auto Ins = VPV2Parts.insert({R, {}});
+    assert(Ins.second && "uniform value already added");
+    for (unsigned Part = 0; Part != UF; ++Part)
       Ins.first->second.push_back(R);
   }
 
-  bool contains(VPValue *VPV) { return InterleavedValues.contains(VPV); }
-
-  DenseMap<VPValue *, SmallVector<VPValue *>> &getInterleavedValues() {
-    return InterleavedValues;
-  }
+  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
 
   void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
     auto *Op = R->getOperand(OpIdx);
-    R->setOperand(OpIdx, getInterleavedValue(Op, Part));
+    R->setOperand(OpIdx, getValueForPart(Op, Part));
   }
 
   void remapOperands(VPRecipeBase *R, unsigned Part) {
     for (const auto &[OpIdx, Op] : enumerate(R->operands()))
-      R->setOperand(OpIdx, getInterleavedValue(Op, Part));
+      R->setOperand(OpIdx, getValueForPart(Op, Part));
   }
+
+  void unrollBlock(VPBlockBase *VPB);
 };
 } // namespace
 
-static void interleaveReplicateRegion(VPRegionBlock *VPR, VPlan &Plan,
-                                      unsigned IC,
-                                      InterleaveState &InterleavedValues) {
-  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+void UnrollState::unrollReplicateRegion(VPRegionBlock *VPR) {
   VPBlockBase *InsertPt = VPR->getSingleSuccessor();
-  for (unsigned I = 1; I != IC; ++I) {
+  for (unsigned Part = 1; Part != UF; ++Part) {
     auto *Copy = VPR->clone();
     VPBlockUtils::insertBlockBefore(Copy, InsertPt);
 
@@ -1630,100 +1657,103 @@ static void interleaveReplicateRegion(VPRegionBlock *VPR, VPlan &Plan,
          zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
              VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
       for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
-        InterleavedValues.remapOperands(&PartIR, I);
+        remapOperands(&PartIR, Part);
         if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
-          ScalarIVSteps->addOperand(
-              Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+          ScalarIVSteps->addOperand(getConstantVPV(Part));
         }
 
-        InterleavedValues.addInterleavedValues(&Part0R, &PartIR, I);
+        addVPV2Parts(&Part0R, &PartIR, Part);
       }
     }
   }
 }
 
-static void interleaveWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
-                                     VPlan &Plan, unsigned IC,
-                                     VPBasicBlock::iterator &InsertPtForPhi,
-                                     InterleaveState &InterleavedValues,
-                                     VPTypeAnalysis &TypeInfo,
-                                     SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
+void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
+                                       VPBasicBlock::iterator InsertPtForPhi) {
   VPBasicBlock *PH = cast<VPBasicBlock>(
       IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
-  VPValue *Step = &Plan.getVF();
   Type *IVTy = TypeInfo.inferScalarType(IV);
   auto &ID = IV->getInductionDescriptor();
   FastMathFlags FMFs;
-  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
+  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
     FMFs = ID.getInductionBinOp()->getFastMathFlags();
 
-  if (TypeInfo.inferScalarType(Step) != IVTy) {
+  VPValue *VectorStep = &Plan.getVF();
+  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
     Instruction::CastOps CastOp =
         IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
-    Step = new VPWidenCastRecipe(CastOp, Step, IV->getScalarType());
-    PH->appendRecipe(Step->getDefiningRecipe());
-    ToSkip.insert(Step->getDefiningRecipe());
+    VectorStep = new VPWidenCastRecipe(CastOp, VectorStep, IVTy);
+    PH->appendRecipe(VectorStep->getDefiningRecipe());
+    ToSkip.insert(VectorStep->getDefiningRecipe());
   }
 
-  auto *ConstScale =
-      IV->getOperand(1)->isLiveIn()
-          ? dyn_cast<ConstantInt>(IV->getOperand(1)->getLiveInIRValue())
-          : nullptr;
-  if (!ConstScale || ConstScale->getZExtValue() != 1) {
-    VPValue *Scale = IV->getOperand(1);
-    if (TypeInfo.inferScalarType(Scale) != IVTy) {
-      Scale =
-          new VPWidenCastRecipe(Instruction::Trunc, Scale, IV->getScalarType());
-      PH->appendRecipe(Scale->getDefiningRecipe());
-      ToSkip.insert(Scale->getDefiningRecipe());
+  VPValue *ScalarStep = IV->getStepValue();
+  auto *ConstStep = ScalarStep->isLiveIn()
+                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
+                        : nullptr;
+  if (!ConstStep || ConstStep->getZExtValue() != 1) {
+    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
+      ScalarStep = new VPWidenCastRecipe(Instruction::Trunc, ScalarStep, IVTy);
+      PH->appendRecipe(ScalarStep->getDefiningRecipe());
+      ToSkip.insert(ScalarStep->getDefiningRecipe());
     }
 
     VPBuilder Builder(PH);
     VPInstruction *Mul;
     if (IVTy->isFloatingPointTy())
-      Mul = Builder.createFPOp(Instruction::FMul, {Step, Scale},
-                               IV->getDebugLoc(), "", FMFs);
+      Mul = Builder.createFPOp(Instruction::FMul, {VectorStep, ScalarStep},
+                               FMFs, IV->getDebugLoc());
     else
-      Mul = Builder.createNaryOp(Instruction::Mul, {Step, Scale},
+      Mul = Builder.createNaryOp(Instruction::Mul, {VectorStep, ScalarStep},
                                  IV->getDebugLoc());
-    Step = Mul;
+    VectorStep = Mul;
     ToSkip.insert(Mul);
   }
-  IV->addOperand(Step);
 
-  for (unsigned I = 1; I != IC; ++I) {
-    VPBuilder Builder;
-    Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
-    VPValue *Prev = InterleavedValues.getInterleavedValue(IV, I - 1);
+  // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
+  // remains the header phi. Parts > 0 are computed by adding VectorStep to the
+  // previous part. The header phi recipe will get 2 new operands: the step
+  // value for a single part and the last part, used to compute the backedge
+  // value during VPWidenIntOrFpInductionRecipe::execute. For UF = 4:
+  // %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
+  //           %VectorStep, %Part.3
+  // %Part.1 = %Part.0 + %VectorStep
+  // %Part.2 = %Part.1 + %VectorStep
+  // %Part.3 = %Part.2 + %VectorStep
+  //
+  // The newly added recipes are added to ToSkip to avoid unrolling them
+  // again.
+  VPValue *Prev = IV;
+  VPBuilder Builder;
+  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
+  for (unsigned Part = 1; Part != UF; ++Part) {
     VPInstruction *Add;
-    std::string Name = I > 1 ? "step.add." + std::to_string(I) : "step.add";
+    std::string Name =
+        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";
 
     if (IVTy->isFloatingPointTy())
       Add = Builder.createFPOp(ID.getInductionOpcode(),
                                {
                                    Prev,
-                                   Step,
+                                   VectorStep,
                                },
-                               IV->getDebugLoc(), Name, FMFs);
+                               FMFs, IV->getDebugLoc(), Name);
     else
       Add = Builder.createNaryOp(Instruction::Add,
                                  {
                                      Prev,
-                                     Step,
+                                     VectorStep,
                                  },
                                  IV->getDebugLoc(), Name);
     ToSkip.insert(Add);
-    InterleavedValues.addInterleavedValues(IV, Add, I);
-    InsertPtForPhi = std::next(Add->getIterator());
+    addVPV2Parts(IV, Add, Part);
+    Prev = Add;
   }
-  IV->addOperand(InterleavedValues.getInterleavedValue(IV, IC - 1));
+  IV->addOperand(VectorStep);
+  IV->addOperand(Prev);
 }
 
-static void interleaveHeaderPHI(VPRecipeBase &R, VPlan &Plan, unsigned IC,
-                                VPBasicBlock::iterator &InsertPtForPhi,
-                                InterleaveState &InterleavedValues,
-                                VPTypeAnalysis &TypeInfo,
-                                SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
+void UnrollState::unrollHeaderPHI(VPRecipeBase &R,
+                                  VPBasicBlock::iterator InsertPtForPhi) {
   // First-order recurrences pass a single vector or scalar through their header
   // phis, irrespective of interleaving.
   if (isa<VPFirstOrderRecurrencePHIRecipe>(&R))
@@ -1731,111 +1761,99 @@ static void interleaveHeaderPHI(VPRecipeBase &R, VPlan &Plan, unsigned IC,
 
   // Generate step vectors for each unrolled part.
   if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
-    interleaveWidenInduction(IV, Plan, IC, InsertPtForPhi, InterleavedValues,
-                             TypeInfo, ToSkip);
+    unrollWidenInduction(IV, InsertPtForPhi);
     return;
   }
 
-  VPRecipeBase *InsertPt = &R;
-  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
-  for (unsigned I = 1; I != IC; ++I) {
+  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(&R);
+  if (RdxPhi && RdxPhi->isOrdered())
+    return;
+
+  auto InsertPt = std::next(R.getIterator());
+  for (unsigned Part = 1; Part != UF; ++Part) {
     VPRecipeBase *Copy = R.clone();
-    Copy->insertAfter(InsertPt);
-    InsertPt = Copy;
-    InterleavedValues.addInterleavedValues(&R, Copy, I);
+    Copy->insertBefore(*R.getParent(), InsertPt);
+    addVPV2Parts(&R, Copy, Part);
     if (isa<VPWidenPointerInductionRecipe>(&R)) {
-      if (I == 1)
-        R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
-      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
       Copy->addOperand(R.getVPSingleValue());
-      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
-      continue;
-    }
-
-    if (auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
-      if (RdxPhi->isOrdered()) {
-        Copy->eraseFromParent();
-        break;
-      }
-      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+      Copy->addOperand(getConstantVPV(Part));
+    } else if (RdxPhi) {
+      Copy->addOperand(getConstantVPV(Part));
     }
   }
 }
 
-/// Handle non-uniform, non-header-phi recipes.
-static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
-                             InterleaveState &InterleavedValues,
-                             VPTypeAnalysis &TypeInfo) {
+/// Handle non-header-phi recipes.
+void UnrollState::unrollRecipe(VPRecipeBase &R) {
   using namespace llvm::VPlanPatternMatch;
   if (match(&R, m_BranchOnCond(m_VPValue())) ||
       match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
     return;
 
-  VPValue *Op0;
-  if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(m_VPValue(Op0),
-                                                               m_VPValue()))) {
-    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
-    if (Plan.hasScalarVFOnly()) {
-      unsigned Offset = cast<ConstantInt>(R.getOperand(1)->getLiveInIRValue())
-                            ->getZExtValue();
-      R.getVPSingleValue()->replaceAllUsesWith(
-          InterleavedValues.getInterleavedValue(Op0, IC - Offset));
-    } else {
-      InterleavedValues.remapOperands(&R, IC - 1);
+  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+    if (vputils::onlyFirstPartUsed(VPI)) {
+      addUniform(cast<VPInstruction>(&R));
+      return;
+    }
+    VPValue *Op0;
+    VPValue *Op1;
+    if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                       m_VPValue(Op0), m_VPValue(Op1)))) {
+      addUniform(VPI);
+      if (Plan.hasScalarVFOnly()) {
+        // Extracting from end with VF = 1 implies retrieving the scalar part
+        // UF - Op1.
+        unsigned Offset =
+            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
+        VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
+      } else {
+        // Otherwise we extract from the last part.
+        remapOperands(VPI, UF - 1);
+      }
+      return;
+    }
+    if (match(&R, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
+                      m_VPValue()))) {
+      addUniform(VPI);
+      return;
     }
-    return;
-  }
-
-  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
-  if (match(&R, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
-                    m_VPValue()))) {
-    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
-    R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
-    return;
   }
 
   if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
     if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
         RepR->getOperand(1)->isDefinedOutsideVectorRegions()) {
-      InterleavedValues.remapOperands(&R, IC - 1);
+      // Stores to an invariant address only need to store the last part.
+      remapOperands(&R, UF - 1);
       return;
     }
     if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
       if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
-        InterleavedValues.addUniform(RepR, IC);
+        addUniform(RepR);
         return;
       }
     }
   }
 
-  if (isa<VPInstruction>(&R) &&
-      vputils::onlyFirstPartUsed(R.getVPSingleValue())) {
-    InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
-    return;
-  }
-
-  VPRecipeBase *InsertPt = &R;
-  for (unsigned I = 1; I != IC; ++I) {
+  // Unroll non-uniform recipes.
+  auto InsertPt = std::next(R.getIterator());
+  VPBasicBlock &VPBB = *R.getParent();
+  for (unsigned Part = 1; Part != UF; ++Part) {
     VPRecipeBase *Copy = R.clone();
-    Copy->insertAfter(InsertPt);
-    InsertPt = Copy;
-    InterleavedValues.addInterleavedValues(&R, Copy, I);
+    Copy->insertBefore(VPBB, InsertPt);
+    addVPV2Parts(&R, Copy, Part);
 
-    if (match(&R, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
-                      m_VPValue())))
-      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
     VPValue *Op;
     if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                       m_VPValue(), m_VPValue(Op)))) {
-      Copy->setOperand(0, InterleavedValues.getInterleavedValue(Op, I - 1));
-      Copy->setOperand(1, InterleavedValues.getInterleavedValue(Op, I));
+      Copy->setOperand(0, getValueForPart(Op, Part - 1));
+      Copy->setOperand(1, getValueForPart(Op, Part));
       continue;
     }
     if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
       auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
       if (Phi->isOrdered()) {
-        auto Ins = InterleavedValues.getInterleavedValues().insert({Phi, {}});
-        if (I == 1) {
+        auto Ins = VPV2Parts.insert({Phi, {}});
+        if (Part == 1) {
           Ins.first->second.clear();
           Ins.first->second.push_back(Red);
         }
@@ -1843,13 +1861,13 @@ static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
         Phi->setOperand(1, Copy->getVPSingleValue());
       }
     }
-    InterleavedValues.remapOperands(Copy, I);
+    remapOperands(Copy, Part);
 
     // Add operand indicating the part to generate code for to recipes still
     // requiring it.
     if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
             VPVectorPointerRecipe>(Copy))
-      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+      Copy->addOperand(getConstantVPV(Part));
 
     if (isa<VPVectorPointerRecipe>(R))
       Copy->setOperand(0, R.getOperand(0));
@@ -1857,21 +1875,17 @@ static void interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
 }
 
 using namespace llvm::VPlanPatternMatch;
-static void interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
-                            InterleaveState &InterleavedValues,
-                            VPTypeAnalysis &TypeInfo,
-                            SmallPtrSet<VPRecipeBase *, 8> &ToSkip) {
+void UnrollState::unrollBlock(VPBlockBase *VPB) {
   auto *VPR = dyn_cast<VPRegionBlock>(VPB);
   if (VPR) {
     if (VPR->isReplicator()) {
-      interleaveReplicateRegion(VPR, Plan, IC, InterleavedValues);
+      unrollReplicateRegion(VPR);
       return;
     }
     ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
         RPOT(VPR->getEntry());
-    for (VPBlockBase *VPB : RPOT) {
-      interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip);
-    }
+    for (VPBlockBase *VPB : RPOT)
+      unrollBlock(VPB);
     return;
   }
 
@@ -1886,52 +1900,44 @@ static void interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
     VPValue *Op1;
     if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                       m_VPValue(), m_VPValue(Op1)))) {
-      InterleavedValues.addUniform(cast<VPInstruction>(&R), IC);
-      for (unsigned I = 1; I != IC; ++I)
-        R.addOperand(InterleavedValues.getInterleavedValue(Op1, I));
+      addUniform(cast<VPInstruction>(&R));
+      for (unsigned Part = 1; Part != UF; ++Part)
+        R.addOperand(getValueForPart(Op1, Part));
       continue;
     }
 
     auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
     if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
-      InterleavedValues.addUniform(SingleDef, IC);
+      addUniform(SingleDef);
       continue;
     }
 
     if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
-      interleaveHeaderPHI(R, Plan, IC, InsertPtForPhi, InterleavedValues,
-                          TypeInfo, ToSkip);
-    } else {
-      interleaveRecipe(R, Plan, IC, InterleavedValues, TypeInfo);
+      unrollHeaderPHI(R, InsertPtForPhi);
+      continue;
     }
+
+    unrollRecipe(R);
   }
 }
 
-void VPlanTransforms::interleaveByUF(VPlan &Plan, unsigned IC,
-                                     LLVMContext &Ctx) {
-  assert(IC > 0 && "Interleave count must be positive");
-  if (IC == 1)
+void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
+  assert(UF > 0 && "Unroll factor must be positive");
+  Plan.setUF(UF);
+  if (UF == 1)
     return;
 
-  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
-  // ..., UF-1.
-  InterleaveState InterleavedValues;
+  UnrollState Unroller(Plan, UF, Ctx);
 
-  /// Interleaving may create recipes that should not be interleaved themselves.
-  /// Those are tracked in ToSkip.
-  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
-  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
-  VPTypeAnalysis TypeInfo(CanIVIntTy, Ctx);
-  interleaveBlock(Plan.getPreheader(), Plan, IC, InterleavedValues, TypeInfo,
-                  ToSkip);
+  Unroller.unrollBlock(Plan.getPreheader());
 
-  // Iterate over all blocks in the plan starting from Entry, and interleave
+  // Iterate over all blocks in the plan starting from Entry, and unroll
   // recipes inside them. This includes the vector preheader and middle blocks,
   // which may set up or post-process per-part values.
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
   for (VPBlockBase *VPB : RPOT)
-    interleaveBlock(VPB, Plan, IC, InterleavedValues, TypeInfo, ToSkip);
+    Unroller.unrollBlock(VPB);
 
   unsigned Part = 1;
   // Remap operands of cloned header phis to update backedge values. The header
@@ -1943,22 +1949,21 @@ void VPlanTransforms::interleaveByUF(VPlan &Plan, unsigned IC,
     // value across the backedge, needs to remap to the last part of the spliced
     // value.
     if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
-      InterleavedValues.remapOperand(&H, 1, IC - 1);
+      Unroller.remapOperand(&H, 1, UF - 1);
       continue;
     }
-    if (InterleavedValues.contains(H.getVPSingleValue()) ||
+    if (Unroller.contains(H.getVPSingleValue()) ||
         isa<VPWidenPointerInductionRecipe>(&H)) {
       Part = 1;
       continue;
     }
-    InterleavedValues.remapOperands(&H, Part);
+    Unroller.remapOperands(&H, Part);
     Part++;
   }
 
   // Remap the operand of live-outs to the last part.
   for (const auto &[_, LO] : Plan.getLiveOuts()) {
-    VPValue *In =
-        InterleavedValues.getInterleavedValue(LO->getOperand(0), IC - 1);
+    VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1);
     LO->setOperand(0, In);
   }
 }
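
To make the ExtractFromEnd case above concrete: with a scalar-only VF, the
value extracted at offset Op1 from the end is simply the scalar produced for
part UF - Op1. A minimal sketch of that path, reusing the helpers from this
patch:

  // Sketch of the scalar-VF path in UnrollState::unrollRecipe.
  unsigned Offset =
      cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
  // E.g. UF = 4 and Offset = 1: the extract resolves to the part-3 value.
  VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
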
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 6bd7577069e1fc..a87a74b8bba680 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -115,7 +115,7 @@ struct VPlanTransforms {
       const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
       VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
 
-  static void interleaveByUF(VPlan &Plan, unsigned IC, LLVMContext &Ctx);
+  static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c18bea4f4c5926..07a2ac9f2fc34a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -8,6 +8,7 @@
 
 #include "VPlanUtils.h"
 #include "VPlanPatternMatch.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 
 using namespace llvm;
@@ -60,3 +61,37 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
   return match(V, m_Binary<Instruction::ICmp>(m_VPValue(A), m_VPValue(B))) &&
          IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount();
 }
+
+bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
+  // Loop invariants are uniform.
+  if (V->isDefinedOutsideVectorRegions())
+    return true;
+
+  VPRecipeBase *R = V->getDefiningRecipe();
+  auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
+  // Canonical IV chain is uniform.
+  if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
+    return true;
+
+  return TypeSwitch<const VPRecipeBase *, bool>(R)
+      .Case<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe>(
+          [](const auto *R) { return true; })
+      .Case<VPReplicateRecipe>([](const auto *R) {
+        // Loads and stores that are uniform across VF lanes are handled by
+        // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if
+        // all their operands are invariant.
+        // TODO: Further relax the restrictions.
+        return R->isUniform() &&
+               (isa<LoadInst, StoreInst>(R->getUnderlyingValue())) &&
+               all_of(R->operands(),
+                      [](VPValue *Op) { return isUniformAcrossVFsAndUFs(Op); });
+      })
+      .Case<VPScalarCastRecipe, VPWidenCastRecipe>([](const auto *R) {
+        // A cast is uniform according to its operand.
+        return isUniformAcrossVFsAndUFs(R->getOperand(0));
+      })
+      .Default([](const VPRecipeBase *) { // A value is considered non-uniform
+                                          // unless proven otherwise.
+        return false;
+      });
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index fc11208a433961..07a5179c85a119 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -45,6 +45,13 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
 
 /// Return true if \p V is a header mask in \p Plan.
 bool isHeaderMask(const VPValue *V, VPlan &Plan);
+
+/// Checks if \p V is uniform across all VF lanes and UF parts. This holds if
+/// \p V is either loop invariant (defined outside the vector region) or
+/// defined by a recipe whose operands are known to be uniform across all VFs
+/// and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
+bool isUniformAcrossVFsAndUFs(VPValue *V);
+
 } // end namespace llvm::vputils
 
 #endif
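
The intended use of the predicate is in the unroll loop above; a sketch of how
uniform recipes bypass per-part cloning, using the UnrollState helpers from
this patch:

  // Sketch: a recipe uniform across VF and UF keeps a single definition,
  // which is registered for all parts instead of being cloned per part.
  auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
  if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
    addUniform(SingleDef);
    continue;
  }
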
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index d8955a8bd7c575..e4b41544044c93 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -67,15 +67,14 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT:     vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:     vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<1>
 ; CHECK-NEXT:     EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
-; CHECK-NEXT:     EMIT vp<[[PADD2:%.+]]> = ptradd ir<%A>, vp<[[STEPS2]]>
 ; CHECK-NEXT:     vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:     vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
 ; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR1]]>
 ; CHECK-NEXT:     WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
 ; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
 ; CHECK-NEXT:     WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
 ; CHECK-NEXT:     vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:     vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
 ; CHECK-NEXT:     WIDEN store vp<[[VPTR3]]>, ir<%add>
 ; CHECK-NEXT:     WIDEN store vp<[[VPTR4]]>, ir<%add>.1
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
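
The CHECK lines above show the plan after unrolling by 2: part-1 clones carry
a ".1" suffix and the cloned vector-pointer recipes take an explicit part
operand ir<1>. The lookup backing operand remapping between parts is the
following, from the UnrollState introduced in this patch:

  // Part 0 and live-ins are shared across parts; clones for parts 1..UF-1
  // are stored at indices 0..UF-2 of the VPV2Parts entry.
  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert(VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part);
    return VPV2Parts[V][Part - 1];
  }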

>From 6fd2416395fe57d28a8e7f1a6a9efe7058d46a05 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 5 Sep 2024 19:38:28 +0100
Subject: [PATCH 06/15] !fixup remove empty vputils namespace from VPlan.h

---
 llvm/lib/Transforms/Vectorize/VPlan.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3181805dc787c0..2d65440149438b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3867,11 +3867,6 @@ class VPlanSlp {
   bool isCompletelySLP() const { return CompletelySLP; }
 };
 
-namespace vputils {
-
-
-} // end namespace vputils
-
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H

>From 4daee0a4891cb8aec03014e8651a5bdce99ef089 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 15 Sep 2024 13:14:49 +0100
Subject: [PATCH 07/15] !fixup address latest comments

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  1 +
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  9 ++++----
 llvm/lib/Transforms/Vectorize/VPlan.h         | 20 +++++++++-------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +++++++----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 23 +++++++++++++++++--
 .../Transforms/Vectorize/VPlanTransforms.h    |  1 +
 6 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8672bfc910849f..456ba61a402d57 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7531,6 +7531,7 @@ LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
                               OrigLoop->getHeader()->getModule()->getContext());
+  VPlanTransforms::cleanupRedundantRecipesAfterUnroll(BestVPlan);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0320023863a59f..3a06c2902b811d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -588,11 +588,11 @@ template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
   return P;
 }
 
-const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
+VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
   return getEnclosingLoopRegionForRegion(getParent());
 }
 
-VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
+const VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() const {
   return getEnclosingLoopRegionForRegion(getParent());
 }
 
@@ -1075,10 +1075,9 @@ void VPlan::execute(VPTransformState *State) {
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
       Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
 
-      // When the VPlan has been unrolled, chain together the steps of the
-      // unrolled parts together.
+      // Use the steps for the last part as backedge value for the induction.
       if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getUnrolledPart(), 0));
+        Inc->setOperand(0, State->get(IV->getLastUnrolledPart(), 0));
       continue;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c89e7227cfa6c4..09c4c76410f475 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -728,7 +728,7 @@ class VPLiveOut : public VPUser {
 
   PHINode *getPhi() const { return Phi; }
 
-  /// Live-outs are marked as only using the first part during the transtition
+  /// Live-outs are marked as only using the first part during the transition
   /// to unrolling directly on VPlan.
   /// TODO: Remove after unroller transition.
   bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }
@@ -1312,6 +1312,9 @@ class VPInstruction : public VPRecipeWithIRFlags {
   bool isFPMathOp() const;
 #endif
 
+  /// Return the unroll part for this VPInstruction.
+  unsigned getUnrollPart() const;
+
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
                 const Twine &Name = "")
@@ -1758,7 +1761,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
   bool onlyFirstPartUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-    assert(getNumOperands() <= 2 && "must have a single operand");
+    assert(getNumOperands() <= 2 && "must have at most two operands");
     return true;
   }
 
@@ -1940,9 +1943,10 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
     return Trunc ? Trunc->getType() : IV->getType();
   }
 
-  /// Returns the optional VPValue representing the value of this induction at
-  /// the last unrolled part, if it exists. If it does not exist.
-  VPValue *getUnrolledPart() {
+  /// Returns the VPValue representing the value of this induction at
+  /// the last unrolled part, if it exists. Returns itself if unrolling did not
+  /// take place.
+  VPValue *getLastUnrolledPart() {
     return getNumOperands() == 4 ? getOperand(3) : this;
   }
 };
@@ -1984,7 +1988,7 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
   /// Returns the induction descriptor for the recipe.
   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
 
-  /// Return the unroll part for this reduction phi.
+  /// Return the unroll part for this induction phi.
   unsigned getUnrollPart() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3538,8 +3542,9 @@ class VPlan {
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
+
   unsigned getUF() const {
-    assert(UFs.size() == 1);
+    assert(UFs.size() == 1 && "Expected a single UF");
     return UFs[0];
   }
 
@@ -3936,7 +3941,6 @@ class VPlanSlp {
   /// Return true if all visited instruction can be combined.
   bool isCompletelySLP() const { return CompletelySLP; }
 };
-
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e598f6b48983bb..821eccfc0b9bb1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -332,6 +332,12 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   return Res;
 }
 
+unsigned VPInstruction::getUnrollPart() const {
+  assert(getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
+         getNumOperands() == 2 && "Called for unsupported recipe");
+  return cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
+}
+
 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                              VPValue *A, VPValue *B, DebugLoc DL,
                              const Twine &Name)
@@ -492,13 +498,12 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
-    unsigned UF = getParent()->getPlan()->getUF();
+    unsigned Part = getUnrollPart();
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
-    if (UF == 1)
-      return IV;
+    assert(Part != 0 && "Must have a part");
     // The canonical IV is incremented by the vectorization factor (num of
     // SIMD elements) times the unroll part.
-    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, UF);
+    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                              hasNoSignedWrap());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6fa99645a9262c..84ccb38ba32216 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1886,7 +1886,9 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
     // Add operand indicating the part to generate code for to recipes still
     // requiring it.
     if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
-            VPVectorPointerRecipe>(Copy))
+            VPVectorPointerRecipe>(Copy) ||
+        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
+                        m_VPValue())))
       Copy->addOperand(getConstantVPV(Part));
 
     if (isa<VPVectorPointerRecipe>(R))
@@ -1944,8 +1946,9 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
 void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
   assert(UF > 0 && "Unroll factor must be positive");
   Plan.setUF(UF);
-  if (UF == 1)
+  if (UF == 1) {
     return;
+  }
 
   UnrollState Unroller(Plan, UF, Ctx);
 
@@ -1987,3 +1990,19 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
     LO->setOperand(0, In);
   }
 }
+
+void VPlanTransforms::cleanupRedundantRecipesAfterUnroll(VPlan &Plan) {
+  // Remove recipes that are redundant after unrolling.
+  auto Iter = vp_depth_first_deep(Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (VPI &&
+          VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
+          VPI->getNumOperands() == 1) {
+        VPI->replaceAllUsesWith(VPI->getOperand(0));
+        VPI->eraseFromParent();
+      }
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a87a74b8bba680..82b81bba0b3a92 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -116,6 +116,7 @@ struct VPlanTransforms {
       VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
 
   static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
+  static void cleanupRedundantRecipesAfterUnroll(VPlan &Plan);
 };
 
 } // namespace llvm
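
With the part now carried as an explicit operand, the increment for part P
computes IV + P * VF, and the redundant part-0 increment (a single operand) is
folded away by cleanupRedundantRecipesAfterUnroll. A minimal codegen sketch
matching generatePerPart above; the free-standing helper is illustrative only:

  // Sketch: emit the canonical IV increment for a non-zero unroll part.
  Value *emitIVIncrementForPart(IRBuilderBase &Builder, Value *IV,
                                ElementCount VF, unsigned Part) {
    assert(Part != 0 && "the part-0 increment folds to the IV itself");
    // Computes Part * VF; uses vscale-based math for scalable VFs.
    Value *Step = createStepForVF(Builder, IV->getType(), VF, Part);
    return Builder.CreateAdd(IV, Step, "index.part.next");
  }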

>From faf867c4af51e0b2949bb8343071fcf7c3dce532 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 15 Sep 2024 13:36:29 +0100
Subject: [PATCH 08/15] !fixup address latest comments, thanks

---
 .../Vectorize/LoopVectorizationPlanner.h      |   5 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |   1 -
 llvm/lib/Transforms/Vectorize/VPlan.h         |  19 ++-
 .../Transforms/Vectorize/VPlanPatternMatch.h  |   4 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  58 +++++----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 111 ++++++++++--------
 .../Transforms/Vectorize/VPlanTransforms.h    |   6 +-
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |   3 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   4 +-
 9 files changed, 121 insertions(+), 90 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 736da2eedf35b4..8300db1dd14b13 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -232,6 +232,11 @@ class VPBuilder {
     return tryInsertInstruction(new VPScalarCastRecipe(Opcode, Op, ResultTy));
   }
 
+  VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
+                                     Type *ResultTy) {
+    return tryInsertInstruction(new VPWidenCastRecipe(Opcode, Op, ResultTy));
+  }
+
   VPScalarIVStepsRecipe *
   createScalarIVSteps(Instruction::BinaryOps InductionOpcode,
                       FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d3ba201b486205..1e01feffcf086c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7523,7 +7523,6 @@ LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
                               OrigLoop->getHeader()->getModule()->getContext());
-  VPlanTransforms::cleanupRedundantRecipesAfterUnroll(BestVPlan);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d61d37a6171577..95f591213c55ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1314,7 +1314,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
 #endif
 
   /// Return the unroll part for this VPInstruction.
-  unsigned getUnrollPart() const;
+  unsigned getUnrollPartOperand() const;
 
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
@@ -1811,7 +1811,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
   }
 
   /// Return the unroll part for this vector pointer.
-  unsigned getUnrollPart() const;
+  unsigned getUnrollPartOperand() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -2029,7 +2029,14 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
 
   /// Return the unroll part for this induction phi.
-  unsigned getUnrollPart() const;
+  unsigned getUnrollPartOperand() const;
+
+  /// Returns the VPValue representing the value of this induction at
+  /// the first unrolled part, if it exists. Returns itself if unrolling did not
+  /// take place.
+  VPValue *getFirstUnrolledPart() {
+    return getNumOperands() == 3 ? getOperand(2) : this;
+  }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -2170,7 +2177,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   bool isInLoop() const { return IsInLoop; }
 
   /// Return the unroll part for this reduction phi.
-  unsigned getUnrollPart() const;
+  unsigned getUnrollPartOperand() const;
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -3017,7 +3024,7 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
   void execute(VPTransformState &State) override;
 
   /// Return the unroll part for this widened IV.
-  unsigned getUnrollPart() const;
+  unsigned getUnrollPartOperand() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -3133,7 +3140,7 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
   }
 
   /// Return the unroll part for this scalar step
-  unsigned getUnrollPart() const;
+  unsigned getUnrollPartOperand() const;
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 5f86f2c969651b..4ddbd0d5fafb8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -144,6 +144,10 @@ struct UnaryRecipe_match {
     return DefR && match(DefR);
   }
 
+  bool match(const VPSingleDefRecipe *R) {
+    return match(static_cast<const VPRecipeBase *>(R));
+  }
+
   bool match(const VPRecipeBase *R) {
     if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
       return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 98206b3df82023..714e4f8cc56b28 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -332,7 +332,7 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   return Res;
 }
 
-unsigned VPInstruction::getUnrollPart() const {
+unsigned VPInstruction::getUnrollPartOperand() const {
   assert(getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
          getNumOperands() == 2 && "Called for unsupported recipe");
   return cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
@@ -498,7 +498,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
-    unsigned Part = getUnrollPart();
+    unsigned Part = getUnrollPartOperand();
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
     assert(Part != 0 && "Must have a part");
     // The canonical IV is incremented by the vectorization factor (num of
@@ -1671,7 +1671,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   // TODO: Remove loop after VPlan-based unroller lands.
   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
     Value *StartIdx0 =
-        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart());
+        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPartOperand());
 
     if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
@@ -1704,7 +1704,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPScalarIVStepsRecipe::getUnrollPart() const {
+unsigned VPScalarIVStepsRecipe::getUnrollPartOperand() const {
   return getNumOperands() == 2
              ? 0
              : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
@@ -1812,7 +1812,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
-  unsigned CurrentPart = getUnrollPart();
+  unsigned CurrentPart = getUnrollPartOperand();
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // Calculate the pointer for the specific unroll-part.
     Value *PartPtr = nullptr;
@@ -1849,7 +1849,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPVectorPointerRecipe::getUnrollPart() const {
+unsigned VPVectorPointerRecipe::getUnrollPartOperand() const {
   return getNumOperands() == 1
              ? 0
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
@@ -2925,7 +2925,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   auto *IVR = getParent()->getPlan()->getCanonicalIV();
   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
-  unsigned CurrentPart = getUnrollPart();
+  unsigned CurrentPart = getUnrollPartOperand();
 
   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
@@ -2933,33 +2933,34 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   PHINode *NewPointerPhi = nullptr;
-  if (CurrentPart != 0) {
-    // The recipe has been unrolled. In that case, fetch the single pointer phi
-    // shared among all unrolled parts of the recipe.
-    auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(2), 0));
-    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
-  } else {
+  if (CurrentPart == 0) {
     NewPointerPhi =
         PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
+  } else {
+    // The recipe has been unrolled. In that case, fetch the single pointer phi
+    // shared among all unrolled parts of the recipe.
+    auto *GEP = cast<GetElementPtrInst>(State.get(getFirstUnrolledPart(), 0));
+    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
   }
 
   // A pointer induction, performed by using a gep
   BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
-  unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1;
   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
   Type *PhiType = IndDesc.getStep()->getType();
   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
-  Value *NumUnrolledElems =
-      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
   // Add induction update using an incorrect block temporarily. The phi node
   // will be fixed after VPlan execution. Note that at this point the latch
   // block cannot be used, as it does not exist yet.
   // TODO: Model increment value in VPlan, by turning the recipe into a
   // multi-def and a subclass of VPHeaderPHIRecipe.
-  if (getNumOperands() != 4) {
+  if (CurrentPart == 0) {
     // The recipe represents the first part of the pointer induction. Create the
-    // GEP to increment the phi.
+    // GEP to increment the phi across all unrolled parts.
+    unsigned UF = getParent()->getPlan()->getUF();
+    Value *NumUnrolledElems =
+        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
+
     Value *InductionGEP = GetElementPtrInst::Create(
         State.Builder.getInt8Ty(), NewPointerPhi,
         State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
@@ -2993,7 +2994,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPWidenPointerInductionRecipe::getUnrollPart() const {
+unsigned VPWidenPointerInductionRecipe::getUnrollPartOperand() const {
   return getNumOperands() == 4
              ? cast<ConstantInt>(getOperand(3)->getLiveInIRValue())
                    ->getZExtValue()
@@ -3003,6 +3004,8 @@ unsigned VPWidenPointerInductionRecipe::getUnrollPart() const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
+  assert((getNumOperands() == 2 || getNumOperands() == 4) &&
+         "unexpected number of operands");
   O << Indent << "EMIT ";
   printAsOperand(O, SlotTracker);
   O << " = WIDEN-POINTER-INDUCTION ";
@@ -3051,7 +3054,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
                       ? CanonicalIV
                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
-    Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart());
+    Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPartOperand());
     if (VF.isVector()) {
       VStep = Builder.CreateVectorSplat(VF, VStep);
       VStep =
@@ -3062,7 +3065,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPWidenCanonicalIVRecipe::getUnrollPart() const {
+unsigned VPWidenCanonicalIVRecipe::getUnrollPartOperand() const {
   return getNumOperands() == 1
              ? 0
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
@@ -3147,7 +3150,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
 
   Value *Iden = nullptr;
   RecurKind RK = RdxDesc.getRecurrenceKind();
-  unsigned CurrentPart = getUnrollPart();
+  unsigned CurrentPart = getUnrollPartOperand();
 
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
@@ -3165,14 +3168,17 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
                                        RdxDesc.getFastMathFlags());
 
     if (!ScalarPHI) {
-      if (CurrentPart != 0) {
-        Iden = Builder.CreateVectorSplat(State.VF, Iden);
-      } else {
+      if (CurrentPart == 0) {
+        // Create start and identity vector values for the reduction in the
+        // preheader.
+        // TODO: Introduce recipes in VPlan preheader to create initial values.
         Iden = Builder.CreateVectorSplat(State.VF, Iden);
         IRBuilderBase::InsertPointGuard IPBuilder(Builder);
         Builder.SetInsertPoint(VectorPH->getTerminator());
         Constant *Zero = Builder.getInt32(0);
         StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
+      } else {
+        Iden = Builder.CreateVectorSplat(State.VF, Iden);
       }
     }
   }
@@ -3186,7 +3192,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPReductionPHIRecipe::getUnrollPart() const {
+unsigned VPReductionPHIRecipe::getUnrollPartOperand() const {
   return getNumOperands() == 2
              ? 0
              : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8cfd0561f35add..a76663d43e36b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -21,6 +21,7 @@
 #include "VPlanUtils.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/IVDescriptors.h"
@@ -1621,6 +1622,8 @@ class UnrollState {
       : Plan(Plan), UF(UF),
         TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx) {}
 
+  void unrollBlock(VPBlockBase *VPB);
+
   VPValue *getValueForPart(VPValue *V, unsigned Part) {
     if (Part == 0 || V->isLiveIn())
       return V;
@@ -1629,7 +1632,11 @@ class UnrollState {
     return VPV2Parts[V][Part - 1];
   }
 
-  void addVPV2Parts(VPRecipeBase *OrigR, VPRecipeBase *CopyR, unsigned Part) {
+  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
+  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
+  /// corresponding VPValue defined by \p CopyR.
+  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
+                        unsigned Part) {
     for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
       auto Ins = VPV2Parts.insert({VPV, {}});
       assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
@@ -1637,7 +1644,8 @@ class UnrollState {
     }
   }
 
-  void addUniform(VPSingleDefRecipe *R) {
+  /// Given a uniform recipe \p R, add it for all parts.
+  void addUniformForAllParts(VPSingleDefRecipe *R) {
     auto Ins = VPV2Parts.insert({R, {}});
     assert(Ins.second && "uniform value already added");
     for (unsigned Part = 0; Part != UF; ++Part)
@@ -1646,17 +1654,18 @@ class UnrollState {
 
   bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
 
+  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
+  /// \p Part.
   void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
     auto *Op = R->getOperand(OpIdx);
     R->setOperand(OpIdx, getValueForPart(Op, Part));
   }
 
+  /// Update \p R's operands with their corresponding VPValues for part
+  /// \p Part.
   void remapOperands(VPRecipeBase *R, unsigned Part) {
     for (const auto &[OpIdx, Op] : enumerate(R->operands()))
       R->setOperand(OpIdx, getValueForPart(Op, Part));
   }
-
-  void unrollBlock(VPBlockBase *VPB);
 };
 } // namespace
 
@@ -1677,7 +1686,7 @@ void UnrollState::unrollReplicateRegion(VPRegionBlock *VPR) {
           ScalarIVSteps->addOperand(getConstantVPV(Part));
         }
 
-        addVPV2Parts(&Part0R, &PartIR, Part);
+        addRecipeForPart(&Part0R, &PartIR, Part);
       }
     }
   }
@@ -1694,11 +1703,11 @@ void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
     FMFs = ID.getInductionBinOp()->getFastMathFlags();
 
   VPValue *VectorStep = &Plan.getVF();
+  VPBuilder Builder(PH);
   if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
     Instruction::CastOps CastOp =
         IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
-    VectorStep = new VPWidenCastRecipe(CastOp, VectorStep, IVTy);
-    PH->appendRecipe(VectorStep->getDefiningRecipe());
+    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
     ToSkip.insert(VectorStep->getDefiningRecipe());
   }
 
@@ -1708,12 +1717,11 @@ void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
                         : nullptr;
   if (!ConstStep || ConstStep->getZExtValue() != 1) {
     if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
-      ScalarStep = new VPWidenCastRecipe(Instruction::Trunc, ScalarStep, IVTy);
-      PH->appendRecipe(ScalarStep->getDefiningRecipe());
+      ScalarStep =
+          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
       ToSkip.insert(ScalarStep->getDefiningRecipe());
     }
 
-    VPBuilder Builder(PH);
     VPInstruction *Mul;
     if (IVTy->isFloatingPointTy())
       Mul = Builder.createFPOp(Instruction::FMul, {VectorStep, ScalarStep},
@@ -1730,7 +1738,7 @@ void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
   // previous part. The header phi recipe will get 2 new operands: the step
   // value for a single part and the last part, used to compute the backedge
   // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 =
-  // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorSteop, %Part.3
+  // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
   // %Part.1 = %Part.0 + %VectorStep
   // %Part.2 = %Part.1 + %VectorStep
   // %Part.3 = %Part.2 + %VectorStep
@@ -1738,7 +1746,6 @@ void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
   // The newly added recipes are added to ToSkip to avoid interleaving them
   // again.
   VPValue *Prev = IV;
-  VPBuilder Builder;
   Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
   for (unsigned Part = 1; Part != UF; ++Part) {
     VPInstruction *Add;
@@ -1760,7 +1767,7 @@ void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
                                  },
                                  IV->getDebugLoc(), Name);
     ToSkip.insert(Add);
-    addVPV2Parts(IV, Add, Part);
+    addRecipeForPart(IV, Add, Part);
     Prev = Add;
   }
   IV->addOperand(VectorStep);
@@ -1788,12 +1795,15 @@ void UnrollState::unrollHeaderPHI(VPRecipeBase &R,
   for (unsigned Part = 1; Part != UF; ++Part) {
     VPRecipeBase *Copy = R.clone();
     Copy->insertBefore(*R.getParent(), InsertPt);
-    addVPV2Parts(&R, Copy, Part);
+    addRecipeForPart(&R, Copy, Part);
     if (isa<VPWidenPointerInductionRecipe>(&R)) {
       Copy->addOperand(R.getVPSingleValue());
       Copy->addOperand(getConstantVPV(Part));
     } else if (RdxPhi) {
       Copy->addOperand(getConstantVPV(Part));
+    } else {
+      assert(isa<VPActiveLaneMaskPHIRecipe>(&R) &&
+             "unexpected header phi recipe not needing unrolled part");
     }
   }
 }
@@ -1807,14 +1817,20 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
 
   if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
     if (vputils::onlyFirstPartUsed(VPI)) {
-      addUniform(cast<VPInstruction>(&R));
+      addUniformForAllParts(cast<VPInstruction>(&R));
       return;
     }
+    if (match(VPI, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
+                       m_VPValue()))) {
+      addUniformForAllParts(VPI);
+      return;
+    }
+
     VPValue *Op0;
     VPValue *Op1;
     if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                        m_VPValue(Op0), m_VPValue(Op1)))) {
-      addUniform(VPI);
+      addUniformForAllParts(VPI);
       if (Plan.hasScalarVFOnly()) {
         // Extracting from end with VF = 1 implies retrieving the scalar part
         // UF - Op1.
@@ -1827,13 +1843,7 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
       }
       return;
     }
-    if (match(&R, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
-                      m_VPValue()))) {
-      addUniform(VPI);
-      return;
-    }
   }
-
   if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
     if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
         RepR->getOperand(1)->isDefinedOutsideVectorRegions()) {
@@ -1843,7 +1853,7 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
     }
     if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
       if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
-        addUniform(RepR);
+        addUniformForAllParts(RepR);
         return;
       }
     }
@@ -1855,7 +1865,7 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
   for (unsigned Part = 1; Part != UF; ++Part) {
     VPRecipeBase *Copy = R.clone();
     Copy->insertBefore(VPBB, InsertPt);
-    addVPV2Parts(&R, Copy, Part);
+    addRecipeForPart(&R, Copy, Part);
 
     VPValue *Op;
     if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
@@ -1878,7 +1888,7 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
     }
     remapOperands(Copy, Part);
 
-    // Add operand indicating the part to generate code for to recipes still
+    // Add operand indicating the part to generate code for, to recipes still
     // requiring it.
     if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
             VPVectorPointerRecipe>(Copy) ||
@@ -1895,10 +1905,11 @@ using namespace llvm::VPlanPatternMatch;
 void UnrollState::unrollBlock(VPBlockBase *VPB) {
   auto *VPR = dyn_cast<VPRegionBlock>(VPB);
   if (VPR) {
-    if (VPR->isReplicator()) {
-      unrollReplicateRegion(VPR);
-      return;
-    }
+    if (VPR->isReplicator())
+      return unrollReplicateRegion(VPR);
+
+    // Traverse blocks in region in RPO to ensure defs are visited before uses
+    // across blocks.
     ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
         RPOT(VPR->getEntry());
     for (VPBlockBase *VPB : RPOT)
@@ -1906,10 +1917,11 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     return;
   }
 
+  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
   auto *VPBB = cast<VPBasicBlock>(VPB);
   auto InsertPtForPhi = VPBB->getFirstNonPhi();
   for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-    if (ToSkip.contains(&R))
+    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
       continue;
 
     // Add all VPValues for all parts to ComputeReductionResult which combines
@@ -1917,7 +1929,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     VPValue *Op1;
     if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                       m_VPValue(), m_VPValue(Op1)))) {
-      addUniform(cast<VPInstruction>(&R));
+      addUniformForAllParts(cast<VPInstruction>(&R));
       for (unsigned Part = 1; Part != UF; ++Part)
         R.addOperand(getValueForPart(Op1, Part));
       continue;
@@ -1925,7 +1937,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
 
     auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
     if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
-      addUniform(SingleDef);
+      addUniformForAllParts(SingleDef);
       continue;
     }
 
@@ -1938,9 +1950,26 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
   }
 }
 
+/// Remove recipes that are redundant after unrolling.
+static void cleanupRedundantRecipesAfterUnroll(VPlan &Plan) {
+  auto Iter = vp_depth_first_deep(Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (VPI &&
+          VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
+          VPI->getNumOperands() == 1) {
+        VPI->replaceAllUsesWith(VPI->getOperand(0));
+        VPI->eraseFromParent();
+      }
+    }
+  }
+}
 void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
   assert(UF > 0 && "Unroll factor must be positive");
   Plan.setUF(UF);
+  auto Cleanup =
+      make_scope_exit([&Plan]() { cleanupRedundantRecipesAfterUnroll(Plan); });
   if (UF == 1) {
     return;
   }
@@ -1959,7 +1988,7 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
 
   unsigned Part = 1;
   // Remap operands of cloned header phis to update backedge values. The header
-  // phis cloned during interleaving are just after the header phi for part 0.
+  // phis cloned during unrolling are just after the header phi for part 0.
   // Reset Part to 1 when reaching the first (part 0) recipe of a block.
   for (VPRecipeBase &H :
        Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
@@ -1985,19 +2014,3 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
     LO->setOperand(0, In);
   }
 }
-
-void VPlanTransforms::cleanupRedundantRecipesAfterUnroll(VPlan &Plan) {
-  // Remove recipes that are redundant after unrolling.
-  auto Iter = vp_depth_first_deep(Plan.getEntry());
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
-    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      auto *VPI = dyn_cast<VPInstruction>(&R);
-      if (VPI &&
-          VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
-          VPI->getNumOperands() == 1) {
-        VPI->replaceAllUsesWith(VPI->getOperand(0));
-        VPI->eraseFromParent();
-      }
-    }
-  }
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index da41dbedbb8295..761bce7c6bedfb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -49,6 +49,9 @@ struct VPlanTransforms {
   /// Clear NSW/NUW flags from reduction instructions if necessary.
   static void clearReductionWrapFlags(VPlan &Plan);
 
+  /// Explicitly unroll \p Plan by \p UF.
+  static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
+
   /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
   /// resulting plan to \p BestVF and \p BestUF.
   static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -113,9 +116,6 @@ struct VPlanTransforms {
   static void createInterleaveGroups(
       const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
       VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
-
-  static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
-  static void cleanupRedundantRecipesAfterUnroll(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 07a2ac9f2fc34a..7d630bfe1d39eb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -74,8 +74,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
     return true;
 
   return TypeSwitch<const VPRecipeBase *, bool>(R)
-      .Case<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe>(
-          [](const auto *R) { return true; })
+      .Case<VPDerivedIVRecipe>([](const auto *R) { return true; })
       .Case<VPReplicateRecipe>([](const auto *R) {
         // Loads and stores that are uniform across VF lanes are handled by
         // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index aa31b96bb22af3..4523cca280482d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -180,9 +180,7 @@ class VPValue {
     return getUnderlyingValue();
   }
 
-  /// Returns true if the VPValue is defined outside any vector regions, i.e. it
-  /// is a live-in value.
-  /// TODO: Also handle recipes defined in pre-header blocks.
+  /// Returns true if the VPValue is defined outside any vector regions.
   bool isDefinedOutsideVectorRegions() const;
 
   // Set \p Val as the underlying Value of this VPValue.
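
One step of unrollByUF worth spelling out: header-phi clones are placed
directly after their part-0 phi, so a running part counter, reset at every
part-0 recipe, pairs each clone with the part whose backedge value it must
use. A condensed sketch, omitting the first-order-recurrence and
pointer-induction special cases:

  unsigned Part = 1;
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    if (Unroller.contains(H.getVPSingleValue())) {
      Part = 1; // Reached the next part-0 phi; reset.
      continue;
    }
    Unroller.remapOperands(&H, Part); // Use this part's backedge value.
    Part++;
  }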

>From 34595b8424c32cbed01a547e2c18dc1e7a131ccd Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 19 Sep 2024 11:11:23 +0100
Subject: [PATCH 09/15] !fixup fix build failure

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5ee1d017507d6e..53ec0b649c04a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1620,7 +1620,7 @@ class UnrollState {
 public:
   UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
       : Plan(Plan), UF(UF),
-        TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx) {}
+        TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}
 
   void unrollBlock(VPBlockBase *VPB);
 

>From a838eb4efdf139d6f9d72b7f3af791d286c030a2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 19 Sep 2024 12:51:10 +0100
Subject: [PATCH 10/15] !fixup address latest

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         | 26 +++---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 39 +++++----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 70 +++++++++-------
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |  2 +-
 .../LoopVectorize/AArch64/sve-widen-phi.ll    | 25 ++++++
 .../LoopVectorize/RISCV/dead-ops-cost.ll      |  6 +-
 .../LoopVectorize/X86/cost-model.ll           | 12 +--
 .../LoopVectorize/X86/interleaving.ll         | 80 +++++++++++++++++++
 9 files changed, 194 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4bc589a47a5c78..bfbc90e3d5ca30 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1097,7 +1097,7 @@ void VPlan::execute(VPTransformState *State) {
 
       // Use the steps for the last part as backedge value for the induction.
       if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getLastUnrolledPart(), 0));
+        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand(), 0));
       continue;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 248a824b4526cc..899a37d8bf6c35 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1311,7 +1311,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
 #endif
 
   /// Return the unroll part for this VPInstruction.
-  unsigned getUnrollPartOperand() const;
+  unsigned getUnrollPart() const;
 
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
@@ -1808,7 +1808,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
   }
 
   /// Return the unroll part for this vector pointer.
-  unsigned getUnrollPartOperand() const;
+  unsigned getUnrollPart() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -1960,6 +1960,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   VPValue *getVFValue() { return getOperand(2); }
   const VPValue *getVFValue() const { return getOperand(2); }
 
+  VPValue *getSplatVFValue() {
+    // If the recipe has been unrolled (5 operands), return the VPValue for the
+    // induction increment.
+    return getNumOperands() == 5 ? getOperand(3) : nullptr;
+  }
+
   /// Returns the first defined value as TruncInst, if it is one or nullptr
   /// otherwise.
   TruncInst *getTruncInst() { return Trunc; }
@@ -1983,8 +1989,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   /// Returns the VPValue representing the value of this induction at
   /// the last unrolled part, if it exists. Returns itself if unrolling did not
   /// take place.
-  VPValue *getLastUnrolledPart() {
-    return getNumOperands() == 4 ? getOperand(3) : this;
+  VPValue *getLastUnrolledPartOperand() {
+    return getNumOperands() == 5 ? getOperand(4) : this;
   }
 };
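
For reference, the operand layout the two accessors above assume, as
inferred from the code (the operand names are illustrative, not identifiers
from the source):

  VPWidenIntOrFpInductionRecipe operand layout
    before unrolling: { Start, Step, VF }                      (3 operands)
    after unrolling:  { Start, Step, VF, SplatVF, LastPart }   (5 operands)

  getVFValue()                 -> operand 2
  getSplatVFValue()            -> operand 3, or nullptr before unrolling
  getLastUnrolledPartOperand() -> operand 4, or the recipe itself before
                                  unrolling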
 
@@ -2026,13 +2032,13 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
 
   /// Return the unroll part for this induction phi.
-  unsigned getUnrollPartOperand() const;
+  unsigned getUnrollPart() const;
 
   /// Returns the VPValue representing the value of this induction at
   /// the first unrolled part, if it exists. Returns itself if unrolling did not
   /// take place.
-  VPValue *getFirstUnrolledPart() {
-    return getNumOperands() == 3 ? getOperand(2) : this;
+  VPValue *getFirstUnrolledPartOperand() {
+    return getUnrollPart() == 0 ? this : getOperand(2);
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2174,7 +2180,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   bool isInLoop() const { return IsInLoop; }
 
   /// Return the unroll part for this reduction phi.
-  unsigned getUnrollPartOperand() const;
+  unsigned getUnrollPart() const;
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -3021,7 +3027,7 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
   void execute(VPTransformState &State) override;
 
   /// Return the unroll part for this widened IV.
-  unsigned getUnrollPartOperand() const;
+  unsigned getUnrollPart() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -3137,7 +3143,7 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
   }
 
   /// Return the unroll part for this scalar step
-  unsigned getUnrollPartOperand() const;
+  unsigned getUnrollPart() const;
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 01fbb97dc947c5..58b84b19e9e194 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -332,7 +332,7 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   return Res;
 }
 
-unsigned VPInstruction::getUnrollPartOperand() const {
+unsigned VPInstruction::getUnrollPart() const {
   assert(getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
          getNumOperands() == 2 && "Called for unsupported recipe");
   return cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
@@ -498,9 +498,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
-    unsigned Part = getUnrollPartOperand();
+    unsigned Part = getUnrollPart();
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
-    assert(Part != 0 && "Must have  a part");
+    assert(Part != 0 && "Must have a positive part");
     // The canonical IV is incremented by the vectorization factor (num of
     // SIMD elements) times the unroll part.
     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
@@ -1512,10 +1512,10 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   }
 
   Value *SplatVF;
-  if (getNumOperands() == 4) {
+  if (VPValue *SplatVFOperand = getSplatVFValue()) {
     // The recipe has been unrolled. In that case, fetch the splat value for the
     // induction increment.
-    SplatVF = State.get(getOperand(2), 0);
+    SplatVF = State.get(SplatVFOperand, 0);
   } else {
     // Multiply the vectorization factor by the step using integer or
     // floating-point arithmetic as appropriate.
@@ -1671,7 +1671,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   // TODO: Remove loop after VPlan-based unroller lands.
   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
     Value *StartIdx0 =
-        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPartOperand());
+        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart());
 
     if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
@@ -1704,7 +1704,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPScalarIVStepsRecipe::getUnrollPartOperand() const {
+unsigned VPScalarIVStepsRecipe::getUnrollPart() const {
   return getNumOperands() == 2
              ? 0
              : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
@@ -1812,7 +1812,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
-  unsigned CurrentPart = getUnrollPartOperand();
+  unsigned CurrentPart = getUnrollPart();
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // Calculate the pointer for the specific unroll-part.
     Value *PartPtr = nullptr;
@@ -1849,7 +1849,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPVectorPointerRecipe::getUnrollPartOperand() const {
+unsigned VPVectorPointerRecipe::getUnrollPart() const {
   return getNumOperands() == 1
              ? 0
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
@@ -2925,7 +2925,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   auto *IVR = getParent()->getPlan()->getCanonicalIV();
   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
-  unsigned CurrentPart = getUnrollPartOperand();
+  unsigned CurrentPart = getUnrollPart();
 
   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
@@ -2940,7 +2940,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   } else {
     // The recipe has been unrolled. In that case, fetch the single pointer phi
     // shared among all unrolled parts of the recipe.
-    auto *GEP = cast<GetElementPtrInst>(State.get(getFirstUnrolledPart(), 0));
+    auto *GEP =
+        cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand(), 0));
     NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
   }
 
@@ -2993,7 +2994,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPWidenPointerInductionRecipe::getUnrollPartOperand() const {
+unsigned VPWidenPointerInductionRecipe::getUnrollPart() const {
   return getNumOperands() == 4
              ? cast<ConstantInt>(getOperand(3)->getLiveInIRValue())
                    ->getZExtValue()
@@ -3003,20 +3004,18 @@ unsigned VPWidenPointerInductionRecipe::getUnrollPartOperand() const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
-  assert((getNumOperands() == 2 || getNumOperands() == 5) &&
+  assert((getNumOperands() == 2 || getNumOperands() == 4) &&
          "unexpected number of operands");
   O << Indent << "EMIT ";
   printAsOperand(O, SlotTracker);
   O << " = WIDEN-POINTER-INDUCTION ";
   getStartValue()->printAsOperand(O, SlotTracker);
   O << ", " << *IndDesc.getStep();
-  if (getNumOperands() == 5) {
+  if (getNumOperands() == 4) {
     O << ", ";
     getOperand(2)->printAsOperand(O, SlotTracker);
     O << ", ";
     getOperand(3)->printAsOperand(O, SlotTracker);
-    O << ", ";
-    getOperand(4)->printAsOperand(O, SlotTracker);
   }
 }
 #endif
@@ -3053,7 +3052,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
                       ? CanonicalIV
                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
-    Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPartOperand());
+    Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart());
     if (VF.isVector()) {
       VStep = Builder.CreateVectorSplat(VF, VStep);
       VStep =
@@ -3064,7 +3063,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPWidenCanonicalIVRecipe::getUnrollPartOperand() const {
+unsigned VPWidenCanonicalIVRecipe::getUnrollPart() const {
   return getNumOperands() == 1
              ? 0
              : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
@@ -3149,7 +3148,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
 
   Value *Iden = nullptr;
   RecurKind RK = RdxDesc.getRecurrenceKind();
-  unsigned CurrentPart = getUnrollPartOperand();
+  unsigned CurrentPart = getUnrollPart();
 
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
@@ -3191,7 +3190,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPReductionPHIRecipe::getUnrollPartOperand() const {
+unsigned VPReductionPHIRecipe::getUnrollPart() const {
   return getNumOperands() == 2
              ? 0
              : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 69d9fa726ec7e6..b06568d4ab195a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1816,20 +1816,11 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
     return;
 
   if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
-    if (vputils::onlyFirstPartUsed(VPI)) {
-      addUniformForAllParts(cast<VPInstruction>(&R));
-      return;
-    }
-    if (match(VPI, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
-                       m_VPValue()))) {
-      addUniformForAllParts(VPI);
-      return;
-    }
-
     VPValue *Op0;
     VPValue *Op1;
     if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                        m_VPValue(Op0), m_VPValue(Op1)))) {
+      VPI->setOperand(1, getValueForPart(Op1, UF - 1));
       addUniformForAllParts(VPI);
       if (Plan.hasScalarVFOnly()) {
         // Extracting from end with VF = 1 implies retrieving the scalar part UF
@@ -1843,6 +1834,16 @@ void UnrollState::unrollRecipe(VPRecipeBase &R) {
       }
       return;
     }
+
+    if (vputils::onlyFirstPartUsed(VPI)) {
+      addUniformForAllParts(cast<VPInstruction>(&R));
+      return;
+    }
+    if (match(VPI, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
+                       m_VPValue()))) {
+      addUniformForAllParts(VPI);
+      return;
+    }
   }
   if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
     if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
@@ -1934,6 +1935,23 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
         R.addOperand(getValueForPart(Op1, Part));
       continue;
     }
+    VPValue *Op0;
+    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                      m_VPValue(Op0), m_VPValue(Op1)))) {
+      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
+      if (Plan.hasScalarVFOnly()) {
+        // Extracting from end with VF = 1 implies retrieving the scalar part UF
+        // - Op1.
+        unsigned Offset =
+            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
+        R.getVPSingleValue()->replaceAllUsesWith(
+            getValueForPart(Op0, UF - Offset));
+      } else {
+        // Otherwise we extract from the last part.
+        remapOperands(&R, UF - 1);
+      }
+      continue;
+    }
 
     auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
     if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
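
Concretely: ExtractFromEnd's second operand is the offset from the end, so
with a scalar VF and UF = 4, Op1 = 1 folds the recipe to the value produced
for part 3 and Op1 = 2 to the value for part 2 (UF - Offset in both cases);
with a vector VF the extract instead keeps operating on the last part's
vector (UF - 1).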
@@ -1950,26 +1968,24 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
   }
 }
 
-/// Remove recipes that are redundant after unrolling.
-static void cleanupRedundantRecipesAfterUnroll(VPlan &Plan) {
-  auto Iter = vp_depth_first_deep(Plan.getEntry());
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
-    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      auto *VPI = dyn_cast<VPInstruction>(&R);
-      if (VPI &&
-          VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
-          VPI->getNumOperands() == 1) {
-        VPI->replaceAllUsesWith(VPI->getOperand(0));
-        VPI->eraseFromParent();
-      }
-    }
-  }
-}
 void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
   assert(UF > 0 && "Unroll factor must be positive");
   Plan.setUF(UF);
-  auto Cleanup =
-      make_scope_exit([&Plan]() { cleanupRedundantRecipesAfterUnroll(Plan); });
+  auto Cleanup = make_scope_exit([&Plan]() {
+    auto Iter = vp_depth_first_deep(Plan.getEntry());
+    // Remove recipes that are redundant after unrolling.
+    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+        auto *VPI = dyn_cast<VPInstruction>(&R);
+        if (VPI &&
+            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
+            VPI->getNumOperands() == 1) {
+          VPI->replaceAllUsesWith(VPI->getOperand(0));
+          VPI->eraseFromParent();
+        }
+      }
+    }
+  });
   if (UF == 1) {
     return;
   }
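
The cleanup is wrapped in a scope-exit guard so it also runs on the early
return for UF == 1, where any pre-existing single-operand part increments
fold away. A minimal, self-contained sketch of the pattern
(llvm::make_scope_exit from llvm/ADT/ScopeExit.h behaves the same way; the
names below are illustrative, not from the patch):

#include <utility>

// Sketch only: reimplemented here to keep the example self-contained.
template <typename Fn> struct ScopeExitGuard {
  Fn F;
  ~ScopeExitGuard() { F(); }
};
template <typename Fn> ScopeExitGuard<Fn> makeScopeExit(Fn &&F) {
  return {std::forward<Fn>(F)};
}

// The guard's destructor fires on every path out of the function, so the
// redundant-recipe cleanup is guaranteed to run for UF == 1 as well.
void unrollByUFSketch(unsigned UF) {
  auto Cleanup = makeScopeExit([] { /* remove redundant recipes */ });
  if (UF == 1)
    return; // Cleanup still runs here.
  // ... clone recipes for parts 1..UF-1 ...
}

int main() {
  unrollByUFSketch(1);
  unrollByUFSketch(4);
}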
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c021dc6125c2ee..b578413ed79b95 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -100,7 +100,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
         // A cast is uniform according to its operand.
         return isUniformAcrossVFsAndUFs(R->getOperand(0));
       })
-      .Default([](const VPRecipeBase *) { // A values is considered non-uniform
+      .Default([](const VPRecipeBase *) { // A value is considered non-uniform
                                           // unless proven otherwise.
         return false;
       });
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index ac657006e65d83..15819070f1e137 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -37,6 +37,31 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 5
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP16]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[DOTIDX]]
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP14]], ptr [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[DOTIDX5:%.*]] = shl nuw nsw i64 [[TMP21]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[DOTIDX5]]
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP18]], ptr [[TMP20]], align 4
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP19]], ptr [[TMP22]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index df02cb741700e5..40338f4eb78695 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -432,12 +432,12 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
 ; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext <8 x i8> [[STRIDED_VEC4]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = zext <8 x i8> [[STRIDED_VEC5]] to <8 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 8877e65a0d9d73..2ecb188969e765 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -562,16 +562,16 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr double, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr double, ptr [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr double, ptr [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP15]], align 8
-; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP16]], align 8
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP17]], align 8
-; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP18]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr double, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP16]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr double, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP17]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP18]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index b02826643ac669..6232c1ae752673 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -13,6 +13,22 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 ; SSE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SSE:       vector.body:
 ; SSE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SSE-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 1
+; SSE-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 8
+; SSE-NEXT:    [[DOTIDX:%.*]] = shl nsw i64 [[INDEX]], 3
+; SSE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX]]
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; SSE-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; SSE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; SSE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; SSE-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16
+; SSE-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
 ; SSE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; SSE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -33,6 +49,38 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 ; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX1:       vector.body:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX1-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 1
+; AVX1-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 8
+; AVX1-NEXT:    [[TMP2:%.*]] = shl i64 [[INDEX]], 1
+; AVX1-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 16
+; AVX1-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
+; AVX1-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP4]], 24
+; AVX1-NEXT:    [[DOTIDX:%.*]] = shl nsw i64 [[INDEX]], 3
+; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX]]
+; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP5]]
+; AVX1-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
+; AVX1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
+; AVX1-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4
+; AVX1-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
+; AVX1-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
+; AVX1-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
+; AVX1-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
+; AVX1-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
+; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 16
+; AVX1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 32
+; AVX1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 48
+; AVX1-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP14]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP15]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP12]], ptr [[TMP16]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP13]], ptr [[TMP17]], align 4
@@ -55,6 +103,38 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 ; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX2:       vector.body:
 ; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX2-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 1
+; AVX2-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 16
+; AVX2-NEXT:    [[TMP2:%.*]] = shl i64 [[INDEX]], 1
+; AVX2-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 32
+; AVX2-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
+; AVX2-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP4]], 48
+; AVX2-NEXT:    [[DOTIDX:%.*]] = shl nsw i64 [[INDEX]], 3
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP5]]
+; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
+; AVX2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX2-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
+; AVX2-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX2-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4
+; AVX2-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX2-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
+; AVX2-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX2-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
+; AVX2-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
+; AVX2-NEXT:    [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
+; AVX2-NEXT:    [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 32
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 64
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 96
+; AVX2-NEXT:    store <8 x i32> [[TMP10]], ptr [[TMP14]], align 4
 ; AVX2-NEXT:    store <8 x i32> [[TMP11]], ptr [[TMP15]], align 4
 ; AVX2-NEXT:    store <8 x i32> [[TMP12]], ptr [[TMP16]], align 4
 ; AVX2-NEXT:    store <8 x i32> [[TMP13]], ptr [[TMP17]], align 4

>From 2b0b1e9617d0559cd2e03191effce4a05258e2b9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 19 Sep 2024 17:52:50 +0100
Subject: [PATCH 11/15] !fixup add VPUnrollPartAccessor to access
 getUnrollPart[Operand]

---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 50 +++++++--------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 64 ++++++-------------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  3 +-
 3 files changed, 45 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 899a37d8bf6c35..a29d972ca1d16d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1232,11 +1232,24 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
 #endif
 };
 
+/// Helper to access the operand that contains the unroll part for this recipe
+/// after unrolling.
+template <unsigned PartOpIdx> class VPUnrollPartAccessor {
+protected:
+  /// Return the VPValue operand containing the unroll part or null if there is
+  /// no such operand.
+  VPValue *getUnrollPartOperand(VPUser &U) const;
+
+  /// Return the unroll part.
+  unsigned getUnrollPart(VPUser &U) const;
+};
+
 /// This is a concrete Recipe that models a single VPlan-level instruction.
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
-class VPInstruction : public VPRecipeWithIRFlags {
+class VPInstruction : public VPRecipeWithIRFlags,
+                      public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
 public:
@@ -1310,9 +1323,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
   bool isFPMathOp() const;
 #endif
 
-  /// Return the unroll part for this VPInstruction.
-  unsigned getUnrollPart() const;
-
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
                 const Twine &Name = "")
@@ -1773,7 +1783,8 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
 /// A recipe to compute the pointers for widened memory accesses of IndexTy for
 /// all parts. If IsReverse is true, compute pointers for accessing the input in
 /// reverse order per part.
-class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
+class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
+                              public VPUnrollPartAccessor<1> {
   Type *IndexedTy;
   bool IsReverse;
 
@@ -1807,9 +1818,6 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
                                      isInBounds(), getDebugLoc());
   }
 
-  /// Return the unroll part for this vector pointer.
-  unsigned getUnrollPart() const;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1994,7 +2002,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   }
 };
 
-class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
+class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe,
+                                      public VPUnrollPartAccessor<3> {
   const InductionDescriptor &IndDesc;
 
   bool IsScalarAfterVectorization;
@@ -2031,14 +2040,11 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
   /// Returns the induction descriptor for the recipe.
   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
 
-  /// Return the unroll part for this induction phi.
-  unsigned getUnrollPart() const;
-
   /// Returns the VPValue representing the value of this induction at
   /// the first unrolled part, if it exists. Returns itself if unrolling did not
   /// take place.
   VPValue *getFirstUnrolledPartOperand() {
-    return getUnrollPart() == 0 ? this : getOperand(2);
+    return getUnrollPart(*this) == 0 ? this : getOperand(2);
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2123,7 +2129,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
 /// A recipe for handling reduction phis. The start value is the first operand
 /// of the recipe and the incoming value from the backedge is the second
 /// operand.
-class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
+class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
+                             public VPUnrollPartAccessor<2> {
   /// Descriptor for the reduction.
   const RecurrenceDescriptor &RdxDesc;
 
@@ -2178,9 +2185,6 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
 
   /// Returns true, if the phi is part of an in-loop reduction.
   bool isInLoop() const { return IsInLoop; }
-
-  /// Return the unroll part for this reduction phi.
-  unsigned getUnrollPart() const;
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -3007,7 +3011,8 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
 };
 
 /// A Recipe for widening the canonical induction variable of the vector loop.
-class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
+class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
+                                 public VPUnrollPartAccessor<1> {
 public:
   VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
       : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
@@ -3026,9 +3031,6 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
-  /// Return the unroll part for this widened IV.
-  unsigned getUnrollPart() const;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -3096,7 +3098,8 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
 
 /// A recipe for handling phi nodes of integer and floating-point inductions,
 /// producing their scalar values.
-class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
+class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
+                              public VPUnrollPartAccessor<2> {
   Instruction::BinaryOps InductionOpcode;
 
 public:
@@ -3141,9 +3144,6 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
            "Op must be an operand of the recipe");
     return true;
   }
-
-  /// Return the unroll part for this scalar step
-  unsigned getUnrollPart() const;
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 03093ffdef9d1d..7246058adc0f81 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -332,10 +332,19 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   return Res;
 }
 
-unsigned VPInstruction::getUnrollPart() const {
-  assert(getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
-         getNumOperands() == 2 && "Called for unsupported recipe");
-  return cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
+template <unsigned PartOpIdx>
+VPValue *
+VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
+  if (U.getNumOperands() == PartOpIdx + 1)
+    return U.getOperand(PartOpIdx);
+  return nullptr;
+}
+
+template <unsigned PartOpIdx>
+unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
+  if (auto *UnrollPartOp = getUnrollPartOperand(U))
+    return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
+  return 0;
 }
 
 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
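
To make the mixin's contract concrete, here is a minimal, self-contained toy
model (the Toy* types are stand-ins, not VPlan classes): the template
parameter names the operand slot that carries the part, a recipe that has
not been unrolled simply lacks that trailing operand, and getUnrollPart then
reports part 0.

#include <cassert>
#include <cstdint>
#include <vector>

struct ToyValue {
  uint64_t ConstVal; // models a live-in ConstantInt
};

struct ToyUser {
  std::vector<ToyValue *> Operands;
  unsigned getNumOperands() const {
    return static_cast<unsigned>(Operands.size());
  }
  ToyValue *getOperand(unsigned I) const { return Operands[I]; }
};

template <unsigned PartOpIdx> class ToyUnrollPartAccessor {
protected:
  // Return the operand holding the unroll part, or nullptr if the recipe
  // has not been unrolled (the optional trailing operand is absent).
  ToyValue *getUnrollPartOperand(ToyUser &U) const {
    return U.getNumOperands() == PartOpIdx + 1 ? U.getOperand(PartOpIdx)
                                               : nullptr;
  }
  // A missing part operand means part 0, matching the patch's behavior.
  unsigned getUnrollPart(ToyUser &U) const {
    if (ToyValue *Op = getUnrollPartOperand(U))
      return static_cast<unsigned>(Op->ConstVal);
    return 0;
  }
};

struct ToyScalarSteps : ToyUser, ToyUnrollPartAccessor<2> {
  unsigned part() { return getUnrollPart(*this); }
};

int main() {
  ToyValue IV{0}, Step{1}, Part{3};
  ToyScalarSteps NotUnrolled;
  NotUnrolled.Operands = {&IV, &Step};
  assert(NotUnrolled.part() == 0); // no part operand -> part 0

  ToyScalarSteps Unrolled;
  Unrolled.Operands = {&IV, &Step, &Part};
  assert(Unrolled.part() == 3); // trailing constant names the part
  return 0;
}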
@@ -498,7 +507,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
-    unsigned Part = getUnrollPart();
+    unsigned Part = getUnrollPart(*this);
     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
     assert(Part != 0 && "Must have a positive part");
     // The canonical IV is incremented by the vectorization factor (num of
@@ -1671,7 +1680,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   // TODO: Remove loop after VPlan-based unroller lands.
   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
     Value *StartIdx0 =
-        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart());
+        createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this));
 
     if (!FirstLaneOnly && State.VF.isScalable()) {
       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
@@ -1704,13 +1713,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPScalarIVStepsRecipe::getUnrollPart() const {
-  return getNumOperands() == 2
-             ? 0
-             : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
-                   ->getZExtValue();
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
@@ -1812,7 +1814,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   State.setDebugLocFrom(getDebugLoc());
-  unsigned CurrentPart = getUnrollPart();
+  unsigned CurrentPart = getUnrollPart(*this);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // Calculate the pointer for the specific unroll-part.
     Value *PartPtr = nullptr;
@@ -1849,13 +1851,6 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPVectorPointerRecipe::getUnrollPart() const {
-  return getNumOperands() == 1
-             ? 0
-             : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
-                   ->getZExtValue();
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
@@ -2924,7 +2919,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 
   auto *IVR = getParent()->getPlan()->getCanonicalIV();
   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
-  unsigned CurrentPart = getUnrollPart();
+  unsigned CurrentPart = getUnrollPart(*this);
 
   // Build a pointer phi
   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
@@ -2993,13 +2988,6 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPWidenPointerInductionRecipe::getUnrollPart() const {
-  return getNumOperands() == 4
-             ? cast<ConstantInt>(getOperand(3)->getLiveInIRValue())
-                   ->getZExtValue()
-             : 0;
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
@@ -3051,7 +3039,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
                       ? CanonicalIV
                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
-    Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart());
+    Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
     if (VF.isVector()) {
       VStep = Builder.CreateVectorSplat(VF, VStep);
       VStep =
@@ -3062,13 +3050,6 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPWidenCanonicalIVRecipe::getUnrollPart() const {
-  return getNumOperands() == 1
-             ? 0
-             : cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
-                   ->getZExtValue();
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
@@ -3147,7 +3128,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
 
   Value *Iden = nullptr;
   RecurKind RK = RdxDesc.getRecurrenceKind();
-  unsigned CurrentPart = getUnrollPart();
+  unsigned CurrentPart = getUnrollPart(*this);
 
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
@@ -3189,13 +3170,6 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
   }
 }
 
-unsigned VPReductionPHIRecipe::getUnrollPart() const {
-  return getNumOperands() == 2
-             ? 0
-             : cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
-                   ->getZExtValue();
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b06568d4ab195a..90972258183e30 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1619,8 +1619,7 @@ class UnrollState {
 
 public:
   UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
-      : Plan(Plan), UF(UF),
-        TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}
+      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}
 
   void unrollBlock(VPBlockBase *VPB);
 

>From 862121d9ed74ab9d78a63b94f7833da0f6960084 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 20 Sep 2024 14:11:17 +0100
Subject: [PATCH 12/15] !fixup update to avoid unused variable.

---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 27 ++++++++--------
 .../LoopVectorize/X86/cost-model.ll           | 32 ++++++-------------
 .../X86/divs-with-tail-folding.ll             |  6 ++--
 3 files changed, 27 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f438d5b38d22ce..dbfdf2a131165f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1644,7 +1644,7 @@ class UnrollState {
 
   void unrollReplicateRegion(VPRegionBlock *VPR);
   void unrollRecipe(VPRecipeBase &R);
-  void unrollHeaderPHI(VPRecipeBase &R, VPBasicBlock::iterator InsertPtForPhi);
+  void unrollHeaderPHI(VPHeaderPHIRecipe *R,
+                       VPBasicBlock::iterator InsertPtForPhi);
   void unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
                             VPBasicBlock::iterator InsertPtForPhi);
 
@@ -1809,35 +1809,35 @@ void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
   IV->addOperand(Prev);
 }
 
-void UnrollState::unrollHeaderPHI(VPRecipeBase &R,
+void UnrollState::unrollHeaderPHI(VPHeaderPHIRecipe *R,
                                   VPBasicBlock::iterator InsertPtForPhi) {
   // First-order recurrences pass a single vector or scalar through their header
   // phis, irrespective of interleaving.
-  if (isa<VPFirstOrderRecurrencePHIRecipe>(&R))
+  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
     return;
 
   // Generate step vectors for each unrolled part.
-  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
     unrollWidenInduction(IV, InsertPtForPhi);
     return;
   }
 
-  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(&R);
+  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
   if (RdxPhi && RdxPhi->isOrdered())
     return;
 
-  auto InsertPt = std::next(R.getIterator());
+  auto InsertPt = std::next(R->getIterator());
   for (unsigned Part = 1; Part != UF; ++Part) {
-    VPRecipeBase *Copy = R.clone();
-    Copy->insertBefore(*R.getParent(), InsertPt);
-    addRecipeForPart(&R, Copy, Part);
-    if (isa<VPWidenPointerInductionRecipe>(&R)) {
-      Copy->addOperand(R.getVPSingleValue());
+    VPRecipeBase *Copy = R->clone();
+    Copy->insertBefore(*R->getParent(), InsertPt);
+    addRecipeForPart(R, Copy, Part);
+    if (isa<VPWidenPointerInductionRecipe>(R)) {
+      Copy->addOperand(R);
       Copy->addOperand(getConstantVPV(Part));
     } else if (RdxPhi) {
       Copy->addOperand(getConstantVPV(Part));
     } else {
-      assert(isa<VPActiveLaneMaskPHIRecipe>(&R) &&
+      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
              "unexpected header phi recipe not needing unrolled part");
     }
   }
@@ -1981,6 +1981,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
             cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
         R.getVPSingleValue()->replaceAllUsesWith(
             getValueForPart(Op0, UF - Offset));
+        R.eraseFromParent();
       } else {
         // Otherwise we extract from the last part.
         remapOperands(&R, UF - 1);
@@ -1995,7 +1996,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     }
 
     if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
-      unrollHeaderPHI(R, InsertPtForPhi);
+      unrollHeaderPHI(H, InsertPtForPhi);
       continue;
     }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 9a65f49f2d482e..546aa73e20f0cf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -783,9 +783,6 @@ define i64 @cost_assume(ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[N:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -799,20 +796,13 @@ define i64 @cost_assume(ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[TMP10]] = add <2 x i64> [[VEC_PHI4]], <i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP12]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP14]])
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP15]])
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP16]])
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP17]])
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP18]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP11]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -869,6 +859,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr <4 x i64> [[TMP0]], <i64 12, i64 12, i64 12, i64 12>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -880,12 +872,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr <4 x i64> [[TMP0]], <i64 12, i64 12, i64 12, i64 12>
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i64> [[TMP0]], <i64 12, i64 12, i64 12, i64 12>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
index 7566844ff39df8..229bdc341a2c34 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
@@ -20,6 +20,9 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = sdiv i64 [[M]], [[CONV6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP17]] to i32
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP17]], [[CONV61]]
+; CHECK-NEXT:    [[TMP23:%.*]] = mul i32 [[X]], [[TMP19]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -30,11 +33,8 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP17]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP17]], [[CONV61]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = sub i64 [[TMP5]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i32
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i32 [[X]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]]

>From 1a5113c2031e479a39b1ba30a374b5ada25c8dea Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 20 Sep 2024 15:55:49 +0100
Subject: [PATCH 13/15] !fixup update tests

---
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |  14 +-
 .../AArch64/deterministic-type-shrinkage.ll   |  26 +-
 .../AArch64/divs-with-scalable-vfs.ll         |   8 +-
 .../AArch64/force-target-instruction-cost.ll  |   8 +-
 .../AArch64/reduction-recurrence-costs-sve.ll |  15 +-
 .../LoopVectorize/AArch64/store-costs-sve.ll  |   3 +-
 .../Transforms/LoopVectorize/RISCV/divrem.ll  |  10 +-
 .../RISCV/riscv-vector-reverse.ll             |   4 +-
 .../LoopVectorize/X86/cost-model.ll           |   8 +-
 .../LoopVectorize/X86/interleaving.ll         |   2 +-
 .../LoopVectorize/X86/uniform_mem_op.ll       |   6 -
 .../LoopVectorize/first-order-recurrence.ll   | 267 +++++++++---------
 .../LoopVectorize/float-induction.ll          |   2 +-
 .../Transforms/LoopVectorize/induction.ll     |   4 +-
 .../LoopVectorize/reduction-inloop-uf4.ll     |  15 +-
 .../LoopVectorize/vplan-predicate-switch.ll   |   2 +-
 16 files changed, 183 insertions(+), 211 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index b578413ed79b95..4621c28b051298 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -74,11 +74,21 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
 }
 
 bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
-  // Loop invariants are uniform.
-  if (V->isDefinedOutsideLoopRegions())
+  using namespace VPlanPatternMatch;
+  // Live-ins are uniform.
+  if (V->isLiveIn())
     return true;
 
   VPRecipeBase *R = V->getDefiningRecipe();
+  if (R && V->isDefinedOutsideLoopRegions()) {
+    if (match(V->getDefiningRecipe(),
+              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
+                  m_VPValue())))
+      return false;
+    return all_of(R->operands(),
+                  [](VPValue *Op) { return isUniformAcrossVFsAndUFs(Op); });
+  }
+
   auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
   // Canonical IV chain is uniform.
   if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
index dc5cb246039e1d..0f33e8fa79ce7b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -162,22 +162,18 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[A]] to i16
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i16> poison, i16 [[B]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr <16 x i16> [[TMP6]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc nuw <16 x i16> [[TMP7]] to <16 x i8>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc nuw <16 x i16> [[TMP8]] to <16 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], <i16 8, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[INDEX]] to i64
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 16
-; CHECK-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP10]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    store <16 x i8> [[TMP10]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
@@ -244,15 +240,11 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 %
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[A]] to i16
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <16 x i16> [[TMP2]], <i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i16> [[TMP2]], <i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99, i16 99>
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP3]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[C]], <16 x i16> [[TMP5]], <16 x i16> [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[C]], <16 x i16> [[TMP6]], <16 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <16 x i16> [[TMP1]], <i16 99, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP4]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[C]], <16 x i16> [[TMP5]], <16 x i16> [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <16 x i16> [[TMP8]] to <16 x i8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index 631d04dbcb088b..623bc9d7f6b83e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -27,13 +27,9 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP18]] to i32
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32
 ; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP18]], [[CONV61]]
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP19]], [[CONV61]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[X]], [[TMP20]]
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[X]], [[TMP21]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -44,11 +40,11 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = sub i64 [[TMP12]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25:%.*]] = sub i64 [[TMP17]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = sub i64 [[TMP17]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP24]] to i32
 ; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP25]] to i32
 ; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP26]]
-; CHECK-NEXT:    [[TMP31:%.*]] = add i32 [[TMP29]], [[TMP27]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add i32 [[TMP28]], [[TMP27]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = sext i32 [[TMP30]] to i64
 ; CHECK-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP31]] to i64
 ; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 651fcf6bedc2d9..babbe3a17c4603 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -207,18 +207,16 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT40:%.*]] = insertelement <2 x i1> poison, i1 [[C_3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT41:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT40]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT56:%.*]] = insertelement <2 x i1> poison, i1 [[C_4]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT57:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT56]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE74:.*]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP47]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP47]], <i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
@@ -244,7 +242,7 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE47]]
 ; CHECK:       [[PRED_STORE_CONTINUE47]]:
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
@@ -276,7 +274,7 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <2 x i1> [[TMP47]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i1> [[TMP1]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i1> [[TMP47]], [[TMP21]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP20]], <2 x i64> zeroinitializer, <2 x i64> <i64 1, i64 1>
 ; CHECK-NEXT:    [[PREDPHI58:%.*]] = select <2 x i1> [[TMP21]], <2 x i64> zeroinitializer, <2 x i64> <i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[TMP22]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index e6c620018b7db4..68fac519714182 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -22,33 +22,22 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; DEFAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; DEFAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
 ; DEFAULT-NEXT:    [[TMP13:%.*]] = add i64 [[Y]], 1
-; DEFAULT-NEXT:    [[TMP14:%.*]] = add i64 [[Y]], 1
 ; DEFAULT-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP13]]
-; DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP14]]
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT6]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[TMP25:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP26:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP31:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP32:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 4 x i32> [[TMP25]], [[TMP31]]
-; DEFAULT-NEXT:    [[TMP34:%.*]] = or <vscale x 4 x i32> [[TMP26]], [[TMP32]]
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[TMP39:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]]
-; DEFAULT-NEXT:    [[TMP40:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]]
 ; DEFAULT-NEXT:    [[TMP41:%.*]] = and <vscale x 4 x i32> [[TMP39]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP42:%.*]] = and <vscale x 4 x i32> [[TMP40]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP43:%.*]] = xor <vscale x 4 x i32> [[TMP41]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; DEFAULT-NEXT:    [[TMP44:%.*]] = xor <vscale x 4 x i32> [[TMP42]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP45:%.*]] = zext <vscale x 4 x i32> [[TMP43]] to <vscale x 4 x i64>
-; DEFAULT-NEXT:    [[TMP46:%.*]] = zext <vscale x 4 x i32> [[TMP44]] to <vscale x 4 x i64>
 ; DEFAULT-NEXT:    [[TMP61:%.*]] = extractelement <vscale x 4 x i64> [[TMP45]], i32 0
 ; DEFAULT-NEXT:    [[TMP62:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP61]]
 ; DEFAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP62]], i64 0
 ; DEFAULT-NEXT:    [[TMP47:%.*]] = shufflevector <vscale x 4 x ptr> [[DOTSPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; DEFAULT-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP62]], i64 0
-; DEFAULT-NEXT:    [[TMP48:%.*]] = shufflevector <vscale x 4 x ptr> [[DOTSPLATINSERT3]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
 ; DEFAULT-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
 ; DEFAULT-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
@@ -78,11 +67,11 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; DEFAULT-NEXT:    [[TMP29:%.*]] = or <vscale x 4 x i32> [[TMP27]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP30:%.*]] = or <vscale x 4 x i32> [[TMP28]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP35:%.*]] = or <vscale x 4 x i32> [[TMP33]], [[TMP29]]
-; DEFAULT-NEXT:    [[TMP36:%.*]] = or <vscale x 4 x i32> [[TMP34]], [[TMP30]]
+; DEFAULT-NEXT:    [[TMP36:%.*]] = or <vscale x 4 x i32> [[TMP33]], [[TMP30]]
 ; DEFAULT-NEXT:    [[TMP37:%.*]] = or <vscale x 4 x i32> [[TMP35]], [[BROADCAST_SPLAT7]]
 ; DEFAULT-NEXT:    [[TMP38:%.*]] = or <vscale x 4 x i32> [[TMP36]], [[BROADCAST_SPLAT7]]
 ; DEFAULT-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP47]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; DEFAULT-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP48]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP47]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
 ; DEFAULT-NEXT:    [[TMP49:%.*]] = lshr <vscale x 4 x i32> [[TMP37]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP50:%.*]] = lshr <vscale x 4 x i32> [[TMP38]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; DEFAULT-NEXT:    [[TMP51:%.*]] = zext <vscale x 4 x i32> [[TMP49]] to <vscale x 4 x i64>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index d62e5991da0ad0..5a064dc45551ee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -169,9 +169,8 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
-; DEFAULT-NEXT:    [[TMP6:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
 ; DEFAULT-NEXT:    [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
-; DEFAULT-NEXT:    [[TMP9:%.*]] = and <16 x i8> [[TMP6]], [[TMP7]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
 ; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
 ; DEFAULT-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index 9a2fcf43c81575..9b7681d2d8d16c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -485,7 +485,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT:    [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -498,11 +497,11 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
 ; FIXED-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
-; FIXED-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
 ; FIXED-NEXT:    [[TMP10:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP8]]
 ; FIXED-NEXT:    [[TMP11:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]]
-; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -616,7 +615,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT:    [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -629,11 +627,11 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
 ; FIXED-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
-; FIXED-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
 ; FIXED-NEXT:    [[TMP10:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP8]]
 ; FIXED-NEXT:    [[TMP11:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]]
-; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 349fd13a58d29d..b6a9fed507acd3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -136,7 +136,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
+; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
 ; CHECK-NEXT:  Live-in vp<%0> = VF * UF
 ; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
 ; CHECK-NEXT:  vp<%2> = original trip-count
@@ -340,7 +340,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
+; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
 ; CHECK-NEXT:  Live-in vp<%0> = VF * UF
 ; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
 ; CHECK-NEXT:  vp<%2> = original trip-count
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index a87db82dbea118..e8adccca15ddbc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -859,8 +859,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i64>
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr <4 x i64> [[TMP0]], <i64 12, i64 12, i64 12, i64 12>
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i64> [[TMP0]], <i64 12, i64 12, i64 12, i64 12>
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -872,8 +872,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 4
-; CHECK-NEXT:    [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-NEXT:    [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index 6232c1ae752673..3338cc85772e43 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -124,8 +124,8 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
 ; AVX2-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; AVX2-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; AVX2-NEXT:    [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
-; AVX2-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; AVX2-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; AVX2-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
 ; AVX2-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
 ; AVX2-NEXT:    [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
index a5300af4d8caa4..f4865c816f2538 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -115,13 +115,7 @@ define i32 @uniform_address(ptr align(4) %addr, i32 %byte_offset) {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP3]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index b54d3cd7dd1850..ed23a0c38e7378 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -1291,19 +1291,18 @@ define i64 @constant_folded_previous_value() {
 ; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i64 0, 1
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i64 0, 1
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP1]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP0]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -2546,7 +2545,7 @@ define void @sink_dead_inst(ptr %a) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 -27>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 -27>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
 ; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
@@ -2574,17 +2573,17 @@ define void @sink_dead_inst(ptr %a) {
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
 ; UNROLL-NO-IC:       for.cond:
 ; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
 ; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
 ; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
 ; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
 ; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
@@ -2728,21 +2727,21 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE20:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE20]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE20]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE20]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; UNROLL-NO-IC-NEXT:    [[VEC_IV2:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 4, i32 5, i32 6, i32 7>
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT4]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV2]], [[BROADCAST_SPLAT4]]
+; UNROLL-NO-IC-NEXT:    [[VEC_IV4:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 4, i32 5, i32 6, i32 7>
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT6]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV4]], [[BROADCAST_SPLAT6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if:
@@ -2753,68 +2752,68 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC:       pred.udiv.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if5:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP9]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if7:
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = udiv i32 219220132, [[TMP10]]
 ; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
-; UNROLL-NO-IC:       pred.udiv.continue6:
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF5]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NO-IC:       pred.udiv.continue8:
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF7]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if7:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP14]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if9:
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], -2
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = udiv i32 219220132, [[TMP15]]
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
-; UNROLL-NO-IC:       pred.udiv.continue8:
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC:       pred.udiv.continue10:
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP17]], [[PRED_UDIV_IF9]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if9:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if11:
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP20]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
-; UNROLL-NO-IC:       pred.udiv.continue10:
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
+; UNROLL-NO-IC:       pred.udiv.continue12:
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP22]], [[PRED_UDIV_IF11]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if11:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if13:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], -4
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = udiv i32 219220132, [[TMP25]]
 ; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
-; UNROLL-NO-IC:       pred.udiv.continue12:
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
+; UNROLL-NO-IC:       pred.udiv.continue14:
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE12]] ], [ [[TMP27]], [[PRED_UDIV_IF13]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP29]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if13:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP29]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if15:
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], -5
 ; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = udiv i32 219220132, [[TMP30]]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
-; UNROLL-NO-IC:       pred.udiv.continue14:
-; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
+; UNROLL-NO-IC:       pred.udiv.continue16:
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP32]], [[PRED_UDIV_IF15]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if15:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP34]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if17:
 ; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = add i32 [[OFFSET_IDX]], -6
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = udiv i32 219220132, [[TMP35]]
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
-; UNROLL-NO-IC:       pred.udiv.continue16:
-; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NO-IC:       pred.udiv.continue18:
+; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP37]], [[PRED_UDIV_IF17]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
-; UNROLL-NO-IC:       pred.udiv.if17:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC:       pred.udiv.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE18]]
-; UNROLL-NO-IC:       pred.udiv.continue18:
-; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC:       pred.udiv.continue20:
+; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP42]], [[PRED_UDIV_IF19]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
@@ -2893,7 +2892,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]]
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF27]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
@@ -2910,7 +2909,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30:![0-9]+]], !llvm.loop [[LOOP31:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
 ; SINK-AFTER-LABEL: @sink_into_replication_region(
 ; SINK-AFTER-NEXT:  bb:
@@ -3037,11 +3036,11 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE31]] ]
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE30]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
@@ -3063,65 +3062,65 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC:       pred.udiv.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_UDIV_IF]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if4:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP16]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if3:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = udiv i32 219220132, [[TMP3]]
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP17]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
-; UNROLL-NO-IC:       pred.udiv.continue5:
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-IC:       pred.udiv.continue4:
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF3]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP20]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if6:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP20]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if5:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = udiv i32 219220132, [[TMP4]]
 ; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP21]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE7]]
-; UNROLL-NO-IC:       pred.udiv.continue7:
-; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF6]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE6]]
+; UNROLL-NO-IC:       pred.udiv.continue6:
+; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_UDIV_IF5]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if8:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP24]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if7:
 ; UNROLL-NO-IC-NEXT:    [[TMP25:%.*]] = udiv i32 219220132, [[TMP5]]
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
-; UNROLL-NO-IC:       pred.udiv.continue9:
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NO-IC:       pred.udiv.continue8:
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP26]], [[PRED_UDIV_IF7]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if10:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP28]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if9:
 ; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = udiv i32 219220132, [[TMP6]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE11]]
-; UNROLL-NO-IC:       pred.udiv.continue11:
-; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP30]], [[PRED_UDIV_IF10]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC:       pred.udiv.continue10:
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP30]], [[PRED_UDIV_IF9]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP32]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if12:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP32]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if11:
 ; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = udiv i32 219220132, [[TMP7]]
 ; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP33]], i32 1
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE13]]
-; UNROLL-NO-IC:       pred.udiv.continue13:
-; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP34]], [[PRED_UDIV_IF12]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE12]]
+; UNROLL-NO-IC:       pred.udiv.continue12:
+; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP34]], [[PRED_UDIV_IF11]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP36]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if14:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP36]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if13:
 ; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = udiv i32 219220132, [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP37]], i32 2
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE15]]
-; UNROLL-NO-IC:       pred.udiv.continue15:
-; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP38]], [[PRED_UDIV_IF14]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE14]]
+; UNROLL-NO-IC:       pred.udiv.continue14:
+; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP38]], [[PRED_UDIV_IF13]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP40]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
-; UNROLL-NO-IC:       pred.udiv.if16:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP40]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; UNROLL-NO-IC:       pred.udiv.if15:
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE17]]
-; UNROLL-NO-IC:       pred.udiv.continue17:
-; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP42]], [[PRED_UDIV_IF16]] ]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
+; UNROLL-NO-IC:       pred.udiv.continue16:
+; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP42]], [[PRED_UDIV_IF15]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP27]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP27]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
-; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI3]], [[TMP45]]
+; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI2]], [[TMP45]]
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP48]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NO-IC:       pred.store.if:
@@ -3131,63 +3130,63 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; UNROLL-NO-IC:       pred.store.continue:
 ; UNROLL-NO-IC-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP51]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
-; UNROLL-NO-IC:       pred.store.if18:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP51]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; UNROLL-NO-IC:       pred.store.if17:
 ; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = add i32 [[INDEX]], 1
 ; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP52]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP53]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE19]]
-; UNROLL-NO-IC:       pred.store.continue19:
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; UNROLL-NO-IC:       pred.store.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
-; UNROLL-NO-IC:       pred.store.if20:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; UNROLL-NO-IC:       pred.store.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP55]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP4]], ptr [[TMP56]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE21]]
-; UNROLL-NO-IC:       pred.store.continue21:
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; UNROLL-NO-IC:       pred.store.continue20:
 ; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; UNROLL-NO-IC:       pred.store.if22:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP57]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; UNROLL-NO-IC:       pred.store.if21:
 ; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = add i32 [[INDEX]], 3
 ; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP58]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP5]], ptr [[TMP59]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; UNROLL-NO-IC:       pred.store.continue23:
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; UNROLL-NO-IC:       pred.store.continue22:
 ; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; UNROLL-NO-IC:       pred.store.if24:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; UNROLL-NO-IC:       pred.store.if23:
 ; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = add i32 [[INDEX]], 4
 ; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP61]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP6]], ptr [[TMP62]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; UNROLL-NO-IC:       pred.store.continue25:
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; UNROLL-NO-IC:       pred.store.continue24:
 ; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP63]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; UNROLL-NO-IC:       pred.store.if26:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP63]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; UNROLL-NO-IC:       pred.store.if25:
 ; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = add i32 [[INDEX]], 5
 ; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP64]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP7]], ptr [[TMP65]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; UNROLL-NO-IC:       pred.store.continue27:
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; UNROLL-NO-IC:       pred.store.continue26:
 ; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; UNROLL-NO-IC:       pred.store.if28:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; UNROLL-NO-IC:       pred.store.if27:
 ; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = add i32 [[INDEX]], 6
 ; UNROLL-NO-IC-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP67]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP8]], ptr [[TMP68]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; UNROLL-NO-IC:       pred.store.continue29:
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE28]]
+; UNROLL-NO-IC:       pred.store.continue28:
 ; UNROLL-NO-IC-NEXT:    [[TMP69:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
-; UNROLL-NO-IC:       pred.store.if30:
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP69]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; UNROLL-NO-IC:       pred.store.if29:
 ; UNROLL-NO-IC-NEXT:    [[TMP70:%.*]] = add i32 [[INDEX]], 7
 ; UNROLL-NO-IC-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP70]]
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP9]], ptr [[TMP71]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; UNROLL-NO-IC:       pred.store.continue31:
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE30]]
+; UNROLL-NO-IC:       pred.store.continue30:
 ; UNROLL-NO-IC-NEXT:    [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT:    [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI3]]
+; UNROLL-NO-IC-NEXT:    [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI2]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
 ; UNROLL-NO-IC-NEXT:    [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -3277,10 +3276,10 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI2]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP32:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]]
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29]]
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF27]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
@@ -3302,7 +3301,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
 ; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]]
 ;
 ; SINK-AFTER-LABEL: @sink_into_replication_region_multiple(
 ; SINK-AFTER-NEXT:  bb:
@@ -3531,7 +3530,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3551,7 +3550,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
 ; UNROLL-NO-VF-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
 ; UNROLL-NO-VF-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[FOR_LCSSA]]
@@ -3683,7 +3682,7 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-VF-NEXT:    [[TMP5]] = add i16 [[TMP3]], 5
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3697,7 +3696,7 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
 ; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
 ; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP37:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
 ; UNROLL-NO-VF-NEXT:    ret void
 ;
@@ -3791,7 +3790,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3804,7 +3803,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-VF-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-VF-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-VF:       exit:
 ; UNROLL-NO-VF-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index f9a261e7f8b7b1..9fbab0abbfd3b5 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -840,7 +840,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000
 ; VEC4_INTERL2-NEXT:    [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]]
-; VEC4_INTERL2-NEXT:    [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]]
+; VEC4_INTERL2-NEXT:    [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]]
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
 ; VEC4_INTERL2-NEXT:    [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST]], <float 4.000000e+00, float 4
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 1c2aa79936e2a8..59ef4b8b162f76 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -6378,7 +6378,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
 ; UNROLL-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; UNROLL-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
+; UNROLL-NEXT:    [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]]
 ; UNROLL-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
 ; UNROLL-NEXT:    [[TMP16:%.*]] = shl <2 x i32> [[DOTSPLATINSERT2]], <i32 1, i32 0>
 ; UNROLL-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> poison, <2 x i32> zeroinitializer
@@ -6536,7 +6536,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -8
 ; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; INTERLEAVE-NEXT:    [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
+; INTERLEAVE-NEXT:    [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]]
 ; INTERLEAVE-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
 ; INTERLEAVE-NEXT:    [[TMP16:%.*]] = shl <4 x i32> [[DOTSPLATINSERT2]], <i32 2, i32 0, i32 0, i32 0>
 ; INTERLEAVE-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
index c8430d7526b2d3..99a0b708035884 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -302,11 +302,8 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i64 [[N]], -1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT7]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -324,10 +321,10 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index 60827160dcb329..a1e28999a40026 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug -disable-output %s 2>&1 | FileCheck %s
 
 define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
-; CHECK:      VPlan 'Final VPlan for VF={2},UF>=1' {
+; CHECK:      VPlan 'Final VPlan for VF={2},UF={1}' {
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count

>From 6ce7bf8560d6d846139350eb0125109ca6cdc6d5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 20 Sep 2024 19:36:24 +0100
Subject: [PATCH 14/15] !fixup address comments, thanks!

---
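Note (after the fold, not part of the patch): a minimal sketch of how the planner is expected to drive the new entry point. `BestVPlan`, `BestUF` and `OrigLoop` are illustrative stand-ins for the planner's chosen plan, unroll factor and original loop, not names introduced by this patch; the entry point itself, VPlanTransforms::unrollByUF, is the one declared here.

  // Illustrative sketch only: unroll the chosen plan by the selected UF
  // as a VPlan-to-VPlan transform, before the plan is executed.
  VPlanTransforms::unrollByUF(BestVPlan, BestUF,
                              OrigLoop->getHeader()->getContext());
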
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 .../Vectorize/LoopVectorizationPlanner.h      |  17 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 447 ----------------
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 475 ++++++++++++++++++
 4 files changed, 485 insertions(+), 455 deletions(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index f33906b05fedd1..8bd3dbf0695733 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_component_library(LLVMVectorize
   VPlanRecipes.cpp
   VPlanSLP.cpp
   VPlanTransforms.cpp
+  VPlanUnroll.cpp
   VPlanVerifier.cpp
   VPlanUtils.cpp
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 8300db1dd14b13..034fdf4233de37 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -156,6 +156,15 @@ class VPBuilder {
                               DebugLoc DL, const Twine &Name = "") {
     return createInstruction(Opcode, Operands, DL, Name);
   }
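+  /// Create an N-ary VPInstruction with \p Opcode and \p Operands, attaching
+  /// the fast-math flags \p FMFs if provided.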
+  VPInstruction *createNaryOp(unsigned Opcode,
+                              std::initializer_list<VPValue *> Operands,
+                              std::optional<FastMathFlags> FMFs = {},
+                              DebugLoc DL = {}, const Twine &Name = "") {
+    if (FMFs)
+      return tryInsertInstruction(
+          new VPInstruction(Opcode, Operands, *FMFs, DL, Name));
+    return createInstruction(Opcode, Operands, DL, Name);
+  }
 
   VPInstruction *createOverflowingOp(unsigned Opcode,
                                      std::initializer_list<VPValue *> Operands,
@@ -165,14 +174,6 @@ class VPBuilder {
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
 
-  VPInstruction *createFPOp(unsigned Opcode,
-                            std::initializer_list<VPValue *> Operands,
-                            FastMathFlags FMFs = {}, DebugLoc DL = {},
-                            const Twine &Name = "") {
-    return tryInsertInstruction(
-        new VPInstruction(Opcode, Operands, FMFs, DL, Name));
-  }
-
   VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
                      const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dbfdf2a131165f..ba15b07dfd2d7b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1619,450 +1619,3 @@ void VPlanTransforms::createInterleaveGroups(
       }
   }
 }
-
-namespace {
-
-/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
-/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
-/// the unrolling transformation, where the original VPValues are retained for
-/// part zero.
-class UnrollState {
-  /// Plan to unroll.
-  VPlan &Plan;
-  /// Unroll factor to unroll by.
-  const unsigned UF;
-  /// Analysis for types.
-  VPTypeAnalysis TypeInfo;
-
-  /// Unrolling may create recipes that should not be unrolled themselves.
-  /// Those are tracked in ToSkip.
-  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
-
-  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
-  // ..., UF-1.
-  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
-
-  void unrollReplicateRegion(VPRegionBlock *VPR);
-  void unrollRecipe(VPRecipeBase &R);
-  void unrollHeaderPHI(VPHeaderPHIRecipe *R, VPBasicBlock::iterator InsertPtForPhi);
-  void unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
-                            VPBasicBlock::iterator InsertPtForPhi);
-
-  VPValue *getConstantVPV(unsigned Part) {
-    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
-    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
-  }
-
-public:
-  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
-      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}
-
-  void unrollBlock(VPBlockBase *VPB);
-
-  VPValue *getValueForPart(VPValue *V, unsigned Part) {
-    if (Part == 0 || V->isLiveIn())
-      return V;
-    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
-           "accessed value does not exist");
-    return VPV2Parts[V][Part - 1];
-  }
-
-  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
-  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
-  /// corresponding VPValue defined by \p CopyR.
-  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
-                        unsigned Part) {
-    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
-      auto Ins = VPV2Parts.insert({VPV, {}});
-      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
-      Ins.first->second.push_back(CopyR->getVPValue(Idx));
-    }
-  }
-
-  /// Given a uniform recipe \p R, add it for all parts.
-  void addUniformForAllParts(VPSingleDefRecipe *R) {
-    auto Ins = VPV2Parts.insert({R, {}});
-    assert(Ins.second && "uniform value already added");
-    for (unsigned Part = 0; Part != UF; ++Part)
-      Ins.first->second.push_back(R);
-  }
-
-  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
-
-  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
-  /// \p P.
-  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
-    auto *Op = R->getOperand(OpIdx);
-    R->setOperand(OpIdx, getValueForPart(Op, Part));
-  }
-
-  /// Update \p R's operands with their corresponding VPValues for part \p P.
-  void remapOperands(VPRecipeBase *R, unsigned Part) {
-    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
-      R->setOperand(OpIdx, getValueForPart(Op, Part));
-  }
-};
-} // namespace
-
-void UnrollState::unrollReplicateRegion(VPRegionBlock *VPR) {
-  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
-  for (unsigned Part = 1; Part != UF; ++Part) {
-    auto *Copy = VPR->clone();
-    VPBlockUtils::insertBlockBefore(Copy, InsertPt);
-
-    auto PartI = vp_depth_first_shallow(Copy->getEntry());
-    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
-    for (const auto &[PartIVPBB, Part0VPBB] :
-         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
-             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
-      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
-        remapOperands(&PartIR, Part);
-        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
-          ScalarIVSteps->addOperand(getConstantVPV(Part));
-        }
-
-        addRecipeForPart(&Part0R, &PartIR, Part);
-      }
-    }
-  }
-}
-
-void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
-                                       VPBasicBlock::iterator InsertPtForPhi) {
-  VPBasicBlock *PH = cast<VPBasicBlock>(
-      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
-  Type *IVTy = TypeInfo.inferScalarType(IV);
-  auto &ID = IV->getInductionDescriptor();
-  FastMathFlags FMFs;
-  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
-    FMFs = ID.getInductionBinOp()->getFastMathFlags();
-
-  VPValue *VectorStep = &Plan.getVF();
-  VPBuilder Builder(PH);
-  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
-    Instruction::CastOps CastOp =
-        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
-    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
-    ToSkip.insert(VectorStep->getDefiningRecipe());
-  }
-
-  VPValue *ScalarStep = IV->getStepValue();
-  auto *ConstStep = ScalarStep->isLiveIn()
-                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
-                        : nullptr;
-  if (!ConstStep || ConstStep->getZExtValue() != 1) {
-    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
-      ScalarStep =
-          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
-      ToSkip.insert(ScalarStep->getDefiningRecipe());
-    }
-
-    VPInstruction *Mul;
-    if (IVTy->isFloatingPointTy())
-      Mul = Builder.createFPOp(Instruction::FMul, {VectorStep, ScalarStep},
-                               FMFs, IV->getDebugLoc());
-    else
-      Mul = Builder.createNaryOp(Instruction::Mul, {VectorStep, ScalarStep},
-                                 IV->getDebugLoc());
-    VectorStep = Mul;
-    ToSkip.insert(Mul);
-  }
-
-  // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
-  // remains the header phi. Parts > 0 are computed by adding Step to the
-  // previous part. The header phi recipe will get 2 new operands: the step
-  // value for a single part and the last part, used to compute the backedge
-  // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 =
-  // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
-  // %Part.1 = %Part.0 + %VectorStep
-  // %Part.2 = %Part.1 + %VectorStep
-  // %Part.3 = %Part.2 + %VectorStep
-  //
-  // The newly added recipes are added to ToSkip to avoid interleaving them
-  // again.
-  VPValue *Prev = IV;
-  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
-  for (unsigned Part = 1; Part != UF; ++Part) {
-    VPInstruction *Add;
-    std::string Name =
-        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";
-
-    if (IVTy->isFloatingPointTy())
-      Add = Builder.createFPOp(ID.getInductionOpcode(),
-                               {
-                                   Prev,
-                                   VectorStep,
-                               },
-                               FMFs, IV->getDebugLoc(), Name);
-    else
-      Add = Builder.createNaryOp(Instruction::Add,
-                                 {
-                                     Prev,
-                                     VectorStep,
-                                 },
-                                 IV->getDebugLoc(), Name);
-    ToSkip.insert(Add);
-    addRecipeForPart(IV, Add, Part);
-    Prev = Add;
-  }
-  IV->addOperand(VectorStep);
-  IV->addOperand(Prev);
-}
-
-void UnrollState::unrollHeaderPHI(VPHeaderPHIRecipe *R,
-                                  VPBasicBlock::iterator InsertPtForPhi) {
-  // First-order recurrences pass a single vector or scalar through their header
-  // phis, irrespective of interleaving.
-  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
-    return;
-
-  // Generate step vectors for each unrolled part.
-  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
-    unrollWidenInduction(IV, InsertPtForPhi);
-    return;
-  }
-
-  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
-  if (RdxPhi && RdxPhi->isOrdered())
-    return;
-
-  auto InsertPt = std::next(R->getIterator());
-  for (unsigned Part = 1; Part != UF; ++Part) {
-    VPRecipeBase *Copy = R->clone();
-    Copy->insertBefore(*R->getParent(), InsertPt);
-    addRecipeForPart(R, Copy, Part);
-    if (isa<VPWidenPointerInductionRecipe>(R)) {
-      Copy->addOperand(R);
-      Copy->addOperand(getConstantVPV(Part));
-    } else if (RdxPhi) {
-      Copy->addOperand(getConstantVPV(Part));
-    } else {
-      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
-             "unexpected header phi recipe not needing unrolled part");
-    }
-  }
-}
-
-/// Handle non-header-phi recipes.
-void UnrollState::unrollRecipe(VPRecipeBase &R) {
-  using namespace llvm::VPlanPatternMatch;
-  if (match(&R, m_BranchOnCond(m_VPValue())) ||
-      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
-    return;
-
-  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
-    VPValue *Op0;
-    VPValue *Op1;
-    if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
-                       m_VPValue(Op0), m_VPValue(Op1)))) {
-      VPI->setOperand(1, getValueForPart(Op1, UF - 1));
-      addUniformForAllParts(VPI);
-      if (Plan.hasScalarVFOnly()) {
-        // Extracting from end with VF = 1 implies retrieving the scalar part UF
-        // - Op1.
-        unsigned Offset =
-            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
-        VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
-      } else {
-        // Otherwise we extract from the last part.
-        remapOperands(VPI, UF - 1);
-      }
-      return;
-    }
-
-    if (vputils::onlyFirstPartUsed(VPI)) {
-      addUniformForAllParts(cast<VPInstruction>(&R));
-      return;
-    }
-    if (match(VPI, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
-                       m_VPValue()))) {
-      addUniformForAllParts(VPI);
-      return;
-    }
-  }
-  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
-    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
-        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
-      // Stores to an invariant address only need to store the last part.
-      remapOperands(&R, UF - 1);
-      return;
-    }
-    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
-      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
-        addUniformForAllParts(RepR);
-        return;
-      }
-    }
-  }
-
-  // Unroll non-uniform recipes.
-  auto InsertPt = std::next(R.getIterator());
-  VPBasicBlock &VPBB = *R.getParent();
-  for (unsigned Part = 1; Part != UF; ++Part) {
-    VPRecipeBase *Copy = R.clone();
-    Copy->insertBefore(VPBB, InsertPt);
-    addRecipeForPart(&R, Copy, Part);
-
-    VPValue *Op;
-    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
-                      m_VPValue(), m_VPValue(Op)))) {
-      Copy->setOperand(0, getValueForPart(Op, Part - 1));
-      Copy->setOperand(1, getValueForPart(Op, Part));
-      continue;
-    }
-    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
-      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
-      if (Phi->isOrdered()) {
-        auto Ins = VPV2Parts.insert({Phi, {}});
-        if (Part == 1) {
-          Ins.first->second.clear();
-          Ins.first->second.push_back(Red);
-        }
-        Ins.first->second.push_back(Copy->getVPSingleValue());
-        Phi->setOperand(1, Copy->getVPSingleValue());
-      }
-    }
-    remapOperands(Copy, Part);
-
-    // Add operand indicating the part to generate code for, to recipes still
-    // requiring it.
-    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
-            VPVectorPointerRecipe>(Copy) ||
-        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
-                        m_VPValue())))
-      Copy->addOperand(getConstantVPV(Part));
-
-    if (isa<VPVectorPointerRecipe>(R))
-      Copy->setOperand(0, R.getOperand(0));
-  }
-}
-
-using namespace llvm::VPlanPatternMatch;
-void UnrollState::unrollBlock(VPBlockBase *VPB) {
-  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
-  if (VPR) {
-    if (VPR->isReplicator())
-      return unrollReplicateRegion(VPR);
-
-    // Traverse blocks in region in RPO to ensure defs are visited before uses
-    // across blocks.
-    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
-        RPOT(VPR->getEntry());
-    for (VPBlockBase *VPB : RPOT)
-      unrollBlock(VPB);
-    return;
-  }
-
-  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
-  auto *VPBB = cast<VPBasicBlock>(VPB);
-  auto InsertPtForPhi = VPBB->getFirstNonPhi();
-  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
-      continue;
-
-    // Add all VPValues for all parts to ComputeReductionResult which combines
-    // the parts to compute the final reduction value.
-    VPValue *Op1;
-    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
-                      m_VPValue(), m_VPValue(Op1)))) {
-      addUniformForAllParts(cast<VPInstruction>(&R));
-      for (unsigned Part = 1; Part != UF; ++Part)
-        R.addOperand(getValueForPart(Op1, Part));
-      continue;
-    }
-    VPValue *Op0;
-    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
-                      m_VPValue(Op0), m_VPValue(Op1)))) {
-      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
-      if (Plan.hasScalarVFOnly()) {
-        // Extracting from end with VF = 1 implies retrieving the scalar part UF
-        // - Op1.
-        unsigned Offset =
-            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
-        R.getVPSingleValue()->replaceAllUsesWith(
-            getValueForPart(Op0, UF - Offset));
-        R.eraseFromParent();
-      } else {
-        // Otherwise we extract from the last part.
-        remapOperands(&R, UF - 1);
-      }
-      continue;
-    }
-
-    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
-    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
-      addUniformForAllParts(SingleDef);
-      continue;
-    }
-
-    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
-      unrollHeaderPHI(H, InsertPtForPhi);
-      continue;
-    }
-
-    unrollRecipe(R);
-  }
-}
-
-void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
-  assert(UF > 0 && "Unroll factor must be positive");
-  Plan.setUF(UF);
-  auto Cleanup = make_scope_exit([&Plan]() {
-    auto Iter = vp_depth_first_deep(Plan.getEntry());
-    // Remove recipes that are redundant after unrolling.
-    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
-      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-        auto *VPI = dyn_cast<VPInstruction>(&R);
-        if (VPI &&
-            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
-            VPI->getNumOperands() == 1) {
-          VPI->replaceAllUsesWith(VPI->getOperand(0));
-          VPI->eraseFromParent();
-        }
-      }
-    }
-  });
-  if (UF == 1) {
-    return;
-  }
-
-  UnrollState Unroller(Plan, UF, Ctx);
-
-  Unroller.unrollBlock(Plan.getPreheader());
-
-  // Iterate over all blocks in the plan starting from Entry, and unroll
-  // recipes inside them. This includes the vector preheader and middle blocks,
-  // which may set up or post-process per-part values.
-  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
-      Plan.getEntry());
-  for (VPBlockBase *VPB : RPOT)
-    Unroller.unrollBlock(VPB);
-
-  unsigned Part = 1;
-  // Remap operands of cloned header phis to update backedge values. The header
-  // phis cloned during unrolling are just after the header phi for part 0.
-  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
-  for (VPRecipeBase &H :
-       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    // The second operand of Fixed Order Recurrence phi's, feeding the spliced
-    // value across the backedge, needs to remap to the last part of the spliced
-    // value.
-    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
-      Unroller.remapOperand(&H, 1, UF - 1);
-      continue;
-    }
-    if (Unroller.contains(H.getVPSingleValue()) ||
-        isa<VPWidenPointerInductionRecipe>(&H)) {
-      Part = 1;
-      continue;
-    }
-    Unroller.remapOperands(&H, Part);
-    Part++;
-  }
-
-  // Remap the operand of live-outs to the last part.
-  for (const auto &[_, LO] : Plan.getLiveOuts()) {
-    VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1);
-    LO->setOperand(0, In);
-  }
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
new file mode 100644
index 00000000000000..0833111717fb80
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -0,0 +1,475 @@
+//===-- VPlanUnroll.cpp - VPlan unrolling transformations ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements explicit unrolling of VPlans by their unroll factor.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPRecipeBuilder.h"
+#include "VPlan.h"
+#include "VPlanAnalysis.h"
+#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
+#include "VPlanPatternMatch.h"
+#include "VPlanTransforms.h"
+#include "VPlanUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
+/// UF and, for each VPValue of part 0, the copies created for unroll parts 1
+/// to UF - 1; the original VPValues are retained for part zero.
+class UnrollState {
+  /// Plan to unroll.
+  VPlan &Plan;
+  /// Unroll factor to unroll by.
+  const unsigned UF;
+  /// Analysis for types.
+  VPTypeAnalysis TypeInfo;
+
+  /// Unrolling may create recipes that should not be unrolled themselves.
+  /// Those are tracked in ToSkip.
+  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
+
+  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
+  // ..., UF-1.
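+  // For example, with UF = 4, VPV2Parts[V] holds the three copies of V used
+  // for parts 1, 2 and 3; part 0 continues to use V itself.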
+  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
+
+  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
+  void unrollReplicateRegionByUF(VPRegionBlock *VPR);
+
+  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
+  /// all parts.
+  void unrollRecipeByUF(VPRecipeBase &R);
+
+  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
+  /// depends on the concrete header phi. Inserts newly created recipes at \p
+  /// InsertPtForPhi.
+  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
+                           VPBasicBlock::iterator InsertPtForPhi);
+
+  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
+  /// the induction steps for each part.
+  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
+                                VPBasicBlock::iterator InsertPtForPhi);
+
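+  /// Return a live-in VPValue wrapping the constant \p Part, of the canonical
+  /// IV's scalar type.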
+  VPValue *getConstantVPV(unsigned Part) {
+    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+  }
+
+public:
+  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
+      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}
+
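+  /// Unroll \p VPB: clone replicate regions once per part, recurse into other
+  /// regions, and unroll the recipes of basic blocks.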
+  void unrollBlock(VPBlockBase *VPB);
+
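+  /// Return the VPValue to use for \p V in unroll part \p Part. Part 0 and
+  /// live-ins are not copied, so \p V itself is returned for them.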
+  VPValue *getValueForPart(VPValue *V, unsigned Part) {
+    if (Part == 0 || V->isLiveIn())
+      return V;
+    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
+           "accessed value does not exist");
+    return VPV2Parts[V][Part - 1];
+  }
+
+  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
+  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
+  /// corresponding VPValue defined by \p CopyR.
+  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
+                        unsigned Part) {
+    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
+      auto Ins = VPV2Parts.insert({VPV, {}});
+      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
+      Ins.first->second.push_back(CopyR->getVPValue(Idx));
+    }
+  }
+
+  /// Given a uniform recipe \p R, add it for all parts.
+  void addUniformForAllParts(VPSingleDefRecipe *R) {
+    auto Ins = VPV2Parts.insert({R, {}});
+    assert(Ins.second && "uniform value already added");
+    for (unsigned Part = 0; Part != UF; ++Part)
+      Ins.first->second.push_back(R);
+  }
+
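+  /// Return true if per-part copies have been recorded for \p VPV.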
+  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
+
+  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for
+  /// part \p Part.
+  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
+    auto *Op = R->getOperand(OpIdx);
+    R->setOperand(OpIdx, getValueForPart(Op, Part));
+  }
+
+  /// Update \p R's operands with their corresponding VPValues for part
+  /// \p Part.
+  void remapOperands(VPRecipeBase *R, unsigned Part) {
+    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
+      R->setOperand(OpIdx, getValueForPart(Op, Part));
+  }
+};
+} // namespace
+
+void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
+  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    auto *Copy = VPR->clone();
+    VPBlockUtils::insertBlockBefore(Copy, InsertPt);
+
+    auto PartI = vp_depth_first_shallow(Copy->getEntry());
+    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
+    for (const auto &[PartIVPBB, Part0VPBB] :
+         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
+             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
+      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
+        remapOperands(&PartIR, Part);
+        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
+          ScalarIVSteps->addOperand(getConstantVPV(Part));
+        }
+
+        addRecipeForPart(&Part0R, &PartIR, Part);
+      }
+    }
+  }
+}
+
+void UnrollState::unrollWidenInductionByUF(
+    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
+  VPBasicBlock *PH = cast<VPBasicBlock>(
+      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
+  Type *IVTy = TypeInfo.inferScalarType(IV);
+  auto &ID = IV->getInductionDescriptor();
+  std::optional<FastMathFlags> FMFs;
+  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
+    FMFs = ID.getInductionBinOp()->getFastMathFlags();
+
+  VPValue *VectorStep = &Plan.getVF();
+  VPBuilder Builder(PH);
+  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
+    Instruction::CastOps CastOp =
+        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
+    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
+    ToSkip.insert(VectorStep->getDefiningRecipe());
+  }
+
+  VPValue *ScalarStep = IV->getStepValue();
+  auto *ConstStep = ScalarStep->isLiveIn()
+                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
+                        : nullptr;
+  if (!ConstStep || ConstStep->getZExtValue() != 1) {
+    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
+      ScalarStep =
+          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
+      ToSkip.insert(ScalarStep->getDefiningRecipe());
+    }
+
+    unsigned MulOpc =
+        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
+    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
+                                              FMFs, IV->getDebugLoc());
+    VectorStep = Mul;
+    ToSkip.insert(Mul);
+  }
+
+  // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
+  // remains the header phi. Parts > 0 are computed by adding Step to the
+  // previous part. The header phi recipe will get 2 new operands: the step
+  // value for a single part and the last part, used to compute the backedge
+  // value during VPWidenIntOrFpInductionRecipe::execute:
+  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
+  //             %VectorStep, %Part.3
+  //   %Part.1 = %Part.0 + %VectorStep
+  //   %Part.2 = %Part.1 + %VectorStep
+  //   %Part.3 = %Part.2 + %VectorStep
+  //
+  // The newly added recipes are added to ToSkip to avoid unrolling them
+  // again.
+  VPValue *Prev = IV;
+  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
+  unsigned AddOpc =
+      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    std::string Name =
+        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";
+
+    VPInstruction *Add = Builder.createNaryOp(AddOpc,
+                                              {
+                                                  Prev,
+                                                  VectorStep,
+                                              },
+                                              FMFs, IV->getDebugLoc(), Name);
+    ToSkip.insert(Add);
+    addRecipeForPart(IV, Add, Part);
+    Prev = Add;
+  }
+  IV->addOperand(VectorStep);
+  IV->addOperand(Prev);
+}
+
+void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
+                                      VPBasicBlock::iterator InsertPtForPhi) {
+  // First-order recurrences pass a single vector or scalar through their
+  // header phis, irrespective of unrolling.
+  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
+    return;
+
+  // Generate step vectors for each unrolled part.
+  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
+    unrollWidenInductionByUF(IV, InsertPtForPhi);
+    return;
+  }
+
+  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
+  if (RdxPhi && RdxPhi->isOrdered())
+    return;
+
+  auto InsertPt = std::next(R->getIterator());
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    VPRecipeBase *Copy = R->clone();
+    Copy->insertBefore(*R->getParent(), InsertPt);
+    addRecipeForPart(R, Copy, Part);
+    if (isa<VPWidenPointerInductionRecipe>(R)) {
+      Copy->addOperand(R);
+      Copy->addOperand(getConstantVPV(Part));
+    } else if (RdxPhi) {
+      Copy->addOperand(getConstantVPV(Part));
+    } else {
+      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
+             "unexpected header phi recipe not needing unrolled part");
+    }
+  }
+}
+
+/// Handle non-header-phi recipes.
+void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
+  using namespace llvm::VPlanPatternMatch;
+  if (match(&R, m_BranchOnCond(m_VPValue())) ||
+      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+    return;
+
+  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+    VPValue *Op0, *Op1;
+    if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                       m_VPValue(Op0), m_VPValue(Op1)))) {
+      VPI->setOperand(1, getValueForPart(Op1, UF - 1));
+      addUniformForAllParts(VPI);
+      if (Plan.hasScalarVFOnly()) {
+        // Extracting from end with VF = 1 implies retrieving the scalar part
+        // UF - Op1.
+        unsigned Offset =
+            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
+        VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
+      } else {
+        // Otherwise we extract from the last part.
+        remapOperands(VPI, UF - 1);
+      }
+      return;
+    }
+
+    if (vputils::onlyFirstPartUsed(VPI)) {
+      addUniformForAllParts(VPI);
+      return;
+    }
+  }
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
+        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
+      // Stores to an invariant address only need to store the last part.
+      remapOperands(&R, UF - 1);
+      return;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
+      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
+        addUniformForAllParts(RepR);
+        return;
+      }
+    }
+  }
+
+  // Unroll non-uniform recipes.
+  auto InsertPt = std::next(R.getIterator());
+  VPBasicBlock &VPBB = *R.getParent();
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    VPRecipeBase *Copy = R.clone();
+    Copy->insertBefore(VPBB, InsertPt);
+    addRecipeForPart(&R, Copy, Part);
+
+    VPValue *Op;
+    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
+                      m_VPValue(), m_VPValue(Op)))) {
+      Copy->setOperand(0, getValueForPart(Op, Part - 1));
+      Copy->setOperand(1, getValueForPart(Op, Part));
+      continue;
+    }
+    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
+      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
+      if (Phi->isOrdered()) {
+        auto Ins = VPV2Parts.insert({Phi, {}});
+        if (Part == 1) {
+          Ins.first->second.clear();
+          Ins.first->second.push_back(Red);
+        }
+        Ins.first->second.push_back(Copy->getVPSingleValue());
+        Phi->setOperand(1, Copy->getVPSingleValue());
+      }
+    }
+    remapOperands(Copy, Part);
+
+    // Add operand indicating the part to generate code for, to recipes still
+    // requiring it.
+    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
+            VPVectorPointerRecipe>(Copy) ||
+        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
+                        m_VPValue())))
+      Copy->addOperand(getConstantVPV(Part));
+
+    if (isa<VPVectorPointerRecipe>(R))
+      Copy->setOperand(0, R.getOperand(0));
+  }
+}
+
+using namespace llvm::VPlanPatternMatch;
+void UnrollState::unrollBlock(VPBlockBase *VPB) {
+  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
+  if (VPR) {
+    if (VPR->isReplicator())
+      return unrollReplicateRegionByUF(VPR);
+
+    // Traverse blocks in region in RPO to ensure defs are visited before uses
+    // across blocks.
+    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+        RPOT(VPR->getEntry());
+    for (VPBlockBase *VPB : RPOT)
+      unrollBlock(VPB);
+    return;
+  }
+
+  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
+  auto *VPBB = cast<VPBasicBlock>(VPB);
+  auto InsertPtForPhi = VPBB->getFirstNonPhi();
+  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
+      continue;
+
+    // Add the VPValues for all parts to ComputeReductionResult, which
+    // combines the parts into the final reduction value.
+    VPValue *Op1;
+    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
+                      m_VPValue(), m_VPValue(Op1)))) {
+      addUniformForAllParts(cast<VPInstruction>(&R));
+      for (unsigned Part = 1; Part != UF; ++Part)
+        R.addOperand(getValueForPart(Op1, Part));
+      continue;
+    }
+    VPValue *Op0;
+    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                      m_VPValue(Op0), m_VPValue(Op1)))) {
+      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
+      if (Plan.hasScalarVFOnly()) {
+        // With VF = 1, extracting from the end means retrieving the scalar
+        // part UF - Op1, e.g. part UF - 1 for the last element (Op1 == 1).
+        unsigned Offset =
+            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
+        R.getVPSingleValue()->replaceAllUsesWith(
+            getValueForPart(Op0, UF - Offset));
+        R.eraseFromParent();
+      } else {
+        // Otherwise we extract from the last part.
+        remapOperands(&R, UF - 1);
+      }
+      continue;
+    }
+
+    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
+    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
+      addUniformForAllParts(SingleDef);
+      continue;
+    }
+
+    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
+      unrollHeaderPHIByUF(H, InsertPtForPhi);
+      continue;
+    }
+
+    unrollRecipeByUF(R);
+  }
+}
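
Likewise, addUniformForAllParts only needs to record that a single recipe
serves every part. A sketch under that assumption, using the VPV2Parts
mapping also visible in the ordered-reduction handling above:

  // Sketch: map all UF parts of R to the single part-0 recipe, so that
  // getValueForPart returns the same VPValue for every part.
  void UnrollState::addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }
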
+
+void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
+  assert(UF > 0 && "Unroll factor must be positive");
+  Plan.setUF(UF);
+  auto Cleanup = make_scope_exit([&Plan]() {
+    auto Iter = vp_depth_first_deep(Plan.getEntry());
+    // Remove recipes that are redundant after unrolling.
+    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+        auto *VPI = dyn_cast<VPInstruction>(&R);
+        if (VPI &&
+            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
+            VPI->getNumOperands() == 1) {
+          VPI->replaceAllUsesWith(VPI->getOperand(0));
+          VPI->eraseFromParent();
+        }
+      }
+    }
+  });
+  if (UF == 1)
+    return;
+
+  UnrollState Unroller(Plan, UF, Ctx);
+
+  Unroller.unrollBlock(Plan.getPreheader());
+
+  // Iterate over all blocks in the plan starting from Entry, and unroll
+  // recipes inside them. This includes the vector preheader and middle blocks,
+  // which may set up or post-process per-part values.
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBlockBase *VPB : RPOT)
+    Unroller.unrollBlock(VPB);
+
+  unsigned Part = 1;
+  // Remap operands of cloned header phis to update backedge values. The header
+  // phis cloned during unrolling are just after the header phi for part 0.
+  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
+  for (VPRecipeBase &H :
+       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+    // The second operand of fixed-order recurrence phis, feeding the spliced
+    // value across the backedge, needs to be remapped to the last part of the
+    // spliced value.
+    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
+      Unroller.remapOperand(&H, 1, UF - 1);
+      continue;
+    }
+    if (Unroller.contains(H.getVPSingleValue()) ||
+        isa<VPWidenPointerInductionRecipe>(&H)) {
+      Part = 1;
+      continue;
+    }
+    Unroller.remapOperands(&H, Part);
+    Part++;
+  }
+
+  // Remap the operand of live-outs to the last part.
+  for (const auto &[_, LO] : Plan.getLiveOuts()) {
+    VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1);
+    LO->setOperand(0, In);
+  }
+}
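
A sketch of the intended call site (presumably in
LoopVectorizationPlanner::executePlan, per the diff further below; BestUF
names the interleave count chosen by the planner):

  // Explicitly unroll the plan by the selected UF before executing it;
  // for UF == 1 the transform only records the UF on the plan.
  VPlanTransforms::unrollByUF(BestVPlan, BestUF,
                              OrigLoop->getHeader()->getContext());
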

>From 2db534096f9de1657678bc28cc390146a5ec7248 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 21 Sep 2024 13:42:43 +0100
Subject: [PATCH 15/15] !fixup test updates

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   2 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   4 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   7 +-
 .../AArch64/sve-inductions-unusual-types.ll   |  14 +-
 ...ectorize-force-tail-with-evl-interleave.ll | 129 +++++++++++-------
 .../LoopVectorize/first-order-recurrence.ll   |  22 +--
 6 files changed, 104 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ba78f1bdbaa365..b5c8763b4f66ad 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7629,7 +7629,7 @@ LoopVectorizationPlanner::executePlan(
   if (MiddleTerm->isConditional() &&
       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
     // Assume that `Count % VectorTripCount` is equally distributed.
-    unsigned TripCount = State.UF * State.VF.getKnownMinValue();
+    unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
     assert(TripCount > 0 && "trip count should not be zero");
     const uint32_t Weights[] = {1, TripCount - 1};
     setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
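
As a worked instance of the updated weight computation (illustrative numbers,
not taken from a test): with BestVPlan.getUF() == 2 and a fixed VF of 4,
TripCount is 8, so the middle-block terminator gets relative weights {1, 7}
for its two successors.

  // Illustrative values only:
  unsigned TripCount = 2 * 4;                    // getUF() * VF = 8
  const uint32_t Weights[] = {1, TripCount - 1}; // {1, 7}
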
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index bfbc90e3d5ca30..b0a637012037d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -391,6 +391,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
           ->shouldEmitDebugInfoForProfiling() &&
       !EnableFSDiscriminator) {
     // FIXME: For scalable vectors, assume vscale=1.
+    unsigned UF = Plan->getUF();
     auto NewDIL =
         DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
     if (NewDIL)
@@ -1018,6 +1019,9 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  // Set UF to 1, as the unrollByUF VPlan transform already explicitly unrolled
+  // the VPlan.
+  // TODO: Remove State::UF and all uses.
   State->UF = 1;
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7246058adc0f81..255b8b03ae0db9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2928,8 +2928,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
   PHINode *NewPointerPhi = nullptr;
   if (CurrentPart == 0) {
-    NewPointerPhi =
-        PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
+    NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
+                                    CanonicalIV->getIterator());
     NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
   } else {
     // The recipe has been unrolled. In that case, fetch the single pointer phi
@@ -3138,8 +3138,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
     } else {
       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
       Builder.SetInsertPoint(VectorPH->getTerminator());
-      StartV = Iden =
-          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
+      StartV = Iden = State.get(StartVPV, 0);
     }
   } else {
     Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 6a4ff7f6003e05..2da2269023e27f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -20,15 +20,14 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i7> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i7> [[TMP8]], shufflevector (<vscale x 2 x i7> insertelement (<vscale x 2 x i7> poison, i7 1, i64 0), <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i7> zeroinitializer, [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP40]] to i7
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i7 1, [[TMP11]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i7> poison, i7 [[TMP12]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i7> [[DOTSPLATINSERT]], <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -94,15 +93,14 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i3> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i3> [[TMP8]], shufflevector (<vscale x 2 x i3> insertelement (<vscale x 2 x i3> poison, i3 1, i64 0), <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i3> zeroinitializer, [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP40]] to i3
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i3 1, [[TMP11]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i3> poison, i3 [[TMP12]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i3> [[DOTSPLATINSERT]], <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 57572d7cd1515a..1f68b4a8fdde0e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -13,65 +13,65 @@
 define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-LABEL: @interleave(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP31]], 8
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; IF-EVL-NEXT:    [[TMP32:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; IF-EVL-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP10]], 2
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i64> [[TMP11]], zeroinitializer
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul <vscale x 4 x i64> [[TMP12]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
-; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP101]], i64 0
-; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i64> [[TMP10]], zeroinitializer
+; IF-EVL-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP11]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; IF-EVL-NEXT:    [[TMP38:%.*]] = add i64 [[TMP19]], 0
-; IF-EVL-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP38]], 1
-; IF-EVL-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
-; IF-EVL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT:    [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
-; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; IF-EVL-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; IF-EVL-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 4
-; IF-EVL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
-; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
-; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP101]]
-; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; IF-EVL-NEXT:    [[TMP16:%.*]] = add i64 [[TMP15]], 0
+; IF-EVL-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 1
+; IF-EVL-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[TMP25:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT:    [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
+; IF-EVL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]]
+; IF-EVL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
+; IF-EVL-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 4
+; IF-EVL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP31]]
+; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP25]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP19]])
+; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP20]])
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
@@ -80,10 +80,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL:       for.body:
 ; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; IF-EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; IF-EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]]
 ; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -108,6 +108,35 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; NO-VP-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; NO-VP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; NO-VP-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP6]], i32 0
+; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0
+; NO-VP-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; NO-VP-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; NO-VP-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; NO-VP-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-VP-NEXT:    [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP17]], align 4
+; NO-VP-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; NO-VP-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
+; NO-VP-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; NO-VP-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP15]]
+; NO-VP-NEXT:    [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
+; NO-VP-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; NO-VP-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
+; NO-VP-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; NO-VP-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
+; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
+; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; NO-VP-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NO-VP-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; NO-VP:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index ed23a0c38e7378..79979790716fea 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -2892,7 +2892,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]]
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF27]]
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29:![0-9]+]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
@@ -2909,7 +2909,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30:![0-9]+]], !llvm.loop [[LOOP31:![0-9]+]]
 ;
 ; SINK-AFTER-LABEL: @sink_into_replication_region(
 ; SINK-AFTER-NEXT:  bb:
@@ -3276,10 +3276,10 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI2]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP32:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]]
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF27]]
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
@@ -3301,7 +3301,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
 ; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30]], !llvm.loop [[LOOP33:![0-9]+]]
 ;
 ; SINK-AFTER-LABEL: @sink_into_replication_region_multiple(
 ; SINK-AFTER-NEXT:  bb:
@@ -3530,7 +3530,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3550,7 +3550,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
 ; UNROLL-NO-VF-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
 ; UNROLL-NO-VF-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[FOR_LCSSA]]
@@ -3682,7 +3682,7 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-VF-NEXT:    [[TMP5]] = add i16 [[TMP3]], 5
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3696,7 +3696,7 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
 ; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
 ; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP37:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
 ; UNROLL-NO-VF-NEXT:    ret void
 ;
@@ -3790,7 +3790,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3803,7 +3803,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-VF-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-VF-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
 ; UNROLL-NO-VF:       exit:
 ; UNROLL-NO-VF-NEXT:    ret i32 0
 ;


