[llvm] [VPlan] Remove loop region in optimizeForVFAndUF. (PR #108378)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sat Jan 4 06:01:32 PST 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/108378

>From d3614bcbf77ac92b1f1dd37711392f828bd695f3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 12:20:07 +0100
Subject: [PATCH 01/24] [VPlan] Use ResumePhi to create reduction resume phis.

Use VPInstruction::ResumePhi to create phi nodes for reduction resume
values.

This allows simplifying createAndCollectMergePhiForReduction to only
collect reduction resume phis when vectorizing epilogue loops and adding
extra incoming edges from the main vector loop.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 82 +++++++++----------
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |  9 ++
 ...-order-recurrence-sink-replicate-region.ll |  2 +
 .../LoopVectorize/vplan-printing.ll           |  9 ++
 4 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index db650b23e271e2..74104304301a86 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7467,23 +7467,31 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
 }
 
 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is
-// create a merge phi node for it.
-static void createAndCollectMergePhiForReduction(
-    VPInstruction *RedResult,
-    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
-    bool VectorizingEpilogue) {
+// create a merge phi node for it and add incoming values from the main vector
+// loop.
+static void updateAndCollectMergePhiForReductionForEpilogueVectorization(
+    VPInstruction *RedResult, VPTransformState &State, Loop *OrigLoop,
+    BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue) {
   if (!RedResult ||
       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
     return;
 
+  using namespace VPlanPatternMatch;
+  VPValue *ResumePhiVPV =
+      cast<VPInstruction>(*find_if(RedResult->users(), [](VPUser *U) {
+        return match(U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(),
+                                                                  m_VPValue()));
+      }));
+  auto *BCBlockPhi = cast<PHINode>(State.get(ResumePhiVPV, true));
   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+  if (!VectorizingEpilogue)
+    return;
 
-  Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane()));
   auto *ResumePhi =
       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-  if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
-                                 RdxDesc.getRecurrenceKind())) {
+  if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+          RdxDesc.getRecurrenceKind())) {
     auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
     assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
@@ -7493,40 +7501,15 @@ static void createAndCollectMergePhiForReduction(
          "when vectorizing the epilogue loop, we need a resume phi from main "
          "vector loop");
 
-  // TODO: bc.merge.rdx should not be created here, instead it should be
-  // modeled in VPlan.
   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
-  // Create a phi node that merges control-flow from the backedge-taken check
-  // block and the middle block.
-  auto *BCBlockPhi =
-      PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
-                      LoopScalarPreHeader->getTerminator()->getIterator());
-
   // If we are fixing reductions in the epilogue loop then we should already
   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
   // we carry over the incoming values correctly.
   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
-    if (Incoming == LoopMiddleBlock)
-      BCBlockPhi->addIncoming(FinalValue, Incoming);
-    else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
-      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
-                              Incoming);
-    else
-      BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
+    if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+      BCBlockPhi->setIncomingValueForBlock(
+          Incoming, ResumePhi->getIncomingValueForBlock(Incoming));
   }
-
-  auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
-  // TODO: This fixup should instead be modeled in VPlan.
-  // Fix the scalar loop reduction variable with the incoming reduction sum
-  // from the vector body and from the backedge value.
-  int IncomingEdgeBlockIdx =
-      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
-  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
-  // Pick the other block.
-  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
-  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
-  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
 }
 
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
@@ -7617,11 +7600,12 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(
-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
-  }
+  if (IsEpilogueVectorization)
+    for (VPRecipeBase &R : *ExitVPBB) {
+      updateAndCollectMergePhiForReductionForEpilogueVectorization(
+          dyn_cast<VPInstruction>(&R), State, OrigLoop,
+          State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+    }
 
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
@@ -9411,6 +9395,22 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         });
     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
 
+    VPBasicBlock *ScalarPHVPBB = nullptr;
+    if (MiddleVPBB->getNumSuccessors() == 2) {
+      // Order is strict: first is the exit block, second is the scalar
+      // preheader.
+      ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+    } else {
+      ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
+    }
+
+    VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+    auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+        VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},
+        {}, "bc.merge.rdx");
+    auto *RedPhi = cast<PHINode>(PhiR->getUnderlyingInstr());
+    Plan->addLiveOut(RedPhi, ResumePhiRecipe);
+
     // Adjust AnyOf reductions; replace the reduction phi for the selected value
     // with a boolean reduction phi node to check if the condition is true in
     // any iteration. The final value is selected by the final
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 90c209cf3f5186..6a435709aeb2b2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-INLOOP-NEXT: No successors
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; IF-EVL-INLOOP-NEXT: }
 ;
 
@@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-OUTLOOP-NEXT: No successors
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; NO-VP-OUTLOOP-NEXT: }
 ;
 
@@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-INLOOP-NEXT: No successors
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; NO-VP-INLOOP-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 8e56614a2e3d5c..b05980bef1b38f 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
+; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 0dde507d08be74..2247295295663e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]>
 ; CHECK-NEXT:}
 
 entry:

>From 5f8fabe01a7c6ee8e9ca37e9566e52467983bff6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 12:18:39 +0100
Subject: [PATCH 02/24] [VPlan] Remove loop region in optimizeForVFAndUF.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  75 ++++---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 143 +++++++------
 llvm/lib/Transforms/Vectorize/VPlan.h         |   5 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   9 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  44 +++-
 .../LoopVectorize/AArch64/call-costs.ll       |  17 +-
 .../LoopVectorize/RISCV/low-trip-count.ll     | 192 ++++++++----------
 .../LoopVectorize/RISCV/short-trip-count.ll   |  44 ++--
 .../truncate-to-minimal-bitwidth-cost.ll      |  27 +--
 .../LoopVectorize/SystemZ/pr47665.ll          |  77 +++----
 ...demanding-all-lanes-and-first-lane-only.ll |  56 +++--
 .../LoopVectorize/X86/constant-fold.ll        |  18 +-
 .../Transforms/LoopVectorize/X86/pr34438.ll   |  26 +--
 .../LoopVectorize/first-order-recurrence.ll   |  33 +--
 .../vector-loop-backedge-elimination.ll       |   4 +-
 .../version-stride-with-integer-casts.ll      |  15 +-
 16 files changed, 359 insertions(+), 426 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 74104304301a86..a573147c41241f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2927,6 +2927,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
   for (const auto &KV : Plan.getLiveOuts())
     KV.second->fixPhi(Plan, State);
 
+  if (!isa<VPRegionBlock>(State.Plan->getEntry()->getSingleSuccessor()))
+    return;
+
   for (Instruction *PI : PredicatedInstructions)
     sinkScalarOperands(&*PI);
 
@@ -7537,7 +7540,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   LLVM_DEBUG(BestVPlan.dump());
 
   // Perform the actual loop transformation.
-  VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan);
+  VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+                         Legal->getWidestInductionType());
 
   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
   // before making any changes to the CFG.
@@ -7598,14 +7602,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.execute(&State);
 
   // 2.5 Collect reduction resume values.
-  auto *ExitVPBB =
-      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  if (IsEpilogueVectorization)
+  if (IsEpilogueVectorization) {
+    auto *ExitVPBB = cast<VPBasicBlock>(
+        BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
     for (VPRecipeBase &R : *ExitVPBB) {
       updateAndCollectMergePhiForReductionForEpilogueVectorization(
           dyn_cast<VPInstruction>(&R), State, OrigLoop,
           State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
     }
+  }
 
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
@@ -7616,24 +7621,26 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                       LLVMLoopVectorizeFollowupVectorized});
 
-  VPBasicBlock *HeaderVPBB =
-      BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
-  Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
-  if (VectorizedLoopID)
-    L->setLoopID(*VectorizedLoopID);
-  else {
-    // Keep all loop hints from the original loop on the vector loop (we'll
-    // replace the vectorizer-specific hints below).
-    if (MDNode *LID = OrigLoop->getLoopID())
-      L->setLoopID(LID);
-
-    LoopVectorizeHints Hints(L, true, *ORE);
-    Hints.setAlreadyVectorized();
+  if (auto *R =
+          dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
+    VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock();
+    Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
+    if (VectorizedLoopID)
+      L->setLoopID(*VectorizedLoopID);
+    else {
+      // Keep all loop hints from the original loop on the vector loop (we'll
+      // replace the vectorizer-specific hints below).
+      if (MDNode *LID = OrigLoop->getLoopID())
+        L->setLoopID(LID);
+
+      LoopVectorizeHints Hints(L, true, *ORE);
+      Hints.setAlreadyVectorized();
+    }
+    TargetTransformInfo::UnrollingPreferences UP;
+    TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+    if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
+      addRuntimeUnrollDisableMetaData(L);
   }
-  TargetTransformInfo::UnrollingPreferences UP;
-  TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
-  if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
-    addRuntimeUnrollDisableMetaData(L);
 
   // 3. Fix the vectorized code: take care of header phi's, live-outs,
   //    predication, updating analyses.
@@ -7642,15 +7649,20 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  auto *MiddleTerm =
-      cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
-  if (MiddleTerm->isConditional() &&
-      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-    // Assume that `Count % VectorTripCount` is equally distributed.
-    unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-    assert(TripCount > 0 && "trip count should not be zero");
-    const uint32_t Weights[] = {1, TripCount - 1};
-    setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+  if (auto *R =
+          dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
+    auto *ExitVPBB = cast<VPBasicBlock>(R->getSingleSuccessor());
+
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
   }
 
   return State.ExpandedSCEVs;
@@ -9464,7 +9476,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
       Kind, cast_if_present<BinaryOperator>(FPBinOp));
   DerivedIV->setName("offset.idx");
-  assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
+  assert((isa<Constant>(CanonicalIV) || DerivedIV != CanonicalIV) &&
+         "IV didn't need transforming?");
 
   State.set(this, DerivedIV, VPLane(0));
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5e3a6388094940..27f74e87bdf42f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -224,9 +224,10 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
 
 VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                                    DominatorTree *DT, IRBuilderBase &Builder,
-                                   InnerLoopVectorizer *ILV, VPlan *Plan)
+                                   InnerLoopVectorizer *ILV, VPlan *Plan,
+                                   Type *CanonicalIVTy)
     : VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
-      LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {}
+      LVer(nullptr), TypeAnalysis(CanonicalIVTy) {}
 
 Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
   if (Def->isLiveIn())
@@ -275,8 +276,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
     // Place the code for broadcasting invariant variables in the new preheader.
     IRBuilder<>::InsertPointGuard Guard(Builder);
     if (SafeToHoist) {
-      BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
-          Plan->getVectorLoopRegion()->getSinglePredecessor())];
+      BasicBlock *LoopVectorPreHeader =
+          CFG.VPBB2IRBB[cast<VPBasicBlock>(Plan->getEntry())];
       if (LoopVectorPreHeader)
         Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
     }
@@ -417,6 +418,12 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
                                          PrevBB->getParent(), CFG.ExitBB);
   LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
 
+  connectToPredecessors(NewBB, CFG);
+  return NewBB;
+}
+
+void VPBasicBlock::connectToPredecessors(BasicBlock *NewBB,
+                                         VPTransformState::CFGState &CFG) {
   // Hook up the new basic block to its predecessors.
   for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
     VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
@@ -447,38 +454,14 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
     }
     CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
   }
-  return NewBB;
 }
-
 void VPIRBasicBlock::execute(VPTransformState *State) {
   assert(getHierarchicalSuccessors().size() <= 2 &&
          "VPIRBasicBlock can have at most two successors at the moment!");
   State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
   executeRecipes(State, getIRBasicBlock());
-  if (getSingleSuccessor()) {
-    assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
-    auto *Br = State->Builder.CreateBr(getIRBasicBlock());
-    Br->setOperand(0, nullptr);
-    getIRBasicBlock()->getTerminator()->eraseFromParent();
-  }
-
-  for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
-    VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
-    BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB];
-    assert(PredBB && "Predecessor basic-block not found building successor.");
-    LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
 
-    auto *PredBBTerminator = PredBB->getTerminator();
-    auto *TermBr = cast<BranchInst>(PredBBTerminator);
-    // Set each forward successor here when it is created, excluding
-    // backedges. A backward successor is set when the branch is created.
-    const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
-    unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
-    assert(!TermBr->getSuccessor(idx) &&
-           "Trying to reset an existing successor block.");
-    TermBr->setSuccessor(idx, IRBB);
-    State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}});
-  }
+  connectToPredecessors(getIRBasicBlock(), State->CFG);
 }
 
 void VPBasicBlock::execute(VPTransformState *State) {
@@ -962,7 +945,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
@@ -1034,8 +1016,13 @@ void VPlan::execute(VPTransformState *State) {
   // skeleton creation, so we can only create the VPIRBasicBlocks now during
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+  VPBlockBase *Leaf = nullptr;
+  for (VPBlockBase *VPB : vp_depth_first_shallow(getEntry()))
+    if (VPB->getNumSuccessors() == 0) {
+      Leaf = VPB;
+      break;
+    }
+  VPBasicBlock *MiddleVPBB = cast<VPBasicBlock>(Leaf->getSinglePredecessor());
   // Find the VPBB for the scalar preheader, relying on the current structure
   // when creating the middle block and its successrs: if there's a single
   // predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1063,53 +1050,59 @@ void VPlan::execute(VPTransformState *State) {
   for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
     Block->execute(State);
 
-  VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
-  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
-
-  // Fix the latch value of canonical, reduction and first-order recurrences
-  // phis in the vector loop.
-  VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
-  for (VPRecipeBase &R : Header->phis()) {
-    // Skip phi-like recipes that generate their backedege values themselves.
-    if (isa<VPWidenPHIRecipe>(&R))
-      continue;
-
-    if (isa<VPWidenPointerInductionRecipe>(&R) ||
-        isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-      PHINode *Phi = nullptr;
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-        Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
-      } else {
-        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
-        assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-               "recipe generating only scalars should have been replaced");
-        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-        Phi = cast<PHINode>(GEP->getPointerOperand());
-      }
-
-      Phi->setIncomingBlock(1, VectorLatchBB);
+  if (auto *LoopRegion =
+          dyn_cast<VPRegionBlock>(getEntry()->getSingleSuccessor())) {
+    VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+    BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+
+    // Fix the latch value of canonical, reduction and first-order recurrences
+    // phis in the vector loop.
+    VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+    for (VPRecipeBase &R : Header->phis()) {
+      // Skip phi-like recipes that generate their backedege values themselves.
+      if (isa<VPWidenPHIRecipe>(&R))
+        continue;
 
-      // Move the last step to the end of the latch block. This ensures
-      // consistent placement of all induction updates.
-      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+      if (isa<VPWidenPointerInductionRecipe>(&R) ||
+          isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+        PHINode *Phi = nullptr;
+        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+          Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+        } else {
+          auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+          assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+                 "recipe generating only scalars should have been replaced");
+          auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+          Phi = cast<PHINode>(GEP->getPointerOperand());
+        }
+
+        Phi->setIncomingBlock(1, VectorLatchBB);
+
+        // Move the last step to the end of the latch block. This ensures
+        // consistent placement of all induction updates.
+        Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+        // Use the steps for the last part as backedge value for the induction.
+        if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+          Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+        continue;
+      }
 
-      // Use the steps for the last part as backedge value for the induction.
-      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
-      continue;
+      // For  canonical IV, first-order recurrences and in-order reduction phis,
+      // only a single part is generated, which provides the last part from the
+      // previous iteration. For non-ordered reductions all UF parts are
+      // generated.
+      auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+      bool NeedsScalar =
+          isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+          (isa<VPReductionPHIRecipe>(PhiR) &&
+           cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+      Value *Phi = State->get(PhiR, NeedsScalar);
+      Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
+      cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
     }
-
-    auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
-    bool NeedsScalar =
-        isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
-        (isa<VPReductionPHIRecipe>(PhiR) &&
-         cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
-    Value *Phi = State->get(PhiR, NeedsScalar);
-    Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
-    cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
   }
-
   State->CFG.DTU.flush();
   assert(State->CFG.DTU.getDomTree().verify(
              DominatorTree::VerificationLevel::Fast) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 68a62638b9d588..986092ca9b11f5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -236,7 +236,7 @@ class VPLane {
 struct VPTransformState {
   VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                    DominatorTree *DT, IRBuilderBase &Builder,
-                   InnerLoopVectorizer *ILV, VPlan *Plan);
+                   InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy);
 
   /// The chosen Vectorization Factor of the loop being vectorized.
   ElementCount VF;
@@ -3378,6 +3378,8 @@ class VPBasicBlock : public VPBlockBase {
 protected:
   /// Execute the recipes in the IR basic block \p BB.
   void executeRecipes(VPTransformState *State, BasicBlock *BB);
+  void connectToPredecessors(BasicBlock *NewBB,
+                             VPTransformState::CFGState &CFG);
 
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
@@ -3499,6 +3501,7 @@ class VPRegionBlock : public VPBlockBase {
     assert(!isReplicator() && "should only get pre-header of loop regions");
     return getSinglePredecessor()->getExitingBasicBlock();
   }
+  void clearEntry() { Entry = nullptr; }
 
   /// An indicator whether this region is to generate multiple replicated
   /// instances of output IR corresponding to its VPBlockBases.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ba94cd29587664..efc924e5b58022 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -210,8 +210,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
 
 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
   VPValue *ExitValue = getOperand(0);
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+  auto *Region = dyn_cast<VPRegionBlock>(Plan.getEntry()->getSingleSuccessor());
+  VPBasicBlock *MiddleVPBB = dyn_cast_or_null<VPBasicBlock>(
+      Region ? Region->getSingleSuccessor() : nullptr);
   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
   auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
   // Values leaving the vector loop reach live out phi's in the exiting block
@@ -2208,7 +2209,9 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   // Replace the temporary unreachable terminator with a new conditional branch,
   // whose two destinations will be set later when they are created.
   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
-  assert(isa<UnreachableInst>(CurrentTerminator) &&
+  assert((isa<UnreachableInst>(CurrentTerminator) ||
+          (isa<BranchInst>(CurrentTerminator) &&
+           !CurrentTerminator->getOperand(0))) &&
          "Expected to replace unreachable terminator with conditional branch.");
   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
   CondBr->setSuccessor(0, nullptr);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 379bfc0a4394bf..f2db068e046ca5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -696,16 +696,46 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
       !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
     return;
 
-  LLVMContext &Ctx = SE.getContext();
-  auto *BOC =
-      new VPInstruction(VPInstruction::BranchOnCond,
-                        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
-
   SmallVector<VPValue *> PossiblyDead(Term->operands());
   Term->eraseFromParent();
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  if (all_of(Header->phis(), [](VPRecipeBase &R) {
+        return !isa<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>(&R);
+      })) {
+    for (VPRecipeBase &R : make_early_inc_range(Header->phis())) {
+      auto *P = cast<VPHeaderPHIRecipe>(&R);
+      P->replaceAllUsesWith(P->getStartValue());
+      P->eraseFromParent();
+    }
+
+    VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor();
+    auto HeaderSuccs = to_vector(Header->getSuccessors());
+    VPBasicBlock *Exiting =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+
+    auto *LoopRegion = Plan.getVectorLoopRegion();
+    VPBlockBase *Middle = LoopRegion->getSingleSuccessor();
+    VPBlockUtils::disconnectBlocks(Preheader, LoopRegion);
+    VPBlockUtils::disconnectBlocks(LoopRegion, Middle);
+
+    Header->setParent(nullptr);
+    Exiting->setParent(nullptr);
+    VPBlockUtils::connectBlocks(Preheader, Header);
+
+    VPBlockUtils::connectBlocks(Exiting, Middle);
+    LoopRegion->clearEntry();
+    delete LoopRegion;
+  } else {
+    LLVMContext &Ctx = SE.getContext();
+    auto *BOC =
+        new VPInstruction(VPInstruction::BranchOnCond,
+                          {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
+
+    ExitingVPBB->appendRecipe(BOC);
+  }
   for (VPValue *Op : PossiblyDead)
     recursivelyDeleteDeadRecipes(Op);
-  ExitingVPBB->appendRecipe(BOC);
   Plan.setVF(BestVF);
   Plan.setUF(BestUF);
   // TODO: Further simplifications are possible
@@ -714,7 +744,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 }
 
 /// Sink users of \p FOR after the recipe defining the previous value \p
-/// Previous of the recurrence. \returns true if all users of \p FOR could be
+// Previous of the recurrence. \returns true if all users of \p FOR could be
 /// re-arranged as needed or false if it is not possible.
 static bool
 sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index dcb8ba73661667..47886b28439ddb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -78,17 +78,13 @@ define void @powi_call(ptr %P) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3)
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -102,7 +98,7 @@ define void @powi_call(ptr %P) {
 ; CHECK-NEXT:    store double [[POWI]], ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -233,6 +229,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index ec50b0cac03829..516c5d0f082746 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -49,28 +49,24 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP6]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i64 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i64 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 2 x i8> [[TMP7]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP10]], ptr [[TMP11]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -79,15 +75,15 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP12]], 1
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -118,28 +114,24 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i8> [[TMP7]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP10]], ptr [[TMP11]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -148,15 +140,15 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP12]], 1
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -187,28 +179,24 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i8> [[TMP7]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP10]], ptr [[TMP11]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -217,15 +205,15 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP12]], 1
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -254,21 +242,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -285,7 +269,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -315,21 +299,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -346,7 +326,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -390,7 +370,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -407,7 +387,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index bb716d78ca4119..71244694ae2996 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -12,18 +12,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP4]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -37,7 +33,7 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -71,18 +67,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -96,7 +88,7 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index fa13cd8f19ae56..6eface402f6df6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -163,20 +163,12 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i1> <i1 true, i1 true>, [[TMP0]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2)
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2)
-; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -194,7 +186,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[ADD]] = add i8 [[F_039]], 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[F_039]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], 1
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -293,7 +285,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -314,7 +306,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[V]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -354,8 +346,7 @@ attributes #1 = { "target-features"="+64bit,+v" }
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index 5e8f287217478b..11344c7ff00bbe 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -17,127 +17,105 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <16 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP10]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    store i1 [[TMP9]], ptr [[P]], align 1
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP12]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK:       pred.store.if5:
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP14]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; CHECK:       pred.store.if7:
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP16]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; CHECK:       pred.store.if9:
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP18]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
 ; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP20]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
 ; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP22]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
 ; CHECK:       pred.store.if15:
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP24]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
 ; CHECK:       pred.store.continue16:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; CHECK-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP26]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP28]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
 ; CHECK:       pred.store.if21:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP30]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
 ; CHECK:       pred.store.if23:
 ; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP32]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
 ; CHECK:       pred.store.continue24:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
 ; CHECK:       pred.store.if25:
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP34]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
 ; CHECK:       pred.store.continue26:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
 ; CHECK:       pred.store.if27:
 ; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP36]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
 ; CHECK:       pred.store.continue28:
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; CHECK-NEXT:    br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
 ; CHECK:       pred.store.if29:
 ; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP38]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.if31:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    store i1 [[TMP40]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.continue32:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE32:%.*]]
+; CHECK:       for.body.0:
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -156,7 +134,7 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    store i1 [[ICMP_SGT]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -183,7 +161,6 @@ exit:                                             ; preds = %for.body
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fcf1ba072a62cb..fa39fc262ef5a5 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -14,32 +14,22 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw i64 0, 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
@@ -52,26 +42,27 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
 ; CHECK:       [[PRED_STORE_IF1]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP27]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
 ; CHECK:       [[PRED_STORE_IF3]]:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP29]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK:       [[PRED_STORE_IF5]]:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP31]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK:       [[PRED_STORE_CONTINUE6]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label %[[THEN_0:.*]]
+; CHECK:       [[THEN_0]]:
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -93,7 +84,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -125,7 +116,6 @@ exit:
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 61cae9c1b3f5dc..1e6e38632701ff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -15,17 +15,11 @@ define void @f1() {
 ; CHECK-NEXT:  bb1:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i16 0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -40,7 +34,7 @@ define void @f1() {
 ; CHECK-NEXT:    store ptr [[_TMP2]], ptr [[_TMP7]], align 8
 ; CHECK-NEXT:    [[_TMP9]] = add nsw i16 [[C_1_0]], 1
 ; CHECK-NEXT:    [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index cc60359af2f8c5..47662d155b5453 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -14,20 +14,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -43,7 +39,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index c4e3e0b8c5a369..cd91eb9757c0a9 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -3705,15 +3705,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT:  entry:
 ; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-IC:       vector.ph:
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
@@ -3726,7 +3719,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-IC-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-IC-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC:       exit:
 ; UNROLL-NO-IC-NEXT:    ret i32 0
 ;
@@ -3734,13 +3727,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:  entry:
 ; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-VF:       vector.ph:
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3753,7 +3741,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-VF-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-VF-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-VF:       exit:
 ; UNROLL-NO-VF-NEXT:    ret i32 0
 ;
@@ -3761,15 +3749,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT:  entry:
 ; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SINK-AFTER:       vector.ph:
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; SINK-AFTER-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; SINK-AFTER-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
@@ -3782,7 +3763,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; SINK-AFTER-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; SINK-AFTER-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; SINK-AFTER:       exit:
 ; SINK-AFTER-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fd75177c0d1062..c144cf0ca5c52f 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -11,9 +11,9 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; VF8UF1:       [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec
 ; VF8UF1-NEXT:  br i1 [[CMP]], label %middle.block, label %vector.body
 ;
-; VF8UF2:       br i1 true, label %middle.block, label %vector.body
+; VF8UF2:       br label %middle.block
 ;
-; VF16UF1:      br i1 true, label %middle.block, label %vector.body
+; VF16UF1:      br label %middle.block
 ;
 entry:
   %and = and i64 %N, 15
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index b3ec3e8f0f3c63..010a7f3b73a215 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -498,18 +498,14 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[G_64]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
 ; CHECK-NEXT:    store <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -522,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    store i16 [[G_16]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -559,6 +555,5 @@ exit:
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]}
 ;.

>From f0421c685f34b6b11234320a3eb41b7e8c618214 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 4 Nov 2024 13:19:14 +0000
Subject: [PATCH 03/24] !fixup after merge

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 30 ++++------------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  4 +--
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |  1 +
 .../vplan-printing-before-execute.ll          | 35 ++++++++-----------
 4 files changed, 23 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a1ba3952620cae..8ef5d0253218a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -448,8 +448,10 @@ void VPBasicBlock::connectToPredecessors(BasicBlock *NewBB,
       // Set each forward successor here when it is created, excluding
       // backedges. A backward successor is set when the branch is created.
       unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
-      assert(!TermBr->getSuccessor(idx) &&
-             "Trying to reset an existing successor block.");
+      assert(
+          (!TermBr->getSuccessor(idx) ||
+           (isa<VPIRBasicBlock>(this) && TermBr->getSuccessor(idx) == NewBB)) &&
+          "Trying to reset an existing successor block.");
       TermBr->setSuccessor(idx, NewBB);
     }
     CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
@@ -472,23 +474,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
         "other blocks must be terminated by a branch");
   }
 
-  for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
-    VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
-    BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB];
-    assert(PredBB && "Predecessor basic-block not found building successor.");
-    LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
-
-    auto *PredBBTerminator = PredBB->getTerminator();
-    auto *TermBr = cast<BranchInst>(PredBBTerminator);
-    // Set each forward successor here when it is created, excluding
-    // backedges. A backward successor is set when the branch is created.
-    const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
-    unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
-    assert((!TermBr->getSuccessor(idx) || TermBr->getSuccessor(idx) == IRBB) &&
-           "Trying to reset an existing successor block.");
-    TermBr->setSuccessor(idx, IRBB);
-    State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}});
-  }
+  connectToPredecessors(IRBB, State->CFG);
 }
 
 void VPBasicBlock::execute(VPTransformState *State) {
@@ -1039,11 +1025,9 @@ void VPlan::execute(VPTransformState *State) {
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
   BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
+  VPBasicBlock *MiddleVPBB = getMiddleBlock();
   replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh);
-  if (getVectorLoopRegion()) {
-    VPBasicBlock *MiddleVPBB = getMiddleBlock();
-    replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
-  }
+  replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
 
   // Disconnect the middle block from its single successor (the scalar loop
   // header) in both the CFG and DT. The branch will be recreated during VPlan
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bfd6b4f375946e..6f7f73a920a4b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3874,9 +3874,7 @@ class VPlan {
   }
 
   /// Returns the preheader of the vector loop region.
-  VPBasicBlock *getVectorPreheader() {
-    return cast<VPBasicBlock>(getVectorLoopRegion()->getSinglePredecessor());
-  }
+  VPBasicBlock *getVectorPreheader() { return cast<VPBasicBlock>(getEntry()); }
 
   /// Returns the canonical induction recipe of the vector loop.
   VPCanonicalIVPHIRecipe *getCanonicalIV() {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index ca745d8b98ceec..77a9d105c85f3f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -72,6 +72,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
 ; IF-EVL-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
+; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-NEXT: }
 ;
 
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index c9612ced3eee01..3d1947a38acc71 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -57,7 +57,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ;
 ; CHECK: Executing best plan with VF=8, UF=2
 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' {
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
@@ -67,27 +66,21 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:     vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
-; CHECK-NEXT:     vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR1]]>
-; CHECK-NEXT:     WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
-; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
-; CHECK-NEXT:     WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
-; CHECK-NEXT:     vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR3]]>, ir<%add>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR4]]>, ir<%add>.1
-; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:     EMIT branch-on-cond ir<true>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
+; CHECK-NEXT:   vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT:   WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
+; CHECK-NEXT:   WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT:   WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
+; CHECK-NEXT:   vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR4]]>, ir<%add>.1
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:

>From a4843b5e91d6da59a9e942db320a2f92ebe99438 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 4 Nov 2024 19:17:13 +0000
Subject: [PATCH 04/24] !fixup address latest comments, thanks!

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp   | 4 ++--
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 +++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 934564fb75b04c..554fd57a719d89 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7729,9 +7729,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
           dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
     VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock();
     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
-    if (VectorizedLoopID)
+    if (VectorizedLoopID) {
       L->setLoopID(*VectorizedLoopID);
-    else {
+    } else {
       // Keep all loop hints from the original loop on the vector loop (we'll
       // replace the vectorizer-specific hints below).
       if (MDNode *LID = OrigLoop->getLoopID())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2869330078ae42..53ec6a4829e178 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -694,8 +694,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 
   SmallVector<VPValue *> PossiblyDead(Term->operands());
   Term->eraseFromParent();
-  VPBasicBlock *Header =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  auto *Header = cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
   if (all_of(Header->phis(), [](VPRecipeBase &R) {
         return !isa<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>(&R);
       })) {
@@ -706,7 +705,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
     }
 
     VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor();
-    VPBasicBlock *Exiting =
+    auto *Exiting =
         cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
 
     auto *LoopRegion = Plan.getVectorLoopRegion();
@@ -741,7 +740,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 }
 
 /// Sink users of \p FOR after the recipe defining the previous value \p
-// Previous of the recurrence. \returns true if all users of \p FOR could be
+/// Previous of the recurrence. \returns true if all users of \p FOR could be
 /// re-arranged as needed or false if it is not possible.
 static bool
 sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,

>From 706b68134f86fe8381d8244544dedefbecaea65a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 24 Nov 2024 08:05:03 +0000
Subject: [PATCH 05/24] !fixup address latest comments, thanks!

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  8 +--
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 18 +++--
 .../LoopVectorize/RISCV/low-trip-count.ll     | 72 ++++++++-----------
 .../LoopVectorize/RISCV/short-trip-count.ll   | 40 +++++------
 .../debugloc-optimize-vfuf-term.ll            | 43 +++++------
 5 files changed, 77 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8b228520cea51a..70c2f5f320af97 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1054,8 +1054,8 @@ void VPlan::execute(VPTransformState *State) {
       if (isa<VPWidenPHIRecipe>(&R))
         continue;
 
-      if (isa<VPWidenPointerInductionRecipe>(&R) ||
-          isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+      if (isa<VPWidenPointerInductionRecipe, VPWidenIntOrFpInductionRecipe>(
+              &R)) {
         PHINode *Phi = nullptr;
         if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
           Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
@@ -1085,10 +1085,10 @@ void VPlan::execute(VPTransformState *State) {
       // previous iteration. For non-ordered reductions all UF parts are
       // generated.
       auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+      auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(PhiR);
       bool NeedsScalar =
           isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
-          (isa<VPReductionPHIRecipe>(PhiR) &&
-           cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+          (RedPhiR && RedPhiR->isInLoop());
       Value *Phi = State->get(PhiR, NeedsScalar);
       Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
       cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 02f6fc8e907c0b..5a64f6318a6543 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -693,9 +693,14 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   SmallVector<VPValue *> PossiblyDead(Term->operands());
   Term->eraseFromParent();
   auto *Header = cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
-  if (all_of(Header->phis(), [](VPRecipeBase &R) {
-        return !isa<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>(&R);
-      })) {
+  if (any_of(Header->phis(),
+             IsaPred<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>)) {
+    LLVMContext &Ctx = SE.getContext();
+    auto *BOC = new VPInstruction(
+        VPInstruction::BranchOnCond,
+        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
+    ExitingVPBB->appendRecipe(BOC);
+  } else {
     for (VPRecipeBase &R : make_early_inc_range(Header->phis())) {
       auto *P = cast<VPHeaderPHIRecipe>(&R);
       P->replaceAllUsesWith(P->getStartValue());
@@ -720,13 +725,6 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
     // be deleted when the region is deleted.
     LoopRegion->clearEntry();
     delete LoopRegion;
-  } else {
-    LLVMContext &Ctx = SE.getContext();
-    auto *BOC =
-        new VPInstruction(VPInstruction::BranchOnCond,
-                          {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
-
-    ExitingVPBB->appendRecipe(BOC);
   }
   for (VPValue *Op : PossiblyDead)
     recursivelyDeleteDeadRecipes(Op);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index f8937e47cd0dff..a6da9ec9d0452f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -56,21 +56,17 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP6]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i64 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -78,12 +74,12 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP12]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
@@ -125,21 +121,17 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -147,12 +139,12 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP12]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
@@ -194,21 +186,17 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -216,12 +204,12 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP12]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 692569bb2c8603..8212d28f8aedd5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -12,18 +12,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -31,7 +27,7 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
@@ -71,18 +67,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -90,7 +82,7 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
index 04ce9562c04b57..22d1c61992b3ff 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
@@ -9,24 +9,20 @@ define i32 @foo(ptr %p) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG3:![0-9]+]]
-; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG7:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG3]]
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG3]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG7:![0-9]+]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG3]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG8:![0-9]+]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG3]]
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG12:![0-9]+]]
-; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG7]]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG14:![0-9]+]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG8]]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG7]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 0
 ;
@@ -63,17 +59,16 @@ exit:                              ; preds = %loop
 ;.
 ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
 ; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
-; CHECK: [[DBG3]] = !DILocation(line: 4, scope: [[META4:![0-9]+]])
+; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]])
 ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
 ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
 ; CHECK: [[META6]] = !{}
-; CHECK: [[DBG7]] = !DILocation(line: 6, scope: [[META4]])
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]}
-; CHECK: [[META9]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META10]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]])
-; CHECK: [[DBG12]] = !DILocation(line: 5, scope: [[META4]])
-; CHECK: [[DBG13]] = !DILocation(line: 7, scope: [[META4]])
-; CHECK: [[DBG14]] = !DILocation(line: 8, scope: [[META4]])
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META9]]}
+; CHECK: [[DBG7]] = !DILocation(line: 9, scope: [[META4]])
+; CHECK: [[DBG8]] = !DILocation(line: 4, scope: [[META4]])
+; CHECK: [[DBG9]] = !DILocation(line: 5, scope: [[META4]])
+; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]])
+; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]])
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]}
+; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.

>From 71436fc905be3131651bca1a069370261a396c72 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 24 Nov 2024 11:09:08 +0000
Subject: [PATCH 06/24] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 70c2f5f320af97..e3e1cc85636e02 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -222,7 +222,8 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
 VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
                                    ElementCount VF, unsigned UF, LoopInfo *LI,
                                    DominatorTree *DT, IRBuilderBase &Builder,
-                                   InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy)
+                                   InnerLoopVectorizer *ILV, VPlan *Plan,
+                                   Type *CanonicalIVTy)
     : TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
       LVer(nullptr), TypeAnalysis(CanonicalIVTy) {}
 

>From cc43362acee766a668c86c60142641e753522c70 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 7 Dec 2024 15:33:15 +0000
Subject: [PATCH 07/24] !fixup update after merge.

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 24 ++++---------------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  7 ++++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  2 --
 .../LoopVectorize/AArch64/call-costs.ll       |  2 ++
 .../LoopVectorize/RISCV/low-trip-count.ll     | 23 ++++++++++++------
 .../LoopVectorize/RISCV/short-trip-count.ll   | 12 ++++++----
 .../truncate-to-minimal-bitwidth-cost.ll      |  2 ++
 .../LoopVectorize/SystemZ/pr47665.ll          |  4 ++--
 ...demanding-all-lanes-and-first-lane-only.ll |  8 +++----
 .../LoopVectorize/X86/constant-fold.ll        |  2 ++
 .../Transforms/LoopVectorize/X86/pr34438.ll   |  2 ++
 .../debugloc-optimize-vfuf-term.ll            | 20 +++++++++-------
 .../LoopVectorize/first-order-recurrence.ll   |  6 +++++
 .../version-stride-with-integer-casts.ll      |  2 ++
 14 files changed, 66 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 48733ff9124f08..2d5c05309fb248 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -486,7 +486,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
   };
 
   // 1. Create an IR basic block.
-  if (this == getPlan()->getVectorPreheader() ||
+  if (this == getPlan()->getEntry() ||
       (Replica && this == getParent()->getEntry()) ||
       IsReplicateRegion(getSingleHierarchicalPredecessor())) {
     // Reuse the previous basic block if the current VPBB is either
@@ -1078,30 +1078,14 @@ void VPlan::execute(VPTransformState *State) {
         continue;
       }
 
-      // For  canonical IV, first-order recurrences and in-order reduction phis,
-      // only a single part is generated, which provides the last part from the
-      // previous iteration. For non-ordered reductions all UF parts are
-      // generated.
       auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
-      auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(PhiR);
-      bool NeedsScalar =
-          isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
-          (RedPhiR && RedPhiR->isInLoop());
+      bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||
+                         (isa<VPReductionPHIRecipe>(PhiR) &&
+                          cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
       Value *Phi = State->get(PhiR, NeedsScalar);
       Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
       cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
     }
-/*<<<<<<< HEAD*/
-/*=======*/
-
-    /*auto *PhiR = cast<VPHeaderPHIRecipe>(&R);*/
-    /*bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||*/
-                       /*(isa<VPReductionPHIRecipe>(PhiR) &&*/
-                        /*cast<VPReductionPHIRecipe>(PhiR)->isInLoop());*/
-    /*Value *Phi = State->get(PhiR, NeedsScalar);*/
-    /*Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);*/
-    /*cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);*/
-/*>>>>>>> origin/main*/
   }
   State->CFG.DTU.flush();
   assert(State->CFG.DTU.getDomTree().verify(
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1ffd198a129ef4..b57054d8227a85 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3872,7 +3872,10 @@ class VPlan {
 
   /// Returns the preheader of the vector loop region.
   VPBasicBlock *getVectorPreheader() {
-    return dyn_cast<VPBasicBlock>(getVectorLoopRegion()->getSinglePredecessor());
+    auto *LoopRegion = getVectorLoopRegion();
+    if (!LoopRegion)
+      return nullptr;
+    return dyn_cast<VPBasicBlock>(LoopRegion->getSinglePredecessor());
   }
 
   /// Returns the VPRegionBlock of the vector loop.
@@ -3880,7 +3883,7 @@ class VPlan {
     return dyn_cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
   }
   const VPRegionBlock *getVectorLoopRegion() const {
-    return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
+    return dyn_cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
   }
 
   /// Returns the 'middle' block of the plan, that is the block that selects
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fa3b9605de266a..b1e2f26d1db65e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1849,8 +1849,6 @@ void VPlanTransforms::createInterleaveGroups(
 }
 
 void VPlanTransforms::prepareToExecute(VPlan &Plan) {
-  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
-      Plan.getVectorLoopRegion());
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index a01f1792812234..e63155b024c43a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -78,6 +78,8 @@ define void @powi_call(ptr %P) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 334aec6217681f..10ac870c112aef 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -55,6 +55,8 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
@@ -64,9 +66,9 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -120,6 +122,8 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
@@ -129,9 +133,8 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -186,6 +189,8 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
@@ -195,8 +200,8 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -243,6 +248,8 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
@@ -300,6 +307,8 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 616b7c9af5b570..3386a7d3972aa2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -12,13 +12,15 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4)
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -67,13 +69,15 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 9556c3653f37d2..3dd5e70e8d5b14 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -163,6 +163,8 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index f7de8a6fd2be59..21063e32e81b69 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -17,6 +17,8 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <16 x i1> [[TMP6]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
@@ -113,8 +115,6 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    store i1 [[TMP38]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE32:%.*]]
-; CHECK:       for.body.0:
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fa39fc262ef5a5..c912e18e74ecf8 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -14,6 +14,8 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 0, 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i64 0, 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw i64 0, 4
@@ -23,11 +25,11 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
@@ -60,8 +62,6 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    store i32 [[TMP31]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK:       [[PRED_STORE_CONTINUE6]]:
-; CHECK-NEXT:    br label %[[THEN_0:.*]]
-; CHECK:       [[THEN_0]]:
 ; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 1e6e38632701ff..83e2f84814add5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -15,6 +15,8 @@ define void @f1() {
 ; CHECK-NEXT:  bb1:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i16 0 to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index 47662d155b5453..7816c4918761f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -14,6 +14,8 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
index 22d1c61992b3ff..0f34f6243f155a 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
@@ -9,20 +9,22 @@ define i32 @foo(ptr %p) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]]
-; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]], !dbg [[DBG7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG7:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG8:![0-9]+]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG9:![0-9]+]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG8]]
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG9]]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG7]]
 ; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG7]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG8]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 0
 ;
@@ -63,9 +65,9 @@ exit:                              ; preds = %loop
 ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
 ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
 ; CHECK: [[META6]] = !{}
-; CHECK: [[DBG7]] = !DILocation(line: 9, scope: [[META4]])
-; CHECK: [[DBG8]] = !DILocation(line: 4, scope: [[META4]])
-; CHECK: [[DBG9]] = !DILocation(line: 5, scope: [[META4]])
+; CHECK: [[DBG7]] = !DILocation(line: 5, scope: [[META4]])
+; CHECK: [[DBG8]] = !DILocation(line: 9, scope: [[META4]])
+; CHECK: [[DBG9]] = !DILocation(line: 4, scope: [[META4]])
 ; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]])
 ; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]])
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]}
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 4d8f2b8e21dd7f..2da5daef61562c 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -3705,6 +3705,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT:  entry:
 ; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; UNROLL-NO-IC-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-IC:       middle.block:
@@ -3727,6 +3729,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:  entry:
 ; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; UNROLL-NO-VF:       vector.ph:
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; UNROLL-NO-VF-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-VF:       middle.block:
@@ -3749,6 +3753,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT:  entry:
 ; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SINK-AFTER:       vector.ph:
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
 ; SINK-AFTER-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; SINK-AFTER:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 69c541ba6bbbb9..1137456779eec4 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -497,6 +497,8 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[G_64]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 0, [[G_64]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP8]]

>From e758945a0b4f36c8f5d3e6c2039ed295587f85b4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 7 Dec 2024 15:39:18 +0000
Subject: [PATCH 08/24] !fixup remove unrelated changes.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp         | 2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h           | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d5beb47a29b265..67660313a45763 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2999,7 +2999,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
                    getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
   }
 
-  if (!isa<VPRegionBlock>(State.Plan->getEntry()->getSingleSuccessor()))
+  if (!State.Plan->getVectorLoopRegion())
     return;
 
   for (Instruction *PI : PredicatedInstructions)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 2d5c05309fb248..479604e616e69c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1009,8 +1009,8 @@ void VPlan::execute(VPTransformState *State) {
   // skeleton creation, so we can only create the VPIRBasicBlocks now during
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
-  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
   VPBasicBlock *MiddleVPBB = getMiddleBlock();
+  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
   replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh);
   replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b57054d8227a85..150e398da3aed6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3592,8 +3592,6 @@ class VPBasicBlock : public VPBlockBase {
 protected:
   /// Execute the recipes in the IR basic block \p BB.
   void executeRecipes(VPTransformState *State, BasicBlock *BB);
-  void connectToPredecessors(BasicBlock *NewBB,
-                             VPTransformState::CFGState &CFG);
 
   /// Connect the VPBBs predecessors' in the VPlan CFG to the IR basic block
   /// generated for this VPBB.

>From 1d4b2e64f42e572fc73e1d20753ab68b04c58b14 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Dec 2024 21:26:13 +0000
Subject: [PATCH 09/24] !fixup partial fixup after merging main.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |  8 ++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp         | 17 +++++++++++++++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bb0cab961fa0bb..53d3d593412a73 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2471,6 +2471,8 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
 /// scalar preheader.
 static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) {
   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
+  // FIXME: Cannot get the vector preheader at the moment if the vector loop
+  // region has been removed.
   VPBlockBase *VectorPH = Plan.getVectorPreheader();
   VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
   if (PreVectorPH->getNumSuccessors() != 1) {
@@ -7862,8 +7864,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                       LLVMLoopVectorizeFollowupVectorized});
 
-  if (auto *R =
-          dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
+  if (auto *R = BestVPlan.getVectorLoopRegion()) {
     VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock();
     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
     if (VectorizedLoopID) {
@@ -7890,8 +7891,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  if (auto *R =
-          dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
+  if (auto *R = BestVPlan.getVectorLoopRegion()) {
     auto *ExitVPBB = cast<VPBasicBlock>(R->getSingleSuccessor());
 
     auto *MiddleTerm =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 52f77a4598cc52..11a71820c22a26 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1022,8 +1022,7 @@ void VPlan::execute(VPTransformState *State) {
   for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
-  if (auto *LoopRegion =
-          dyn_cast<VPRegionBlock>(getEntry()->getSingleSuccessor())) {
+  if (auto *LoopRegion = getVectorLoopRegion()) {
     VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
     BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
@@ -1047,6 +1046,19 @@ void VPlan::execute(VPTransformState *State) {
         Phi = cast<PHINode>(GEP->getPointerOperand());
       }
 
+      Phi->setIncomingBlock(1, VectorLatchBB);
+
+      // Move the last step to the end of the latch block. This ensures
+      // consistent placement of all induction updates.
+      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+      // Use the steps for the last part as backedge value for the induction.
+      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+      continue;
+    }
+
       auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
       bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||
                          (isa<VPReductionPHIRecipe>(PhiR) &&
@@ -1056,6 +1068,7 @@ void VPlan::execute(VPTransformState *State) {
       cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
     }
   }
+
   State->CFG.DTU.flush();
 }
 

>From 0c76e9d4954ed542279486320330fc51b9253461 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 23 Dec 2024 21:21:03 +0000
Subject: [PATCH 10/24] !fixup update on top of current main, iterate to find
 vector.ph

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 12 ++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 29 ++++++++++++------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  8 ++---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  5 ++++
 .../truncate-to-minimal-bitwidth-cost.ll      |  4 +--
 .../LoopVectorize/SystemZ/pr47665.ll          | 30 +++++++++----------
 ...demanding-all-lanes-and-first-lane-only.ll | 30 ++++++++++---------
 .../vplan-printing-before-execute.ll          |  4 +--
 8 files changed, 68 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 11fbb49d4dd6cb..e57eb58eaa040b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2406,12 +2406,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
   // End if-block.
   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
-  assert((Parent || all_of(RepRecipe->operands(),
-                           [](VPValue *Op) {
-                             return Op->isDefinedOutsideLoopRegions();
-                           })) &&
-         "Expected a recipe is either within a region or all of its operands "
-         "are defined outside the vectorized region.");
+  assert(
+      (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
+       all_of(RepRecipe->operands(),
+              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
+      "Expected a recipe is either within a region or all of its operands "
+      "are defined outside the vectorized region.");
   if (IfPredicateInstr)
     PredicatedInstructions.push_back(Cloned);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 197d7289288388..3664a5aee7b6f8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1081,19 +1081,28 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   return getVectorLoopRegion()->cost(VF, Ctx);
 }
 
+VPBasicBlock *VPlan::getVectorPreheader() {
+  VPBlockBase *Current = getEntry()->getSuccessors().back();
+  while (Current->getNumSuccessors() == 2)
+    Current = Current->getSuccessors().back();
+  return cast<VPBasicBlock>(Current);
+}
+
+VPBasicBlock *VPlan::getVectorPreheader() const {
+  VPBlockBase *Current = getEntry()->getSuccessors().back();
+  while (Current->getNumSuccessors() == 2)
+    Current = Current->getSuccessors().back();
+  return cast<VPBasicBlock>(Current);
+}
+
 VPRegionBlock *VPlan::getVectorLoopRegion() {
   // TODO: Cache if possible.
-  for (VPBlockBase *B : vp_depth_first_shallow(getEntry()))
-    if (auto *R = dyn_cast<VPRegionBlock>(B))
-      return R;
-  return nullptr;
+  return dyn_cast<VPRegionBlock>(getVectorPreheader()->getSingleSuccessor());
+  ;
 }
 
 const VPRegionBlock *VPlan::getVectorLoopRegion() const {
-  for (const VPBlockBase *B : vp_depth_first_shallow(getEntry()))
-    if (auto *R = dyn_cast<VPRegionBlock>(B))
-      return R;
-  return nullptr;
+  return dyn_cast<VPRegionBlock>(getVectorPreheader()->getSingleSuccessor());
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1415,8 +1424,10 @@ void VPlanIngredient::print(raw_ostream &O) const {
 #endif
 
 bool VPValue::isDefinedOutsideLoopRegions() const {
+
   return !hasDefiningRecipe() ||
-         !getDefiningRecipe()->getParent()->getEnclosingLoopRegion();
+         (!getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
+          getDefiningRecipe()->getParent()->getPlan()->getVectorLoopRegion());
 }
 
 void VPValue::replaceAllUsesWith(VPValue *New) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d2297101ab305a..87a3884404fed8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3869,12 +3869,8 @@ class VPlan {
   const VPBasicBlock *getEntry() const { return Entry; }
 
   /// Returns the preheader of the vector loop region.
-  VPBasicBlock *getVectorPreheader() {
-    auto *LoopRegion = getVectorLoopRegion();
-    if (!LoopRegion)
-      return nullptr;
-    return dyn_cast<VPBasicBlock>(LoopRegion->getSinglePredecessor());
-  }
+  VPBasicBlock *getVectorPreheader();
+  VPBasicBlock *getVectorPreheader() const;
 
   /// Returns the VPRegionBlock of the vector loop.
   VPRegionBlock *getVectorLoopRegion();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f227f83fdeb6cb..a7097dd1934478 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -718,6 +718,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 
     Header->setParent(nullptr);
     Exiting->setParent(nullptr);
+
+    for (VPBlockBase *B : vp_depth_first_shallow(LoopRegion->getEntry())) {
+      if (isa<VPRegionBlock>(B))
+        B->setParent(nullptr);
+    }
     VPBlockUtils::connectBlocks(Preheader, Header);
 
     VPBlockUtils::connectBlocks(Exiting, Middle);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 3dd5e70e8d5b14..7a6676ff6ea1b3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -163,12 +163,12 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]])
 ; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index 21063e32e81b69..2de0f7e4d40169 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -27,91 +27,91 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
 ; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1
 ; CHECK-NEXT:    store i1 [[TMP9]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2
 ; CHECK-NEXT:    store i1 [[TMP12]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3
 ; CHECK-NEXT:    store i1 [[TMP14]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4
 ; CHECK-NEXT:    store i1 [[TMP16]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5
 ; CHECK-NEXT:    store i1 [[TMP18]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
 ; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6
 ; CHECK-NEXT:    store i1 [[TMP20]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
 ; CHECK:       pred.store.if13:
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7
 ; CHECK-NEXT:    store i1 [[TMP22]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
 ; CHECK:       pred.store.if15:
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8
 ; CHECK-NEXT:    store i1 [[TMP24]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
 ; CHECK:       pred.store.continue16:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9
 ; CHECK-NEXT:    store i1 [[TMP26]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10
 ; CHECK-NEXT:    store i1 [[TMP28]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
 ; CHECK:       pred.store.if21:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11
 ; CHECK-NEXT:    store i1 [[TMP30]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
 ; CHECK:       pred.store.if23:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12
 ; CHECK-NEXT:    store i1 [[TMP32]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
 ; CHECK:       pred.store.continue24:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
 ; CHECK:       pred.store.if25:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13
 ; CHECK-NEXT:    store i1 [[TMP34]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
 ; CHECK:       pred.store.continue26:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
 ; CHECK:       pred.store.if27:
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14
 ; CHECK-NEXT:    store i1 [[TMP36]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
 ; CHECK:       pred.store.continue28:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
 ; CHECK:       pred.store.if29:
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15
 ; CHECK-NEXT:    store i1 [[TMP38]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index c912e18e74ecf8..61bcbaa1fe4d2e 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -16,20 +16,22 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 0, 4
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i64 0, 4
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw i64 0, 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw i64 0, 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 1, 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw i64 2, 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i64 3, 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
@@ -44,21 +46,21 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
 ; CHECK:       [[PRED_STORE_IF1]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP27]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
 ; CHECK:       [[PRED_STORE_IF3]]:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP29]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK:       [[PRED_STORE_IF5]]:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP31]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK:       [[PRED_STORE_CONTINUE6]]:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index c1ac6f85b0b647..ba728c6be8ab6e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -70,7 +70,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT:  IR   %n.vec = sub i64 %and, %n.mod.vf
 ; CHECK-NEXT:  IR   %ind.end = sub i64 %and, %n.vec
 ; CHECK-NEXT:  IR   %ind.end1 = getelementptr i8, ptr %A, i64 %n.vec
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT:   vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1>
@@ -85,7 +85,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT:   vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
 ; CHECK-NEXT:   WIDEN store vp<[[VPTR3]]>, ir<%add>
 ; CHECK-NEXT:   WIDEN store vp<[[VPTR4]]>, ir<%add>.1
-; CHECK-NEXT: Successor(s): middle.block
+; CHECK-NEXT: Successor(s): ir-bb<middle.block>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<middle.block>:
 ; CHECK-NEXT:   EMIT vp<[[C:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VTC]]>

>From dd45cad20284589bbe26db0c64bf8d1ad3210e91 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 18 Dec 2024 14:33:49 +0000
Subject: [PATCH 11/24] [VPlan] Manage created blocks directly in VPlan. (NFC)

This patch changes the way blocks are managed by VPlan. Previously all
blocks reachable from entry would be cleaned up when a VPlan is
destroyed. With this patch, each VPlan keeps track of blocks created for
it in a list and this list is then used to delete all blocks in the list
when the VPlan is destroyed. To do so, block creation is funneled
through helpers in directly in VPlan.

The main advantage of doing so is it simplifies CFG transformations, as
those do not have to take care of deleting any blocks, just adjusting
the CFG. This helps to simplify
https://github.com/llvm/llvm-project/pull/108378 and
https://github.com/llvm/llvm-project/pull/106748.

This also simplifies handling of 'immutable' blocks a VPlan holds
references to, which at the moment only include the scalar
header block.

Note that the original constructors taking VPBlockBase are retained at
the moment for unit tests.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  7 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 84 ++++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 54 ++++++------
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp |  4 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 19 +++--
 5 files changed, 97 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 355ff40ce770e7..4f2bf097d0cde5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2477,7 +2477,7 @@ static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) {
     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
            "Unexpected successor");
-    VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB);
+    VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
     VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB);
     PreVectorPH = CheckVPIRBB;
   }
@@ -8189,11 +8189,10 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
 
   // A new entry block has been created for the epilogue VPlan. Hook it in, as
   // otherwise we would try to modify the entry to the main vector loop.
-  VPIRBasicBlock *NewEntry = VPIRBasicBlock::fromBasicBlock(Insert);
+  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
   VPBasicBlock *OldEntry = Plan.getEntry();
   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
   Plan.setEntry(NewEntry);
-  delete OldEntry;
 
   introduceCheckBlockInVPlan(Plan, Insert);
   return Insert;
@@ -9463,7 +9462,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
-    VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+    VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 9a082921d4f7f2..e03847cea131f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -205,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
   return Parent->getEnclosingBlockWithPredecessors();
 }
 
-void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
-  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry)))
-    delete Block;
-}
-
 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
   iterator It = begin();
   while (It != end() && It->isPhi())
@@ -474,6 +469,16 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
   connectToPredecessors(State->CFG);
 }
 
+VPIRBasicBlock *VPIRBasicBlock::clone() {
+  auto *NewBlock = getPlan()->createVPIRBasicBlock(IRBB);
+  for (VPRecipeBase &R : make_early_inc_range(*NewBlock))
+    R.eraseFromParent();
+
+  for (VPRecipeBase &R : Recipes)
+    NewBlock->appendRecipe(R.clone());
+  return NewBlock;
+}
+
 void VPBasicBlock::execute(VPTransformState *State) {
   bool Replica = bool(State->Lane);
   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -523,6 +528,13 @@ void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
   }
 }
 
+VPBasicBlock *VPBasicBlock::clone() {
+  auto *NewBlock = getPlan()->createVPBasicBlock(getName());
+  for (VPRecipeBase &R : *this)
+    NewBlock->appendRecipe(R.clone());
+  return NewBlock;
+}
+
 void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
   LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
                     << " in BB:" << BB->getName() << '\n');
@@ -541,7 +553,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
 
   SmallVector<VPBlockBase *, 2> Succs(successors());
   // Create new empty block after the block to split.
-  auto *SplitBlock = new VPBasicBlock(getName() + ".split");
+  auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split");
   VPBlockUtils::insertBlockAfter(SplitBlock, this);
 
   // Finally, move the recipes starting at SplitAt to new block.
@@ -701,8 +713,8 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
 
 VPRegionBlock *VPRegionBlock::clone() {
   const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
-  auto *NewRegion =
-      new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator());
+  auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
+                                                   getName(), isReplicator());
   for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
     Block->setParent(NewRegion);
   return NewRegion;
@@ -822,17 +834,20 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 VPlan::VPlan(Loop *L) {
-  setEntry(VPIRBasicBlock::fromBasicBlock(L->getLoopPreheader()));
-  ScalarHeader = VPIRBasicBlock::fromBasicBlock(L->getHeader());
+  setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
+  ScalarHeader = createVPIRBasicBlock(L->getHeader());
 }
 
 VPlan::~VPlan() {
   if (Entry) {
     VPValue DummyValue;
-    for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
-      Block->dropAllReferences(&DummyValue);
 
-    VPBlockBase::deleteCFG(Entry);
+    for (auto *VPB : reverse(CreatedBlocks))
+      VPB->dropAllReferences(&DummyValue);
+
+    for (auto *VPB : reverse(CreatedBlocks)) {
+      delete VPB;
+    }
   }
   for (VPValue *VPV : VPLiveInsToFree)
     delete VPV;
@@ -840,14 +855,6 @@ VPlan::~VPlan() {
     delete BackedgeTakenCount;
 }
 
-VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
-  auto *VPIRBB = new VPIRBasicBlock(IRBB);
-  for (Instruction &I :
-       make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
-    VPIRBB->appendRecipe(new VPIRInstruction(I));
-  return VPIRBB;
-}
-
 VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
                                    PredicatedScalarEvolution &PSE,
                                    bool RequiresScalarEpilogueCheck,
@@ -861,7 +868,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   // an epilogue vector loop, the original entry block here will be replaced by
   // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after
   // generating code for the main vector loop.
-  VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
+  VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph");
   VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader);
 
   // Create SCEV and VPValue for the trip count.
@@ -878,17 +885,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
 
   // Create VPRegionBlock, with empty header and latch blocks, to be filled
   // during processing later.
-  VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
-  VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+  VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body");
+  VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch");
   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
-  auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop",
-                                      false /*isReplicator*/);
+  auto *TopRegion = Plan->createVPRegionBlock(
+      HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
 
   VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
-  VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+  VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
-  VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+  VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph");
   VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
   if (!RequiresScalarEpilogueCheck) {
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -904,7 +911,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
-  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+  auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock);
   // The connection order corresponds to the operands of the conditional branch.
   VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
   VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -960,15 +967,13 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
-  VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB);
+  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
   for (auto &R : make_early_inc_range(*VPBB)) {
     assert(!R.isPhi() && "Tried to move phi recipe to end of block");
     R.moveBefore(*IRVPBB, IRVPBB->end());
   }
 
   VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
-
-  delete VPBB;
 }
 
 /// Generate the code inside the preheader and body of the vectorized loop.
@@ -1217,6 +1222,7 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
 }
 
 VPlan *VPlan::duplicate() {
+  unsigned CreatedBlockSize = CreatedBlocks.size();
   // Clone blocks.
   const auto &[NewEntry, __] = cloneFrom(Entry);
 
@@ -1257,9 +1263,23 @@ VPlan *VPlan::duplicate() {
   assert(Old2NewVPValues.contains(TripCount) &&
          "TripCount must have been added to Old2NewVPValues");
   NewPlan->TripCount = Old2NewVPValues[TripCount];
+
+  for (unsigned I = CreatedBlockSize; I != CreatedBlocks.size(); ++I)
+    NewPlan->CreatedBlocks.push_back(CreatedBlocks[I]);
+  CreatedBlocks.truncate(CreatedBlockSize);
+
   return NewPlan;
 }
 
+VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
+  auto *VPIRBB = new VPIRBasicBlock(IRBB);
+  for (Instruction &I :
+       make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
+    VPIRBB->appendRecipe(new VPIRInstruction(I));
+  CreatedBlocks.push_back(VPIRBB);
+  return VPIRBB;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
 Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e2c0ff79546758..eb0e3baa8d4f38 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -636,9 +636,6 @@ class VPBlockBase {
   /// Return the cost of the block.
   virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
 
-  /// Delete all blocks reachable from a given VPBlockBase, inclusive.
-  static void deleteCFG(VPBlockBase *Entry);
-
   /// Return true if it is legal to hoist instructions into this block.
   bool isLegalToHoistInto() {
     // There are currently no constraints that prevent an instruction to be
@@ -3638,12 +3635,7 @@ class VPBasicBlock : public VPBlockBase {
 
   /// Clone the current block and it's recipes, without updating the operands of
   /// the cloned recipes.
-  VPBasicBlock *clone() override {
-    auto *NewBlock = new VPBasicBlock(getName());
-    for (VPRecipeBase &R : *this)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPBasicBlock *clone() override;
 
 protected:
   /// Execute the recipes in the IR basic block \p BB.
@@ -3679,20 +3671,11 @@ class VPIRBasicBlock : public VPBasicBlock {
     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
   }
 
-  /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
-  /// instructions in \p IRBB, except its terminator which is managed in VPlan.
-  static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB);
-
   /// The method which generates the output IR instructions that correspond to
   /// this VPBasicBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
-  VPIRBasicBlock *clone() override {
-    auto *NewBlock = new VPIRBasicBlock(IRBB);
-    for (VPRecipeBase &R : Recipes)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPIRBasicBlock *clone() override;
 
   BasicBlock *getIRBasicBlock() const { return IRBB; }
 };
@@ -3732,11 +3715,6 @@ class VPRegionBlock : public VPBlockBase {
         IsReplicator(IsReplicator) {}
 
   ~VPRegionBlock() override {
-    if (Entry) {
-      VPValue DummyValue;
-      Entry->dropAllReferences(&DummyValue);
-      deleteCFG(Entry);
-    }
   }
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -3863,6 +3841,8 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
+  SmallVector<VPBlockBase *> CreatedBlocks;
+
 public:
   /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
   /// wrapping the original header of the scalar loop.
@@ -4079,6 +4059,32 @@ class VPlan {
   /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
   /// recipes to refer to the clones, and return it.
   VPlan *duplicate();
+
+  VPBasicBlock *createVPBasicBlock(const Twine &Name,
+                                   VPRecipeBase *Recipe = nullptr) {
+    auto *VPB = new VPBasicBlock(Name, Recipe);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
+                                     const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
+  /// instructions in \p IRBB, except its terminator which is managed in VPlan.
+  VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
 };
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6e633739fcc3dd..02f4c8d8872d82 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
   // Create new VPBB.
   StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
   LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
-  VPBasicBlock *VPBB = new VPBasicBlock(Name);
+  VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
   BB2VPBB[BB] = VPBB;
 
   // Get or create a region for the loop containing BB.
@@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
   if (LoopOfBB == TheLoop) {
     RegionOfVPBB = Plan.getVectorLoopRegion();
   } else {
-    RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/);
+    RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/);
     RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]);
   }
   RegionOfVPBB->setEntry(VPBB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0b809c2b34df9e..cd3ea561e6aac0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -297,8 +297,6 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
     DeletedRegions.insert(Region1);
   }
 
-  for (VPRegionBlock *ToDelete : DeletedRegions)
-    delete ToDelete;
   return !DeletedRegions.empty();
 }
 
@@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
   assert(Instr->getParent() && "Predicated instruction not in any basic block");
   auto *BlockInMask = PredRecipe->getMask();
   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
-  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+  auto *Entry =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
 
   // Replace predicated replicate recipe with a replicate recipe without a
   // mask but in the replicate region.
@@ -318,7 +317,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
       PredRecipe->getUnderlyingInstr(),
       make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
       PredRecipe->isUniform());
-  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
+  auto *Pred =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
 
   VPPredInstPHIRecipe *PHIRecipe = nullptr;
   if (PredRecipe->getNumUsers() != 0) {
@@ -328,8 +328,10 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
     PHIRecipe->setOperand(0, RecipeWithoutMask);
   }
   PredRecipe->eraseFromParent();
-  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
-  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
+  auto *Exiting =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+  VPRegionBlock *Region =
+      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
 
   // Note: first set Entry as region entry and then connect successors starting
   // from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -396,7 +398,6 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
       VPBlockUtils::disconnectBlocks(VPBB, Succ);
       VPBlockUtils::connectBlocks(PredVPBB, Succ);
     }
-    delete VPBB;
   }
   return !WorkList.empty();
 }
@@ -1898,7 +1899,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
   if (OrigLoop->getUniqueExitBlock()) {
     VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
   } else {
-    VPEarlyExitBlock = VPIRBasicBlock::fromBasicBlock(
+    VPEarlyExitBlock = Plan.createVPIRBasicBlock(
         !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
   }
 
@@ -1908,7 +1909,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
   IsEarlyExitTaken =
       Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
 
-  VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+  VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
   VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
   VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
   NewMiddle->swapSuccessors();

>From e72a71fa3c31872b72a84cd78ac8b9b1aa719883 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 24 Dec 2024 21:51:48 +0000
Subject: [PATCH 12/24] !fixup address comments, add comments

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp |  3 ++-
 llvm/lib/Transforms/Vectorize/VPlan.h   | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index e03847cea131f0..204a1e01b9313c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1264,7 +1264,8 @@ VPlan *VPlan::duplicate() {
          "TripCount must have been added to Old2NewVPValues");
   NewPlan->TripCount = Old2NewVPValues[TripCount];
 
-  for (unsigned I = CreatedBlockSize; I != CreatedBlocks.size(); ++I)
+  // Transfer cloned blocks to new VPlan.
+  for (unsigned I : seq<unsigned>(CreatedBlockSize, CreatedBlocks.size()))
     NewPlan->CreatedBlocks.push_back(CreatedBlocks[I]);
   CreatedBlocks.truncate(CreatedBlockSize);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index eb0e3baa8d4f38..434b4d7e49ab82 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3714,8 +3714,7 @@ class VPRegionBlock : public VPBlockBase {
       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
         IsReplicator(IsReplicator) {}
 
-  ~VPRegionBlock() override {
-  }
+  ~VPRegionBlock() override {}
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
@@ -3841,6 +3840,8 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
+  /// Blocks allocated and owned by the VPlan. They will be deleted once the
+  /// VPlan is destroyed.
   SmallVector<VPBlockBase *> CreatedBlocks;
 
 public:
@@ -4060,6 +4061,9 @@ class VPlan {
   /// recipes to refer to the clones, and return it.
   VPlan *duplicate();
 
+  /// Create a new VPBasicBlock with \p Name and containing \p Recipe if
+  /// present. The returned block is owned by the VPlan and deleted once the
+  /// VPlan is destroyed.
   VPBasicBlock *createVPBasicBlock(const Twine &Name,
                                    VPRecipeBase *Recipe = nullptr) {
     auto *VPB = new VPBasicBlock(Name, Recipe);
@@ -4067,6 +4071,9 @@ class VPlan {
     return VPB;
   }
 
+  /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
+  /// IsReplicator is true, the region is a replicate region. The returned block
+  /// is owned by the VPlan and deleted once the VPlan is destroyed.
   VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
                                      const std::string &Name = "",
                                      bool IsReplicator = false) {
@@ -4075,6 +4082,10 @@ class VPlan {
     return VPB;
   }
 
+  /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
+  /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
+  /// The returned block is owned by the VPlan and deleted once the VPlan is
+  /// destroyed.
   VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
                                      bool IsReplicator = false) {
     auto *VPB = new VPRegionBlock(Name, IsReplicator);
@@ -4084,6 +4095,8 @@ class VPlan {
 
   /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
   /// instructions in \p IRBB, except its terminator which is managed in VPlan.
+  /// The returned block is owned by the VPlan and deleted once the VPlan is
+  /// destroyed.
   VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
 };
 

>From 407dbc1eccf9e4cd22e1c20f9212fef0b2ead7f4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 27 Dec 2024 11:25:59 +0000
Subject: [PATCH 13/24] [VPlan] Funnel

---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  14 +-
 .../Transforms/Vectorize/VPlanHCFGBuilder.h   |   4 +-
 .../Transforms/Vectorize/VPDomTreeTest.cpp    |  42 ++--
 .../Transforms/Vectorize/VPlanHCFGTest.cpp    |   2 +-
 .../Transforms/Vectorize/VPlanSlpTest.cpp     |   2 +-
 .../Transforms/Vectorize/VPlanTest.cpp        | 210 ++++++------------
 .../Transforms/Vectorize/VPlanTestBase.h      |  20 +-
 .../Vectorize/VPlanVerifierTest.cpp           |  89 +++-----
 8 files changed, 139 insertions(+), 244 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 434b4d7e49ab82..beabcc7fd41874 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3854,18 +3854,18 @@ class VPlan {
            "scalar header must be a leaf node");
   }
 
-  /// Construct a VPlan with \p Entry entering the plan, trip count \p TC and
-  /// with \p ScalarHeader wrapping the original header of the scalar loop.
-  VPlan(VPBasicBlock *Entry, VPValue *TC, VPIRBasicBlock *ScalarHeader)
-      : VPlan(Entry, ScalarHeader) {
-    TripCount = TC;
-  }
-
+public:
   /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the
   /// original preheader and scalar header of \p L, to be used as entry and
   /// scalar header blocks of the new VPlan.
   VPlan(Loop *L);
 
+  VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
+    setEntry(new VPBasicBlock("preheader"));
+    ScalarHeader = VPIRBasicBlock::fromBasicBlock(ScalarHeaderBB);
+    TripCount = TC;
+  }
+
   ~VPlan();
 
   void setEntry(VPBasicBlock *VPBB) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index 9e8f9f3f400293..ad6e2ad90a9610 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -32,11 +32,11 @@ class Loop;
 class LoopInfo;
 class VPRegionBlock;
 class VPlan;
-class VPlanTestBase;
+class VPlanTestIRBase;
 
 /// Main class to build the VPlan H-CFG for an incoming IR.
 class VPlanHCFGBuilder {
-  friend VPlanTestBase;
+  friend VPlanTestIRBase;
 
 private:
   // The outermost loop of the input loop nest considered for vectorization.
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
index 847cca7714effc..6aa34a5fa431b5 100644
--- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
@@ -9,12 +9,15 @@
 
 #include "../lib/Transforms/Vectorize/VPlan.h"
 #include "../lib/Transforms/Vectorize/VPlanDominatorTree.h"
+#include "VPlanTestBase.h"
 #include "gtest/gtest.h"
 
 namespace llvm {
 namespace {
 
-TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
+using VPDominatorTreeTest = VPlanTestBase;
+
+TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) {
   //   VPBB0
   //    |
   //   R1 {
@@ -24,8 +27,8 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
   //    \    /
   //    VPBB4
   //  }
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0");
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB0 = Plan.getEntry();
   VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
   VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
   VPBasicBlock *VPBB3 = new VPBasicBlock("VPBB3");
@@ -40,12 +43,7 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
   VPBlockUtils::connectBlocks(VPBB2, VPBB4);
   VPBlockUtils::connectBlocks(VPBB3, VPBB4);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB0);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
   VPDominatorTree VPDT;
   VPDT.recalculate(Plan);
@@ -62,7 +60,6 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
   EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB3), VPBB1);
   EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB4), VPBB1);
   EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB4, VPBB4), VPBB4);
-  delete ScalarHeader;
 }
 
 static void
@@ -76,9 +73,7 @@ checkDomChildren(VPDominatorTree &VPDT, VPBlockBase *Src,
   EXPECT_EQ(Children, ExpectedNodes);
 }
 
-TEST(VPDominatorTreeTest, DominanceRegionsTest) {
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
+TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
   {
     // 2 consecutive regions.
     // VPBB0
@@ -99,8 +94,8 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     //    R2BB2
     // }
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB0 = Plan.getEntry();
     VPBasicBlock *R1BB1 = new VPBasicBlock();
     VPBasicBlock *R1BB2 = new VPBasicBlock();
     VPBasicBlock *R1BB3 = new VPBasicBlock();
@@ -122,10 +117,7 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R1, R2);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB0);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
     VPDominatorTree VPDT;
     VPDT.recalculate(Plan);
 
@@ -177,7 +169,7 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     //   |
     //  VPBB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
+    VPlan &Plan = getPlan();
     VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
     VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
     VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
@@ -199,15 +191,12 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     VPBlockUtils::connectBlocks(R1BB2, R1BB3);
     VPBlockUtils::connectBlocks(R2, R1BB3);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
     VPDominatorTree VPDT;
     VPDT.recalculate(Plan);
 
@@ -220,9 +209,8 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     checkDomChildren(VPDT, R2BB2, {R2BB3});
     checkDomChildren(VPDT, R2BB3, {});
     checkDomChildren(VPDT, R1BB3, {VPBB2});
-    checkDomChildren(VPDT, VPBB2, {ScalarHeaderVPBB});
+    checkDomChildren(VPDT, VPBB2, {Plan.getScalarHeader()});
   }
-  delete ScalarHeader;
 }
 
 } // namespace
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index 1b362d1d26bdd3..19c2483d34ed17 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -17,7 +17,7 @@
 namespace llvm {
 namespace {
 
-class VPlanHCFGTest : public VPlanTestBase {};
+class VPlanHCFGTest : public VPlanTestIRBase {};
 
 TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) {
   const char *ModuleString =
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
index 1b993b63898caa..e3c542ec5cac85 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -16,7 +16,7 @@
 namespace llvm {
 namespace {
 
-class VPlanSlpTest : public VPlanTestBase {
+class VPlanSlpTest : public VPlanTestIRBase {
 protected:
   TargetLibraryInfoImpl TLII;
   TargetLibraryInfo TLI;
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index f3a1bba518c83c..2ab55f64a20730 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -9,6 +9,7 @@
 
 #include "../lib/Transforms/Vectorize/VPlan.h"
 #include "../lib/Transforms/Vectorize/VPlanCFG.h"
+#include "VPlanTestBase.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -237,12 +238,13 @@ TEST(VPInstructionTest, releaseOperandsAtDeletion) {
   delete VPV1;
   delete VPV2;
 }
-TEST(VPBasicBlockTest, getPlan) {
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
+
+using VPBasicBlockTest = VPlanTestBase;
+
+TEST_F(VPBasicBlockTest, getPlan) {
   {
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBasicBlock *VPBB2 = new VPBasicBlock();
     VPBasicBlock *VPBB3 = new VPBasicBlock();
     VPBasicBlock *VPBB4 = new VPBasicBlock();
@@ -256,11 +258,7 @@ TEST(VPBasicBlockTest, getPlan) {
     VPBlockUtils::connectBlocks(VPBB1, VPBB3);
     VPBlockUtils::connectBlocks(VPBB2, VPBB4);
     VPBlockUtils::connectBlocks(VPBB3, VPBB4);
-
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader());
 
     EXPECT_EQ(&Plan, VPBB1->getPlan());
     EXPECT_EQ(&Plan, VPBB2->getPlan());
@@ -269,20 +267,17 @@ TEST(VPBasicBlockTest, getPlan) {
   }
 
   {
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     // VPBasicBlock is the entry into the VPlan, followed by a region.
     VPBasicBlock *R1BB1 = new VPBasicBlock();
     VPBasicBlock *R1BB2 = new VPBasicBlock();
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
     VPBlockUtils::connectBlocks(R1BB1, R1BB2);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
     VPBlockUtils::connectBlocks(VPBB1, R1);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
     EXPECT_EQ(&Plan, VPBB1->getPlan());
     EXPECT_EQ(&Plan, R1->getPlan());
@@ -291,8 +286,7 @@ TEST(VPBasicBlockTest, getPlan) {
   }
 
   {
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-
+    VPlan &Plan = getPlan();
     VPBasicBlock *R1BB1 = new VPBasicBlock();
     VPBasicBlock *R1BB2 = new VPBasicBlock();
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
@@ -303,7 +297,7 @@ TEST(VPBasicBlockTest, getPlan) {
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(VPBB1, R2);
 
@@ -311,10 +305,7 @@ TEST(VPBasicBlockTest, getPlan) {
     VPBlockUtils::connectBlocks(R1, VPBB2);
     VPBlockUtils::connectBlocks(R2, VPBB2);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
 
     EXPECT_EQ(&Plan, VPBB1->getPlan());
     EXPECT_EQ(&Plan, R1->getPlan());
@@ -325,12 +316,9 @@ TEST(VPBasicBlockTest, getPlan) {
     EXPECT_EQ(&Plan, R2BB2->getPlan());
     EXPECT_EQ(&Plan, VPBB2->getPlan());
   }
-  delete ScalarHeader;
 }
 
-TEST(VPBasicBlockTest, TraversingIteratorTest) {
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
+TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
   {
     // VPBasicBlocks only
     //     VPBB1
@@ -339,8 +327,8 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //    \    /
     //    VPBB4
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBasicBlock *VPBB2 = new VPBasicBlock();
     VPBasicBlock *VPBB3 = new VPBasicBlock();
     VPBasicBlock *VPBB4 = new VPBasicBlock();
@@ -356,11 +344,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(VPBB1, FromIterator[0]);
     EXPECT_EQ(VPBB2, FromIterator[1]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader());
   }
 
   {
@@ -382,8 +366,8 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //      |
     //    R2BB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB0 = Plan.getEntry();
     VPBasicBlock *R1BB1 = new VPBasicBlock();
     VPBasicBlock *R1BB2 = new VPBasicBlock();
     VPBasicBlock *R1BB3 = new VPBasicBlock();
@@ -458,11 +442,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R1BB1, FromIterator[6]);
     EXPECT_EQ(R1, FromIterator[7]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB0);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
   }
 
   {
@@ -486,7 +466,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //   |
     //  VPBB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
+    VPlan &Plan = getPlan();
     VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
     VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
     VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
@@ -508,7 +488,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     VPBlockUtils::connectBlocks(R1BB2, R1BB3);
     VPBlockUtils::connectBlocks(R2, R1BB3);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
@@ -543,11 +523,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R1, FromIterator[8]);
     EXPECT_EQ(VPBB1, FromIterator[9]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
   }
 
   {
@@ -561,7 +537,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //      R2BB2
     //   }
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
+    VPlan &Plan = getPlan();
     VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
     VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
@@ -570,7 +546,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1");
     R2->setParent(R1);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
 
     // Depth-first.
@@ -593,11 +569,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R1, FromIterator[3]);
     EXPECT_EQ(VPBB1, FromIterator[4]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
   }
 
   {
@@ -619,7 +591,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //   |
     //  VPBB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
+    VPlan &Plan = getPlan();
     VPBasicBlock *R3BB1 = new VPBasicBlock("R3BB1");
     VPRegionBlock *R3 = new VPRegionBlock(R3BB1, R3BB1, "R3");
 
@@ -631,7 +603,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1");
     R2->setParent(R1);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(R1, VPBB2);
@@ -687,19 +659,15 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R2BB1, FromIterator[2]);
     EXPECT_EQ(VPBB1, FromIterator[3]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
   }
-  delete ScalarHeader;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-TEST(VPBasicBlockTest, print) {
+TEST_F(VPBasicBlockTest, print) {
   VPInstruction *TC = new VPInstruction(Instruction::Add, {});
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
+  VPlan &Plan = getPlan(TC);
+  VPBasicBlock *VPBB0 = Plan.getEntry();
   VPBB0->appendRecipe(TC);
 
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
@@ -730,12 +698,8 @@ TEST(VPBasicBlockTest, print) {
     EXPECT_EQ("EMIT br <badref>, <badref>", I3Dump);
   }
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "scalar.header");
-  auto * ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, TC, ScalarHeaderVPBB);
   std::string FullDump;
   raw_string_ostream OS(FullDump);
   Plan.printDOT(OS);
@@ -810,13 +774,12 @@ Successor(s): ir-bb<scalar.header>
     OS << *I4;
     EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump);
   }
-  delete ScalarHeader;
 }
 
-TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) {
-
+TEST_F(VPBasicBlockTest, printPlanWithVFsAndUFs) {
   VPInstruction *TC = new VPInstruction(Instruction::Sub, {});
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
+  VPlan &Plan = getPlan(TC);
+  VPBasicBlock *VPBB0 = Plan.getEntry();
   VPBB0->appendRecipe(TC);
 
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
@@ -824,12 +787,8 @@ TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) {
   VPBB1->appendRecipe(I1);
   VPBB1->setName("bb1");
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, TC, ScalarHeaderVPBB);
   Plan.setName("TestPlan");
   Plan.addVF(ElementCount::getFixed(4));
 
@@ -847,9 +806,9 @@ Successor(s): bb1
 
 bb1:
   EMIT vp<%2> = add
-Successor(s): ir-bb<>
+Successor(s): ir-bb<scalar.header>
 
-ir-bb<>:
+ir-bb<scalar.header>:
 No successors
 }
 )";
@@ -871,9 +830,9 @@ Successor(s): bb1
 
 bb1:
   EMIT vp<%2> = add
-Successor(s): ir-bb<>
+Successor(s): ir-bb<scalar.header>
 
-ir-bb<>:
+ir-bb<scalar.header>:
 No successors
 }
 )";
@@ -895,19 +854,19 @@ Successor(s): bb1
 
 bb1:
   EMIT vp<%2> = add
-Successor(s): ir-bb<>
+Successor(s): ir-bb<scalar.header>
 
-ir-bb<>:
+ir-bb<scalar.header>:
 No successors
 }
 )";
     EXPECT_EQ(ExpectedStr, FullDump);
   }
-  delete ScalarHeader;
 }
 #endif
 
-TEST(VPRecipeTest, CastVPInstructionToVPUser) {
+using VPRecipeTest = VPlanTestBase;
+TEST_F(VPRecipeTest, CastVPInstructionToVPUser) {
   VPValue Op1;
   VPValue Op2;
   VPInstruction Recipe(Instruction::Add, {&Op1, &Op2});
@@ -917,9 +876,7 @@ TEST(VPRecipeTest, CastVPInstructionToVPUser) {
   EXPECT_EQ(&Recipe, BaseR);
 }
 
-TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
                                        PoisonValue::get(Int32));
@@ -936,9 +893,7 @@ TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) {
   delete AI;
 }
 
-TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   FunctionType *FTy = FunctionType::get(Int32, false);
   Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0);
@@ -964,9 +919,7 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
   delete Fn;
 }
 
-TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
   IntegerType *Int1 = IntegerType::get(C, 1);
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *SelectI = SelectInst::Create(
@@ -992,9 +945,7 @@ TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
   delete SelectI;
 }
 
-TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   PointerType *Int32Ptr = PointerType::get(Int32, 0);
   auto *GEP = GetElementPtrInst::Create(Int32, PoisonValue::get(Int32Ptr),
@@ -1017,9 +968,7 @@ TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
   delete GEP;
 }
 
-TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *Phi = PHINode::Create(Int32, 1);
   VPValue I1;
@@ -1036,9 +985,7 @@ TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) {
   delete Phi;
 }
 
-TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) {
   VPValue Addr;
   VPValue Mask;
   InterleaveGroup<Instruction> IG(4, false, Align(4));
@@ -1049,9 +996,7 @@ TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) {
   EXPECT_EQ(&Recipe, BaseR);
 }
 
-TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
   VPValue Op1;
   VPValue Op2;
   SmallVector<VPValue *, 4> Args;
@@ -1068,9 +1013,7 @@ TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
   delete Call;
 }
 
-TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
   VPValue Mask;
   VPBranchOnMaskRecipe Recipe(&Mask);
   EXPECT_TRUE(isa<VPUser>(&Recipe));
@@ -1079,9 +1022,7 @@ TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
   EXPECT_EQ(&Recipe, BaseR);
 }
 
-TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   PointerType *Int32Ptr = PointerType::get(Int32, 0);
   auto *Load =
@@ -1101,8 +1042,7 @@ TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
   delete Load;
 }
 
-TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
-  LLVMContext C;
+TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
   IntegerType *Int1 = IntegerType::get(C, 1);
   IntegerType *Int32 = IntegerType::get(C, 32);
   PointerType *Int32Ptr = PointerType::get(Int32, 0);
@@ -1242,7 +1182,6 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
 
   {
     // Test for a call to a function without side-effects.
-    LLVMContext C;
     Module M("", C);
     Function *TheFn =
         Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer);
@@ -1296,15 +1235,12 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-TEST(VPRecipeTest, dumpRecipeInPlan) {
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
+TEST_F(VPRecipeTest, dumpRecipeInPlan) {
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB0 = Plan.getEntry();
   VPBasicBlock *VPBB1 = new VPBasicBlock();
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, ScalarHeaderVPBB);
 
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
@@ -1366,18 +1302,14 @@ TEST(VPRecipeTest, dumpRecipeInPlan) {
   }
 
   delete AI;
-  delete ScalarHeader;
 }
 
-TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
+TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB0 = Plan.getEntry();
   VPBasicBlock *VPBB1 = new VPBasicBlock();
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, ScalarHeaderVPBB);
 
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
@@ -1456,11 +1388,9 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
         testing::ExitedWithCode(0), "EMIT vp<%2> = mul vp<%1>, vp<%1>");
   }
   delete AI;
-  delete ScalarHeader;
 }
 
-TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) {
-  LLVMContext C;
+TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
                                        PoisonValue::get(Int32));
@@ -1543,9 +1473,7 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) {
 
 #endif
 
-TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) {
   VPValue ChainOp;
   VPValue VecOp;
   VPValue CondOp;
@@ -1556,9 +1484,7 @@ TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) {
   EXPECT_TRUE(isa<VPUser>(BaseR));
 }
 
-TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
   VPValue ChainOp;
   VPValue VecOp;
   VPValue CondOp;
@@ -1630,7 +1556,7 @@ TEST(VPDoubleValueDefTest, traverseUseLists) {
   EXPECT_EQ(&DoubleValueDef, I3.getOperand(0)->getDefiningRecipe());
 }
 
-TEST(VPRecipeTest, CastToVPSingleDefRecipe) {
+TEST_F(VPRecipeTest, CastToVPSingleDefRecipe) {
   VPValue Start;
   VPEVLBasedIVPHIRecipe R(&Start, {});
   VPRecipeBase *B = &R;
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 06e091da9054e3..1836a5e39a290e 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -28,7 +28,7 @@ namespace llvm {
 
 /// Helper class to create a module from an assembly string and VPlans for a
 /// given loop entry block.
-class VPlanTestBase : public testing::Test {
+class VPlanTestIRBase : public testing::Test {
 protected:
   TargetLibraryInfoImpl TLII;
   TargetLibraryInfo TLI;
@@ -41,7 +41,7 @@ class VPlanTestBase : public testing::Test {
   std::unique_ptr<AssumptionCache> AC;
   std::unique_ptr<ScalarEvolution> SE;
 
-  VPlanTestBase()
+  VPlanTestIRBase()
       : TLII(), TLI(TLII),
         DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-"
            "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:"
@@ -92,6 +92,22 @@ class VPlanTestBase : public testing::Test {
   }
 };
 
+class VPlanTestBase : public testing::Test {
+protected:
+  LLVMContext C;
+  std::unique_ptr<BasicBlock> ScalarHeader;
+  SmallVector<std::unique_ptr<VPlan>> Plans;
+
+  VPlanTestBase() : ScalarHeader(BasicBlock::Create(C, "scalar.header")) {
+    BranchInst::Create(&*ScalarHeader, &*ScalarHeader);
+  }
+
+  VPlan &getPlan(VPValue *TC = nullptr) {
+    Plans.push_back(std::make_unique<VPlan>(&*ScalarHeader, TC));
+    return *Plans.back();
+  }
+};
+
 } // namespace llvm
 
 #endif // LLVM_UNITTESTS_TRANSFORMS_VECTORIZE_VPLANTESTBASE_H
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 6448153de7821c..174249a7e85e32 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -8,32 +8,29 @@
 
 #include "../lib/Transforms/Vectorize/VPlanVerifier.h"
 #include "../lib/Transforms/Vectorize/VPlan.h"
+#include "VPlanTestBase.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
 
+using VPVerifierTest = VPlanTestBase;
+
 namespace {
-TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
+TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
+  VPlan &Plan = getPlan();
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBB1->appendRecipe(UseI);
   VPBB1->appendRecipe(DefI);
 
   VPBasicBlock *VPBB2 = new VPBasicBlock();
   VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
-
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -43,18 +40,17 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
   EXPECT_STREQ("Use before def!\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
+TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
+  VPlan &Plan = getPlan();
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
   auto *CanIV = new VPCanonicalIVPHIRecipe(UseI, {});
   VPInstruction *BranchOnCond =
       new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = new VPBasicBlock();
 
   VPBB1->appendRecipe(UseI);
@@ -64,13 +60,7 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
 
   VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
-
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -80,11 +70,9 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
   EXPECT_STREQ("Use before def!\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
-  LLVMContext C;
+TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *Phi = PHINode::Create(Int32, 1);
 
@@ -95,8 +83,8 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
       new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
   auto *Blend = new VPBlendRecipe(Phi, {DefI});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = new VPBasicBlock();
   VPBasicBlock *VPBB3 = new VPBasicBlock();
   VPBasicBlock *VPBB4 = new VPBasicBlock();
@@ -113,11 +101,7 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBB3->setParent(R1);
 
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -129,10 +113,9 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
 #endif
 
   delete Phi;
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
+TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
   auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {});
   VPInstruction *BranchOnCond =
@@ -140,8 +123,8 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPInstruction *BranchOnCond2 =
       new VPInstruction(VPInstruction::BranchOnCond, {I1});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = new VPBasicBlock();
 
   VPBB1->appendRecipe(I1);
@@ -153,12 +136,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBlockUtils::connectBlocks(VPBB1, R1);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -168,10 +146,9 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   EXPECT_STREQ("Multiple instances of the same successor.\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
+TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
   auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {});
   VPInstruction *BranchOnCond =
@@ -179,8 +156,8 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPInstruction *BranchOnCond2 =
       new VPInstruction(VPInstruction::BranchOnCond, {I1});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = new VPBasicBlock();
   VPBasicBlock *VPBB3 = new VPBasicBlock();
 
@@ -195,12 +172,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBB3->setParent(R1);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -210,12 +182,11 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   EXPECT_STREQ("Multiple instances of the same successor.\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = new VPBasicBlock();
 
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
@@ -228,12 +199,7 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
   VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
   VPBB1->setParent(R1);
 
 #if GTEST_HAS_STREAM_REDIRECTION
@@ -244,7 +210,6 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
   EXPECT_STREQ("Predecessor is not in the same region.\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
 } // namespace

>From af48fccd3d35851f25890f7844aebf2c57a566cc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 27 Dec 2024 11:25:59 +0000
Subject: [PATCH 14/24] [VPlan] Funnel

---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  5 +-
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp |  6 +-
 .../Transforms/Vectorize/VPDomTreeTest.cpp    | 35 +++++----
 .../Transforms/Vectorize/VPlanTest.cpp        | 77 ++++++++++---------
 .../Vectorize/VPlanVerifierTest.cpp           | 18 ++---
 5 files changed, 72 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index beabcc7fd41874..f235ed37e2c9b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3844,7 +3844,6 @@ class VPlan {
   /// VPlan is destroyed.
   SmallVector<VPBlockBase *> CreatedBlocks;
 
-public:
   /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
   /// wrapping the original header of the scalar loop.
   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
@@ -3861,8 +3860,8 @@ class VPlan {
   VPlan(Loop *L);
 
   VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
-    setEntry(new VPBasicBlock("preheader"));
-    ScalarHeader = VPIRBasicBlock::fromBasicBlock(ScalarHeaderBB);
+    setEntry(createVPBasicBlock("preheader"));
+    ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB);
     TripCount = TC;
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 02f4c8d8872d82..76ed578424dfec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() {
   BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB;
   VectorHeaderVPBB->clearSuccessors();
   VectorLatchVPBB->clearPredecessors();
-  if (TheLoop->getHeader() != TheLoop->getLoopLatch()) {
+  if (TheLoop->getHeader() != TheLoop->getLoopLatch())
     BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB;
-  } else {
+  else
     TheRegion->setExiting(VectorHeaderVPBB);
-    delete VectorLatchVPBB;
-  }
 
   // 1. Scan the body of the loop in a topological order to visit each basic
   // block after having visited its predecessor basic blocks. Create a VPBB for
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
index 6aa34a5fa431b5..4e1415fa7ac135 100644
--- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
@@ -29,10 +29,10 @@ TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) {
   //  }
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB0 = Plan.getEntry();
-  VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
-  VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
-  VPBasicBlock *VPBB3 = new VPBasicBlock("VPBB3");
-  VPBasicBlock *VPBB4 = new VPBasicBlock("VPBB4");
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("VPBB1");
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("VPBB3");
+  VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("VPBB4");
   VPRegionBlock *R1 = new VPRegionBlock(VPBB1, VPBB4);
   VPBB2->setParent(R1);
   VPBB3->setParent(R1);
@@ -96,10 +96,10 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
     //
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB0 = Plan.getEntry();
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPBasicBlock *R1BB3 = new VPBasicBlock();
-    VPBasicBlock *R1BB4 = new VPBasicBlock();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB4 = Plan.createVPBasicBlock("");
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1");
     R1BB2->setParent(R1);
     R1BB3->setParent(R1);
@@ -111,8 +111,8 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
     // Cycle.
     VPBlockUtils::connectBlocks(R1BB3, R1BB3);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R1, R2);
@@ -170,14 +170,14 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
     //  VPBB2
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
-    VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
-    VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3");
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1");
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB3 = Plan.createVPBasicBlock("");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2");
     R2BB2->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
@@ -193,7 +193,8 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""
+                                                  "VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
     VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 2ab55f64a20730..69283034991311 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -245,9 +245,9 @@ TEST_F(VPBasicBlockTest, getPlan) {
   {
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB1 = Plan.getEntry();
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
-    VPBasicBlock *VPBB3 = new VPBasicBlock();
-    VPBasicBlock *VPBB4 = new VPBasicBlock();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
     //     VPBB1
     //     /   \
@@ -270,8 +270,8 @@ TEST_F(VPBasicBlockTest, getPlan) {
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB1 = Plan.getEntry();
     // VPBasicBlock is the entry into the VPlan, followed by a region.
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
     VPBlockUtils::connectBlocks(R1BB1, R1BB2);
 
@@ -287,13 +287,13 @@ TEST_F(VPBasicBlockTest, getPlan) {
 
   {
     VPlan &Plan = getPlan();
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
     VPBlockUtils::connectBlocks(R1BB1, R1BB2);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
 
@@ -301,7 +301,7 @@ TEST_F(VPBasicBlockTest, getPlan) {
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(VPBB1, R2);
 
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
     VPBlockUtils::connectBlocks(R1, VPBB2);
     VPBlockUtils::connectBlocks(R2, VPBB2);
 
@@ -329,9 +329,9 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB1 = Plan.getEntry();
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
-    VPBasicBlock *VPBB3 = new VPBasicBlock();
-    VPBasicBlock *VPBB4 = new VPBasicBlock();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
     VPBlockUtils::connectBlocks(VPBB1, VPBB2);
     VPBlockUtils::connectBlocks(VPBB1, VPBB3);
@@ -368,10 +368,10 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //
     VPlan &Plan = getPlan();
     VPBasicBlock *VPBB0 = Plan.getEntry();
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPBasicBlock *R1BB3 = new VPBasicBlock();
-    VPBasicBlock *R1BB4 = new VPBasicBlock();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB4 = Plan.createVPBasicBlock("");
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1");
     R1BB2->setParent(R1);
     R1BB3->setParent(R1);
@@ -383,8 +383,8 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     // Cycle.
     VPBlockUtils::connectBlocks(R1BB3, R1BB3);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R1, R2);
@@ -467,14 +467,17 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //  VPBB2
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
-    VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
-    VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3");
     VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1");
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock(""
+                                                  "R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock(""
+                                                  "R2BB2");
+    VPBasicBlock *R2BB3 = Plan.createVPBasicBlock(""
+                                                  "R2BB3");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2");
     R2BB2->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
@@ -490,7 +493,8 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""
+                                                  "VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
     // Depth-first.
@@ -538,8 +542,8 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //   }
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
 
@@ -592,10 +596,11 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     //  VPBB2
     //
     VPlan &Plan = getPlan();
-    VPBasicBlock *R3BB1 = new VPBasicBlock("R3BB1");
+    VPBasicBlock *R3BB1 = Plan.createVPBasicBlock("R3BB1");
     VPRegionBlock *R3 = new VPRegionBlock(R3BB1, R3BB1, "R3");
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock(""
+                                                  "R2BB1");
     VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R3, "R2");
     R3->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R3);
@@ -604,7 +609,7 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
     R2->setParent(R1);
 
     VPBasicBlock *VPBB1 = Plan.getEntry();
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
@@ -674,7 +679,7 @@ TEST_F(VPBasicBlockTest, print) {
   VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
   VPInstruction *I3 = new VPInstruction(Instruction::Br, {I1, I2});
 
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(I2);
   VPBB1->appendRecipe(I3);
@@ -682,7 +687,7 @@ TEST_F(VPBasicBlockTest, print) {
 
   VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
   VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4});
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
   VPBB2->appendRecipe(I4);
   VPBB2->appendRecipe(I5);
   VPBB2->setName("bb2");
@@ -783,7 +788,7 @@ TEST_F(VPBasicBlockTest, printPlanWithVFsAndUFs) {
   VPBB0->appendRecipe(TC);
 
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(I1);
   VPBB1->setName("bb1");
 
@@ -1238,7 +1243,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
 TEST_F(VPRecipeTest, dumpRecipeInPlan) {
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB0 = Plan.getEntry();
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
 
@@ -1307,7 +1312,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) {
 TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB0 = Plan.getEntry();
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
 
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 174249a7e85e32..5a29e7ac0893b6 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -27,7 +27,7 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
   VPBB1->appendRecipe(UseI);
   VPBB1->appendRecipe(DefI);
 
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
   VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
@@ -51,7 +51,7 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
       new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
 
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(UseI);
   VPBB2->appendRecipe(CanIV);
@@ -85,9 +85,9 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
 
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPBasicBlock *VPBB3 = new VPBasicBlock();
-  VPBasicBlock *VPBB4 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB2->appendRecipe(CanIV);
@@ -125,7 +125,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
 
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(BranchOnCond2);
@@ -158,8 +158,8 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
 
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPBasicBlock *VPBB3 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB2->appendRecipe(CanIV);
@@ -187,7 +187,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
 TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
   VPlan &Plan = getPlan();
   VPBasicBlock *VPBB1 = Plan.getEntry();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
   VPInstruction *BranchOnCond =

>From 56a41810be9d062ee2fb204952431c88fa9422dc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 31 Dec 2024 13:18:44 +0000
Subject: [PATCH 15/24] !fixup update after merge

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  7 ----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 32 +++++++++----------
 .../version-stride-with-integer-casts.ll      |  3 +-
 .../vplan-printing-before-execute.ll          |  2 +-
 4 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c2dcb77ce81a91..96c0da20ee3b8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -521,13 +521,6 @@ VPBasicBlock *VPBasicBlock::clone() {
   return NewBlock;
 }
 
-VPBasicBlock *VPBasicBlock::clone() {
-  auto *NewBlock = getPlan()->createVPBasicBlock(getName());
-  for (VPRecipeBase &R : *this)
-    NewBlock->appendRecipe(R.clone());
-  return NewBlock;
-}
-
 void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
   LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
                     << " in BB:" << BB->getName() << '\n');
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8c481774987e69..06edb4a5f8e980 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -667,8 +667,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                          PredicatedScalarEvolution &PSE) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
-  VPBasicBlock *ExitingVPBB =
-      Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
   // Try to simplify the branch condition if TC <= VF * UF when preparing to
   // execute the plan for the main vector loop. We only do this if the
@@ -692,9 +692,9 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
       !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
     return;
 
-  SmallVector<VPValue *> PossiblyDead(Term->operands());
   Term->eraseFromParent();
-  auto *Header = cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
   if (any_of(Header->phis(),
              IsaPred<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>)) {
     LLVMContext &Ctx = SE.getContext();
@@ -709,27 +709,25 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
       P->eraseFromParent();
     }
 
-    VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor();
-    auto *Exiting =
-        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
-
-    auto *LoopRegion = Plan.getVectorLoopRegion();
-    VPBlockBase *Middle = LoopRegion->getSingleSuccessor();
-    VPBlockUtils::disconnectBlocks(Preheader, LoopRegion);
-    VPBlockUtils::disconnectBlocks(LoopRegion, Middle);
+    VPBlockBase *Preheader = Plan.getVectorPreheader();
+    VPBlockBase *Middle = Plan.getMiddleBlock();
+    VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
+    VPBlockUtils::disconnectBlocks(VectorRegion, Middle);
 
     Header->setParent(nullptr);
-    Exiting->setParent(nullptr);
+    ExitingVPBB->setParent(nullptr);
 
-    for (VPBlockBase *B : vp_depth_first_shallow(LoopRegion->getEntry())) {
+    for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) {
       if (isa<VPRegionBlock>(B))
         B->setParent(nullptr);
     }
     VPBlockUtils::connectBlocks(Preheader, Header);
-    VPBlockUtils::connectBlocks(Exiting, Middle);
+    VPBlockUtils::connectBlocks(ExitingVPBB, Middle);
+    simplifyRecipes(Plan, CanIVTy);
   }
-  for (VPValue *Op : PossiblyDead)
-    recursivelyDeleteDeadRecipes(Op);
+
+  VPlanTransforms::removeDeadRecipes(Plan);
+
   Plan.setVF(BestVF);
   Plan.setUF(BestUF);
   // TODO: Further simplifications are possible
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 55eab88950504a..791c995d88c146 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -499,9 +499,8 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[G_64]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 0, [[G_64]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 0, [[TMP8]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index d047d1da5ef735..beb305f23884e6 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -74,7 +74,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT:  IR   %n.vec = sub i64 %and, %n.mod.vf
 ; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1>
 ; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1>
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT:   vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1>

>From 303ce930d626804869027a48369346712a1f7bb9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 31 Dec 2024 17:42:42 +0000
Subject: [PATCH 16/24] !fixup adjust assert

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 96c0da20ee3b8e..34ca03aa5cc756 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -555,7 +555,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
 template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
   if (P && P->isReplicator()) {
     P = P->getParent();
-    assert(!cast<VPRegionBlock>(P)->isReplicator() &&
+    assert((!P || !cast<VPRegionBlock>(P)->isReplicator()) &&
            "unexpected nested replicate regions");
   }
   return P;

>From f9db2d0c10fad26f1af3e996e1a06138c65e599b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 31 Dec 2024 20:14:45 +0000
Subject: [PATCH 17/24] !fixup update extra tests.

---
 .../vector-loop-backedge-elimination.ll       | 608 ++++++++++--------
 1 file changed, 329 insertions(+), 279 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index 6bf8c4c4ebb8a4..3de42080b1842a 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -8,7 +8,131 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 ; Check if the vector loop condition can be simplified to true for a given
 ; VF/IC combination.
 define void @test_tc_less_than_16(ptr %A, i64 %N) {
+; VF8UF1-LABEL: define void @test_tc_less_than_16(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[AND]], 8
+; VF8UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @test_tc_less_than_16(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF2-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF2-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
 ;
+; VF16UF1-LABEL: define void @test_tc_less_than_16(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF16UF1-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT:    [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF16UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF16UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
 ;
 entry:
   %and = and i64 %N, 15
@@ -41,82 +165,68 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
 ; VF8UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF8UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF1:       [[VECTOR_BODY]]:
-; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE16:.*]] ]
-; VF8UF1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
-; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
-; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; VF8UF1-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; VF8UF1-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
 ; VF8UF1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
 ; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; VF8UF1:       [[PRED_STORE_IF]]:
-; VF8UF1-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP20]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
 ; VF8UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
 ; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; VF8UF1:       [[PRED_STORE_CONTINUE]]:
 ; VF8UF1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
-; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; VF8UF1:       [[PRED_STORE_IF3]]:
-; VF8UF1-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 1
-; VF8UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
 ; VF8UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
 ; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
-; VF8UF1-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
-; VF8UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; VF8UF1:       [[PRED_STORE_IF5]]:
-; VF8UF1-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 2
-; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP23]]
-; VF8UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF8UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
 ; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
-; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
-; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
 ; VF8UF1:       [[PRED_STORE_IF7]]:
-; VF8UF1-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 3
-; VF8UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP24]]
-; VF8UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF8UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
 ; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
 ; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
-; VF8UF1-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
-; VF8UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
 ; VF8UF1:       [[PRED_STORE_IF9]]:
-; VF8UF1-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF8UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP26]]
-; VF8UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF8UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
 ; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
 ; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
-; VF8UF1-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
-; VF8UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
 ; VF8UF1:       [[PRED_STORE_IF11]]:
-; VF8UF1-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 5
-; VF8UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP19]]
-; VF8UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF8UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
 ; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
 ; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
-; VF8UF1-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
-; VF8UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
 ; VF8UF1:       [[PRED_STORE_IF13]]:
-; VF8UF1-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 6
-; VF8UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP22]]
-; VF8UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
 ; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
 ; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
-; VF8UF1-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
-; VF8UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16]]
-; VF8UF1:       [[PRED_STORE_IF15]]:
-; VF8UF1-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 7
-; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP25]]
-; VF8UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
-; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
-; VF8UF1:       [[PRED_STORE_CONTINUE16]]:
-; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; VF8UF1-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF1:       [[MIDDLE_BLOCK]]:
 ; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF8UF1:       [[SCALAR_PH]]:
@@ -128,7 +238,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF8UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
 ; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; VF8UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF8UF1:       [[EXIT]]:
 ; VF8UF1-NEXT:    ret void
 ;
@@ -143,148 +253,125 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
 ; VF8UF2-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF8UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF2:       [[VECTOR_BODY]]:
-; VF8UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
-; VF8UF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
-; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
-; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; VF8UF2-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; VF8UF2-NEXT:    [[VEC_IV3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
-; VF8UF2-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> [[VEC_IV3]], [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
 ; VF8UF2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
 ; VF8UF2-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; VF8UF2:       [[PRED_STORE_IF]]:
-; VF8UF2-NEXT:    [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 2
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP5]], align 2
 ; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; VF8UF2:       [[PRED_STORE_CONTINUE]]:
 ; VF8UF2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
-; VF8UF2-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
-; VF8UF2:       [[PRED_STORE_IF6]]:
-; VF8UF2-NEXT:    [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 1
-; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
+; VF8UF2-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 3
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP7]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
-; VF8UF2:       [[PRED_STORE_CONTINUE7]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
 ; VF8UF2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
-; VF8UF2-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
-; VF8UF2:       [[PRED_STORE_IF8]]:
-; VF8UF2-NEXT:    [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 2
-; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
+; VF8UF2-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 4
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP9]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE9]]
-; VF8UF2:       [[PRED_STORE_CONTINUE9]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
 ; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
-; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
-; VF8UF2:       [[PRED_STORE_IF10]]:
-; VF8UF2-NEXT:    [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 3
-; VF8UF2-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]]
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 5
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP11]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE11]]
-; VF8UF2:       [[PRED_STORE_CONTINUE11]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
 ; VF8UF2-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
-; VF8UF2-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
-; VF8UF2:       [[PRED_STORE_IF12]]:
-; VF8UF2-NEXT:    [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF8UF2-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]]
+; VF8UF2-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 6
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP13]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE13]]
-; VF8UF2:       [[PRED_STORE_CONTINUE13]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
 ; VF8UF2-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
-; VF8UF2-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
-; VF8UF2:       [[PRED_STORE_IF14]]:
-; VF8UF2-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 5
-; VF8UF2-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
+; VF8UF2-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 7
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP15]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE15]]
-; VF8UF2:       [[PRED_STORE_CONTINUE15]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
 ; VF8UF2-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
-; VF8UF2-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
-; VF8UF2:       [[PRED_STORE_IF16]]:
-; VF8UF2-NEXT:    [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 6
-; VF8UF2-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
+; VF8UF2-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 8
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP17]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE17]]
-; VF8UF2:       [[PRED_STORE_CONTINUE17]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
 ; VF8UF2-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
-; VF8UF2-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
-; VF8UF2:       [[PRED_STORE_IF18]]:
-; VF8UF2-NEXT:    [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 7
-; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]]
+; VF8UF2-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 9
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP19]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE19]]
-; VF8UF2:       [[PRED_STORE_CONTINUE19]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
 ; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
-; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
-; VF8UF2:       [[PRED_STORE_IF20]]:
-; VF8UF2-NEXT:    [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 8
-; VF8UF2-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]]
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 10
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP21]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE21]]
-; VF8UF2:       [[PRED_STORE_CONTINUE21]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
 ; VF8UF2-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
-; VF8UF2-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
-; VF8UF2:       [[PRED_STORE_IF22]]:
-; VF8UF2-NEXT:    [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 9
-; VF8UF2-NEXT:    [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]]
+; VF8UF2-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 11
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP23]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE23]]
-; VF8UF2:       [[PRED_STORE_CONTINUE23]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
 ; VF8UF2-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
-; VF8UF2-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
-; VF8UF2:       [[PRED_STORE_IF24]]:
-; VF8UF2-NEXT:    [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 10
-; VF8UF2-NEXT:    [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP51]]
+; VF8UF2-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 12
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP25]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE25]]
-; VF8UF2:       [[PRED_STORE_CONTINUE25]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
 ; VF8UF2-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
-; VF8UF2-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
-; VF8UF2:       [[PRED_STORE_IF26]]:
-; VF8UF2-NEXT:    [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 11
-; VF8UF2-NEXT:    [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]]
+; VF8UF2-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 13
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP27]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE27]]
-; VF8UF2:       [[PRED_STORE_CONTINUE27]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
 ; VF8UF2-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
-; VF8UF2-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
-; VF8UF2:       [[PRED_STORE_IF28]]:
-; VF8UF2-NEXT:    [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12
-; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
+; VF8UF2-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 14
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP29]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE29]]
-; VF8UF2:       [[PRED_STORE_CONTINUE29]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
 ; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
-; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
-; VF8UF2:       [[PRED_STORE_IF30]]:
-; VF8UF2-NEXT:    [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 13
-; VF8UF2-NEXT:    [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]]
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 15
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP31]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE31]]
-; VF8UF2:       [[PRED_STORE_CONTINUE31]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
 ; VF8UF2-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
-; VF8UF2-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
-; VF8UF2:       [[PRED_STORE_IF32]]:
-; VF8UF2-NEXT:    [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14
-; VF8UF2-NEXT:    [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
+; VF8UF2-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 16
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP33]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE33]]
-; VF8UF2:       [[PRED_STORE_CONTINUE33]]:
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
 ; VF8UF2-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
-; VF8UF2-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
-; VF8UF2:       [[PRED_STORE_IF34]]:
-; VF8UF2-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 15
-; VF8UF2-NEXT:    [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]]
+; VF8UF2-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 17
 ; VF8UF2-NEXT:    store i16 0, ptr [[TMP35]], align 2
-; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE35]]
-; VF8UF2:       [[PRED_STORE_CONTINUE35]]:
-; VF8UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF2:       [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF8UF2:       [[SCALAR_PH]]:
@@ -296,7 +383,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF8UF2-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
 ; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; VF8UF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF8UF2:       [[EXIT]]:
 ; VF8UF2-NEXT:    ret void
 ;
@@ -311,146 +398,124 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
 ; VF16UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF16UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
 ; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF16UF1:       [[VECTOR_BODY]]:
-; VF16UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE32:.*]] ]
-; VF16UF1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
-; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
-; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
-; VF16UF1-NEXT:    [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
-; VF16UF1-NEXT:    [[TMP2:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT:    [[TMP2:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
 ; VF16UF1-NEXT:    [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
 ; VF16UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; VF16UF1:       [[PRED_STORE_IF]]:
-; VF16UF1-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
 ; VF16UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE]]:
 ; VF16UF1-NEXT:    [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1
-; VF16UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; VF16UF1:       [[PRED_STORE_IF3]]:
-; VF16UF1-NEXT:    [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 1
-; VF16UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF16UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
 ; VF16UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
-; VF16UF1-NEXT:    [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
-; VF16UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; VF16UF1:       [[PRED_STORE_IF5]]:
-; VF16UF1-NEXT:    [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 2
-; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF16UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
-; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
-; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
 ; VF16UF1:       [[PRED_STORE_IF7]]:
-; VF16UF1-NEXT:    [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 3
-; VF16UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF16UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
-; VF16UF1-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
-; VF16UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
 ; VF16UF1:       [[PRED_STORE_IF9]]:
-; VF16UF1-NEXT:    [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF16UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF16UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
-; VF16UF1-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
-; VF16UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
 ; VF16UF1:       [[PRED_STORE_IF11]]:
-; VF16UF1-NEXT:    [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 5
-; VF16UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF16UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
-; VF16UF1-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
-; VF16UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
 ; VF16UF1:       [[PRED_STORE_IF13]]:
-; VF16UF1-NEXT:    [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 6
-; VF16UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
-; VF16UF1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
-; VF16UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
 ; VF16UF1:       [[PRED_STORE_IF15]]:
-; VF16UF1-NEXT:    [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 7
-; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF16UF1-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP20]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
-; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
-; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
 ; VF16UF1:       [[PRED_STORE_IF17]]:
-; VF16UF1-NEXT:    [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 8
-; VF16UF1-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP20]], align 2
+; VF16UF1-NEXT:    [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP22]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
-; VF16UF1-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
-; VF16UF1-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
 ; VF16UF1:       [[PRED_STORE_IF19]]:
-; VF16UF1-NEXT:    [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 9
-; VF16UF1-NEXT:    [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP22]], align 2
+; VF16UF1-NEXT:    [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP24]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
-; VF16UF1-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
-; VF16UF1-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
 ; VF16UF1:       [[PRED_STORE_IF21]]:
-; VF16UF1-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 10
-; VF16UF1-NEXT:    [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP24]], align 2
+; VF16UF1-NEXT:    [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP26]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
-; VF16UF1-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
-; VF16UF1-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
 ; VF16UF1:       [[PRED_STORE_IF23]]:
-; VF16UF1-NEXT:    [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 11
-; VF16UF1-NEXT:    [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP26]], align 2
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP28]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
-; VF16UF1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
-; VF16UF1-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
 ; VF16UF1:       [[PRED_STORE_IF25]]:
-; VF16UF1-NEXT:    [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 12
-; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP28]], align 2
+; VF16UF1-NEXT:    [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP30]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
-; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
-; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
 ; VF16UF1:       [[PRED_STORE_IF27]]:
-; VF16UF1-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 13
-; VF16UF1-NEXT:    [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP30]], align 2
+; VF16UF1-NEXT:    [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP32]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
-; VF16UF1-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
-; VF16UF1-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP33]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
 ; VF16UF1:       [[PRED_STORE_IF29]]:
-; VF16UF1-NEXT:    [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 14
-; VF16UF1-NEXT:    [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP32]], align 2
+; VF16UF1-NEXT:    [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP34]], align 2
 ; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
 ; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
-; VF16UF1-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
-; VF16UF1-NEXT:    br i1 [[TMP33]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32]]
-; VF16UF1:       [[PRED_STORE_IF31]]:
-; VF16UF1-NEXT:    [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 15
-; VF16UF1-NEXT:    [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]]
-; VF16UF1-NEXT:    store i16 0, ptr [[TMP34]], align 2
-; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE32]]
-; VF16UF1:       [[PRED_STORE_CONTINUE32]]:
-; VF16UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; VF16UF1:       [[MIDDLE_BLOCK]]:
 ; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF16UF1:       [[SCALAR_PH]]:
@@ -462,7 +527,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF16UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
 ; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; VF16UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF16UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF16UF1:       [[EXIT]]:
 ; VF16UF1-NEXT:    ret void
 ;
@@ -506,7 +571,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF1-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
 ; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VF8UF1:       [[MIDDLE_BLOCK]]:
 ; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
@@ -521,7 +586,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
 ; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
 ; VF8UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF8UF1:       [[OUTER_LATCH]]:
 ; VF8UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
 ; VF8UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
@@ -542,20 +607,17 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF2:       [[VECTOR_BODY]]:
-; VF8UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP6]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
 ; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
 ; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
 ; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 0
 ; VF8UF2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
 ; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8
 ; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
 ; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
-; VF8UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF2:       [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
@@ -570,7 +632,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF2-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
 ; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
 ; VF8UF2-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF2-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF2-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF8UF2:       [[OUTER_LATCH]]:
 ; VF8UF2-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
 ; VF8UF2-NEXT:    [[C_2:%.*]] = call i1 @cond()
@@ -591,16 +653,13 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF16UF1:       [[VECTOR_BODY]]:
-; VF16UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF16UF1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP4]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
 ; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
 ; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 0
 ; VF16UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
 ; VF16UF1-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
-; VF16UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; VF16UF1:       [[MIDDLE_BLOCK]]:
 ; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
@@ -615,7 +674,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF16UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
 ; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
 ; VF16UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF16UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF16UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF16UF1:       [[OUTER_LATCH]]:
 ; VF16UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
 ; VF16UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
@@ -653,28 +712,19 @@ exit:
 ; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
 ;.
 ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; VF8UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF8UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ;.
 ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; VF16UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF16UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ;.
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}

>From cabc59194fcaeefceee8cbd2c87293fdcb7c57ae Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 31 Dec 2024 20:44:53 +0000
Subject: [PATCH 18/24] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 46 ++++++++++++-------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 88282dfbe1918b..65a3f046422620 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1025,31 +1025,31 @@ void VPlan::execute(VPTransformState *State) {
       if (isa<VPWidenPHIRecipe>(&R))
         continue;
 
-    if (isa<VPWidenInductionRecipe>(&R)) {
-      PHINode *Phi = nullptr;
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-        Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
-      } else {
-        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
-        assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-               "recipe generating only scalars should have been replaced");
-        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-        Phi = cast<PHINode>(GEP->getPointerOperand());
+      if (isa<VPWidenInductionRecipe>(&R)) {
+        PHINode *Phi = nullptr;
+        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+          Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+        } else {
+          auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+          assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+                 "recipe generating only scalars should have been replaced");
+          auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+          Phi = cast<PHINode>(GEP->getPointerOperand());
+        }
+
+        Phi->setIncomingBlock(1, VectorLatchBB);
+
+        // Move the last step to the end of the latch block. This ensures
+        // consistent placement of all induction updates.
+        Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+        // Use the steps for the last part as backedge value for the induction.
+        if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+          Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+        continue;
       }
 
-      Phi->setIncomingBlock(1, VectorLatchBB);
-
-      // Move the last step to the end of the latch block. This ensures
-      // consistent placement of all induction updates.
-      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
-
-      // Use the steps for the last part as backedge value for the induction.
-      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
-      continue;
-    }
-
       auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
       bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||
                          (isa<VPReductionPHIRecipe>(PhiR) &&

>From 057f2e921543d436d1187ba81afe2e6b6ae3f81b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 1 Jan 2025 21:55:33 +0000
Subject: [PATCH 19/24] [VPlan] Replace VPBBs with VPIRBBs during skeleton
 creation (NFC).

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 20 +++++++++++-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 32 +++----------------
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b03dd471bd9716..ee8997ec0fc8d4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2601,10 +2601,24 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
 
   AddedSafetyChecks = true;
 
-  introduceCheckBlockInVPlan(Plan, MemCheckBlock);
+  introduceCheckBlockInVPlan(Plan, VectorPHVPBB, MemCheckBlock);
   return MemCheckBlock;
 }
 
+/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
+/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
+/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
+/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+  for (auto &R : make_early_inc_range(*VPBB)) {
+    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
+    R.moveBefore(*IRVPBB, IRVPBB->end());
+  }
+
+  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
+  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
+}
 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
@@ -2615,9 +2629,11 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopMiddleBlock =
       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                  LI, nullptr, Twine(Prefix) + "middle.block");
+  replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
   LoopScalarPreHeader =
       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                  nullptr, Twine(Prefix) + "scalar.ph");
+  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
 }
 
 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -7727,6 +7743,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
+  VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
   if (VectorizingEpilogue)
@@ -7764,6 +7781,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.prepareToExecute(
       ILV.getTripCount(),
       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+  replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
 
   BestVPlan.execute(&State);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 65a3f046422620..8d4f3b9a945220 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -947,21 +947,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   }
 }
 
-/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
-/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
-/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
-/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
-  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
-  for (auto &R : make_early_inc_range(*VPBB)) {
-    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
-    R.moveBefore(*IRVPBB, IRVPBB->end());
-  }
-
-  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
-  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
-}
-
 /// Generate the code inside the preheader and body of the vectorized loop.
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
@@ -970,25 +955,14 @@ void VPlan::execute(VPTransformState *State) {
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
   BasicBlock *VectorPreHeader = State->CFG.PrevBB;
-  State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
+  //State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
 
   // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
   cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
   State->CFG.DTU.applyUpdates(
       {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
 
-  // Replace regular VPBB's for the vector preheader, middle and scalar
-  // preheader blocks with VPIRBasicBlocks wrapping their IR blocks. The IR
-  // blocks are created during skeleton creation, so we can only create the
-  // VPIRBasicBlocks now during VPlan execution rather than earlier during VPlan
-  // construction.
-  BasicBlock *MiddleBB = State->CFG.ExitBB;
-  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
-  replaceVPBBWithIRVPBB(getVectorPreheader(), VectorPreHeader);
-  replaceVPBBWithIRVPBB(getMiddleBlock(), MiddleBB);
-  replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh);
-
-  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
+   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
                     << ", UF=" << getUF() << '\n');
   setName("Final VPlan");
   LLVM_DEBUG(dump());
@@ -996,6 +970,8 @@ void VPlan::execute(VPTransformState *State) {
   // Disconnect the middle block from its single successor (the scalar loop
   // header) in both the CFG and DT. The branch will be recreated during VPlan
   // execution.
+ BasicBlock *MiddleBB = State->CFG.ExitBB;
+ BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
   auto *BrInst = new UnreachableInst(MiddleBB->getContext());
   BrInst->insertBefore(MiddleBB->getTerminator());
   MiddleBB->getTerminator()->eraseFromParent();

>From fe2c3a51b8cfee9d2d0a549b8f0512f02929e74f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 1 Jan 2025 22:07:44 +0000
Subject: [PATCH 20/24] [VPlan] Track VectorPH during skeleton creation.

---
 .../lib/Transforms/Vectorize/LoopVectorize.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c70ebcefabc5c6..c05d9dfe0ad0a2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -479,7 +479,9 @@ class InnerLoopVectorizer {
         AC(AC), ORE(ORE), VF(VecWidth),
         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
-        PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
+        PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+        VectorPHVPBB(
+            cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor())) {
     // Query this against the original loop and save it here because the profile
     // of the original loop header may change as the transformation happens.
     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -676,6 +678,8 @@ class InnerLoopVectorizer {
   BasicBlock *AdditionalBypassBlock = nullptr;
 
   VPlan &Plan;
+
+  VPBasicBlock *VectorPHVPBB;
 };
 
 /// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -2446,11 +2450,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the
 /// vector preheader and its predecessor, also connecting the new block to the
 /// scalar preheader.
-static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) {
+static void introduceCheckBlockInVPlan(VPlan &Plan, VPBlockBase *VectorPH,
+                                       BasicBlock *CheckIRBB) {
   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
   // FIXME: Cannot get the vector preheader at the moment if the vector loop
   // region has been removed.
-  VPBlockBase *VectorPH = Plan.getVectorPreheader();
   VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
   if (PreVectorPH->getNumSuccessors() != 1) {
     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
@@ -2546,7 +2550,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   LoopBypassBlocks.push_back(TCCheckBlock);
 
   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
-  introduceCheckBlockInVPlan(Plan, TCCheckBlock);
+  introduceCheckBlockInVPlan(Plan, VectorPHVPBB, TCCheckBlock);
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
@@ -2564,7 +2568,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   LoopBypassBlocks.push_back(SCEVCheckBlock);
   AddedSafetyChecks = true;
 
-  introduceCheckBlockInVPlan(Plan, SCEVCheckBlock);
+  introduceCheckBlockInVPlan(Plan, VectorPHVPBB, SCEVCheckBlock);
   return SCEVCheckBlock;
 }
 
@@ -7963,7 +7967,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
 
-  introduceCheckBlockInVPlan(Plan, TCCheckBlock);
+  introduceCheckBlockInVPlan(Plan, VectorPHVPBB, TCCheckBlock);
   return TCCheckBlock;
 }
 
@@ -8103,7 +8107,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
   Plan.setEntry(NewEntry);
   // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
 
-  introduceCheckBlockInVPlan(Plan, Insert);
+  introduceCheckBlockInVPlan(Plan, VectorPHVPBB, Insert);
   return Insert;
 }
 

>From 26c94b1c25784c788536f0070df7d2d775192f92 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 1 Jan 2025 22:08:39 +0000
Subject: [PATCH 21/24] !fixup restore original getVectorLoopRegion impl

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 25 ++++++-------------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  8 ++++--
 .../truncate-to-minimal-bitwidth-cost.ll      |  4 +--
 .../vector-loop-backedge-elimination.ll       | 12 ++++-----
 4 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8a67d7f4b75cbd..6899657a969d5f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1044,28 +1044,19 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   return getVectorLoopRegion()->cost(VF, Ctx);
 }
 
-VPBasicBlock *VPlan::getVectorPreheader() {
-  VPBlockBase *Current = getEntry()->getSuccessors().back();
-  while (Current->getNumSuccessors() == 2)
-    Current = Current->getSuccessors().back();
-  return cast<VPBasicBlock>(Current);
-}
-
-VPBasicBlock *VPlan::getVectorPreheader() const {
-  VPBlockBase *Current = getEntry()->getSuccessors().back();
-  while (Current->getNumSuccessors() == 2)
-    Current = Current->getSuccessors().back();
-  return cast<VPBasicBlock>(Current);
-}
-
 VPRegionBlock *VPlan::getVectorLoopRegion() {
   // TODO: Cache if possible.
-  return dyn_cast<VPRegionBlock>(getVectorPreheader()->getSingleSuccessor());
-  ;
+  for (VPBlockBase *B : vp_depth_first_shallow(getEntry()))
+    if (auto *R = dyn_cast<VPRegionBlock>(B))
+      return R->isReplicator() ? nullptr : R;
+  return nullptr;
 }
 
 const VPRegionBlock *VPlan::getVectorLoopRegion() const {
-  return dyn_cast<VPRegionBlock>(getVectorPreheader()->getSingleSuccessor());
+  for (const VPBlockBase *B : vp_depth_first_shallow(getEntry()))
+    if (auto *R = dyn_cast<VPRegionBlock>(B))
+      return R->isReplicator() ? nullptr : R;
+  return nullptr;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c15018dc49c1c1..16024fc69703a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3843,8 +3843,12 @@ class VPlan {
   const VPBasicBlock *getEntry() const { return Entry; }
 
   /// Returns the preheader of the vector loop region.
-  VPBasicBlock *getVectorPreheader();
-  VPBasicBlock *getVectorPreheader() const;
+  VPBasicBlock *getVectorPreheader() {
+    VPRegionBlock *VectorRegion = getVectorLoopRegion();
+    return VectorRegion
+               ? cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())
+               : nullptr;
+  }
 
   /// Returns the VPRegionBlock of the vector loop.
   VPRegionBlock *getVectorLoopRegion();
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 7a6676ff6ea1b3..f38aa11b5af873 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -163,12 +163,12 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]])
 ; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index 3de42080b1842a..fe5811e7e11591 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -165,10 +165,10 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
 ; VF8UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; VF8UF1-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
 ; VF8UF1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
 ; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
@@ -253,10 +253,10 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
 ; VF8UF2-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; VF8UF2-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
 ; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
 ; VF8UF2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
@@ -398,10 +398,10 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
 ; VF16UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
 ; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
 ; VF16UF1-NEXT:    [[TMP2:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
 ; VF16UF1-NEXT:    [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
 ; VF16UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]

>From f180edb07a037e812ccf49bb42dc4ae8281dd74c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 3 Jan 2025 20:39:26 +0000
Subject: [PATCH 22/24] !fixup address latest comments, thanks

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  21 ++--
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 104 ++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.h         |   5 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   4 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  61 +++++-----
 5 files changed, 103 insertions(+), 92 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4c42dc2b3a410e..e7baa3e556db82 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3012,6 +3012,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
                    getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
   }
 
+  // Don't apply optimizations below when no vector region remains, as they all
+  // require a vector loop at the moment.
   if (!State.Plan->getVectorLoopRegion())
     return;
 
@@ -7811,14 +7813,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
-  MDNode *OrigLoopID = OrigLoop->getLoopID();
+  if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
+    MDNode *OrigLoopID = OrigLoop->getLoopID();
 
-  std::optional<MDNode *> VectorizedLoopID =
-      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
-                                      LLVMLoopVectorizeFollowupVectorized});
+    std::optional<MDNode *> VectorizedLoopID =
+        makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+                                        LLVMLoopVectorizeFollowupVectorized});
 
-  if (auto *R = BestVPlan.getVectorLoopRegion()) {
-    VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock();
+    VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
     Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
     if (VectorizedLoopID) {
       L->setLoopID(*VectorizedLoopID);
@@ -7844,11 +7846,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  if (auto *R = BestVPlan.getVectorLoopRegion()) {
-    auto *ExitVPBB = cast<VPBasicBlock>(R->getSingleSuccessor());
-
+  if (BestVPlan.getVectorLoopRegion()) {
+    auto *MiddleVPBB = BestVPlan.getMiddleBlock();
     auto *MiddleTerm =
-        cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
     if (MiddleTerm->isConditional() &&
         hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
       // Assume that `Count % VectorTripCount` is equally distributed.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index ac6180379eded3..62b4490a528451 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -554,6 +554,8 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
 template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
   if (P && P->isReplicator()) {
     P = P->getParent();
+    // Multiple loop regions can be nested, but replicate regions can only be
+    // nested inside a loop region or must be outside any other region.
     assert((!P || !cast<VPRegionBlock>(P)->isReplicator()) &&
            "unexpected nested replicate regions");
   }
@@ -933,6 +935,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
+  assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
+         "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
@@ -986,54 +990,56 @@ void VPlan::execute(VPTransformState *State) {
   for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
-  if (auto *LoopRegion = getVectorLoopRegion()) {
-    VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
-    BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+  State->CFG.DTU.flush();
 
-    // Fix the latch value of canonical, reduction and first-order recurrences
-    // phis in the vector loop.
-    VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
-    for (VPRecipeBase &R : Header->phis()) {
-      // Skip phi-like recipes that generate their backedege values themselves.
-      if (isa<VPWidenPHIRecipe>(&R))
-        continue;
+  auto *LoopRegion = getVectorLoopRegion();
+  if (!LoopRegion)
+    return;
 
-      if (isa<VPWidenInductionRecipe>(&R)) {
-        PHINode *Phi = nullptr;
-        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-          Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
-        } else {
-          auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
-          assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-                 "recipe generating only scalars should have been replaced");
-          auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-          Phi = cast<PHINode>(GEP->getPointerOperand());
-        }
-
-        Phi->setIncomingBlock(1, VectorLatchBB);
-
-        // Move the last step to the end of the latch block. This ensures
-        // consistent placement of all induction updates.
-        Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
-
-        // Use the steps for the last part as backedge value for the induction.
-        if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-          Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
-        continue;
+  VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+
+  // Fix the latch value of canonical, reduction and first-order recurrences
+  // phis in the vector loop.
+  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+  for (VPRecipeBase &R : Header->phis()) {
+    // Skip phi-like recipes that generate their backedege values themselves.
+    if (isa<VPWidenPHIRecipe>(&R))
+      continue;
+
+    if (isa<VPWidenInductionRecipe>(&R)) {
+      PHINode *Phi = nullptr;
+      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+        Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+      } else {
+        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+        assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+               "recipe generating only scalars should have been replaced");
+        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+        Phi = cast<PHINode>(GEP->getPointerOperand());
       }
 
-      auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
-      bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||
-                         (isa<VPReductionPHIRecipe>(PhiR) &&
-                          cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
-      Value *Phi = State->get(PhiR, NeedsScalar);
-      Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
-      cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+      Phi->setIncomingBlock(1, VectorLatchBB);
+
+      // Move the last step to the end of the latch block. This ensures
+      // consistent placement of all induction updates.
+      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+      // Use the steps for the last part as backedge value for the induction.
+      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+      continue;
     }
-  }
 
-  State->CFG.DTU.flush();
+    auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+    bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||
+                       (isa<VPReductionPHIRecipe>(PhiR) &&
+                        cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+    Value *Phi = State->get(PhiR, NeedsScalar);
+    Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
+    cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+  }
 }
 
 InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
@@ -1399,13 +1405,17 @@ void VPlanIngredient::print(raw_ostream &O) const {
 
 #endif
 
-bool VPValue::isDefinedOutsideLoopRegions() const {
-
-  return !hasDefiningRecipe() ||
-         (!getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
-          getDefiningRecipe()->getParent()->getPlan()->getVectorLoopRegion());
+/// Returns true if there is a vector loop region and \p VPV is defined in a
+/// loop region.
+static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
+  const VPRecipeBase *DefR = VPV->getDefiningRecipe();
+  return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
+                  DefR->getParent()->getEnclosingLoopRegion());
 }
 
+bool VPValue::isDefinedOutsideLoopRegions() const {
+  return !isDefinedInsideLoopRegions(this);
+}
 void VPValue::replaceAllUsesWith(VPValue *New) {
   replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 16024fc69703a2..27ae4dff25e3e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3809,6 +3809,8 @@ class VPlan {
 
   ~VPlan();
 
+  /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+  /// EntryBlock must have no predecessors.
   void setEntry(VPBasicBlock *VPBB) {
     Entry = VPBB;
     VPBB->setPlan(this);
@@ -3842,7 +3844,8 @@ class VPlan {
   VPBasicBlock *getEntry() { return Entry; }
   const VPBasicBlock *getEntry() const { return Entry; }
 
-  /// Returns the preheader of the vector loop region.
+  /// Returns the preheader of the vector loop region, if one exists, or null
+  /// otherwise.
   VPBasicBlock *getVectorPreheader() {
     VPRegionBlock *VectorRegion = getVectorLoopRegion();
     return VectorRegion
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8b424ed055feff..77c08839dbfa95 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2376,9 +2376,7 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   // Replace the temporary unreachable terminator with a new conditional branch,
   // whose two destinations will be set later when they are created.
   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
-  assert((isa<UnreachableInst>(CurrentTerminator) ||
-          (isa<BranchInst>(CurrentTerminator) &&
-           !CurrentTerminator->getOperand(0))) &&
+  assert(isa<UnreachableInst>(CurrentTerminator) &&
          "Expected to replace unreachable terminator with conditional branch.");
   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
   CondBr->setSuccessor(0, nullptr);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c59bd874487c87..84a2146977f8bd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -794,15 +794,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
 }
 
-/// Try to simplify the recipes in \p Plan. If \p CanonicalIVTy is not nullptr,
-/// use it directly instead of retrieving the canonical IV type from the plan
-/// which may not exist any longer.
-static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy = nullptr) {
+/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all
+/// un-typed live-ins in VPTypeAnalysis.
+static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
-  Type *CanonicalIVType =
-      CanonicalIVTy ? CanonicalIVTy : Plan.getCanonicalIV()->getScalarType();
-  VPTypeAnalysis TypeInfo(CanonicalIVType);
+  VPTypeAnalysis TypeInfo(CanonicalIVTy);
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       simplifyRecipe(R, TypeInfo);
@@ -840,38 +837,40 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
       !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
     return;
 
+  // The vector loop region only executes once. If possible, completely remove
+  // the region, otherwise replace the terminator controlling the latch with
+  // (BranchOnCond true).
   Term->eraseFromParent();
   auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
   auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
-  if (any_of(Header->phis(),
-             IsaPred<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>)) {
-    LLVMContext &Ctx = SE.getContext();
-    auto *BOC = new VPInstruction(
-        VPInstruction::BranchOnCond,
-        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
-    ExitingVPBB->appendRecipe(BOC);
-  } else {
-    for (VPRecipeBase &R : make_early_inc_range(Header->phis())) {
-      auto *P = cast<VPHeaderPHIRecipe>(&R);
-      P->replaceAllUsesWith(P->getStartValue());
-      P->eraseFromParent();
+  if (all_of(
+          Header->phis(),
+          IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
+    for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
+      auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
+      HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
+      HeaderPhiR->eraseFromParent();
     }
 
-    VPBlockBase *Preheader = Plan.getVectorPreheader();
-    VPBlockBase *Middle = Plan.getMiddleBlock();
+    VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
+    VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
     VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
-    VPBlockUtils::disconnectBlocks(VectorRegion, Middle);
+    VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
 
-    Header->setParent(nullptr);
-    ExitingVPBB->setParent(nullptr);
+    for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
+      B->setParent(nullptr);
 
-    for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) {
-      if (isa<VPRegionBlock>(B))
-        B->setParent(nullptr);
-    }
     VPBlockUtils::connectBlocks(Preheader, Header);
-    VPBlockUtils::connectBlocks(ExitingVPBB, Middle);
+    VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
     simplifyRecipes(Plan, CanIVTy);
+  } else {
+    // The vector region contains header phis for which we cannot remove the
+    // loop region yet.
+    LLVMContext &Ctx = SE.getContext();
+    auto *BOC = new VPInstruction(
+        VPInstruction::BranchOnCond,
+        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
+    ExitingVPBB->appendRecipe(BOC);
   }
 
   VPlanTransforms::removeDeadRecipes(Plan);
@@ -1287,10 +1286,10 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   removeRedundantCanonicalIVs(Plan);
   removeRedundantInductionCasts(Plan);
 
-  simplifyRecipes(Plan);
+  simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
   legalizeAndOptimizeInductions(Plan);
   removeRedundantExpandSCEVRecipes(Plan);
-  simplifyRecipes(Plan);
+  simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
   removeDeadRecipes(Plan);
 
   createAndOptimizeReplicateRegions(Plan);

>From 71ff80a8c69eea1b84ba63a9e7b588410161997f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 3 Jan 2025 20:45:00 +0000
Subject: [PATCH 23/24] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e7baa3e556db82..b1aa1d198de2d7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7749,7 +7749,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
-  VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
+  VPBasicBlock *VectorPH =
+      cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
   if (VectorizingEpilogue)

>From 4f748274b9250ad4f6c85a40af3727b9adea283f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 4 Jan 2025 13:59:06 +0000
Subject: [PATCH 24/24] !fixup restore newline and move comment

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 +
 llvm/lib/Transforms/Vectorize/VPlan.h   | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 62b4490a528451..e804f81c36dba0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -447,6 +447,7 @@ void VPBasicBlock::connectToPredecessors(VPTransformState::CFGState &CFG) {
     CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
   }
 }
+
 void VPIRBasicBlock::execute(VPTransformState *State) {
   assert(getHierarchicalSuccessors().size() <= 2 &&
          "VPIRBasicBlock can have at most two successors at the moment!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 27ae4dff25e3e1..3e8021e2625c78 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3668,6 +3668,8 @@ class VPRegionBlock : public VPBlockBase {
   const VPBlockBase *getEntry() const { return Entry; }
   VPBlockBase *getEntry() { return Entry; }
 
+  /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+  /// EntryBlock must have no predecessors.
   void setEntry(VPBlockBase *EntryBlock) {
     assert(EntryBlock->getPredecessors().empty() &&
            "Entry block cannot have predecessors.");
@@ -3809,8 +3811,6 @@ class VPlan {
 
   ~VPlan();
 
-  /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
-  /// EntryBlock must have no predecessors.
   void setEntry(VPBasicBlock *VPBB) {
     Entry = VPBB;
     VPBB->setPlan(this);



More information about the llvm-commits mailing list