[llvm] [VPlan] Add VPIRInstruction, use for exit block live-outs. (PR #100735)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 14 12:45:49 PDT 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/100735

>From 2f441bb3ae998745e03326cc2e59ea7b54439ec4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 23 Jul 2024 15:25:01 +0100
Subject: [PATCH 1/7] [VPlan] Introduce explicit ExtractFromEnd recipes for
 live-outs.

Introduce explicit ExtractFromEnd recipes to extract the final values
for live-outs instead of implicitly extracting in VPLiveOut::fixPhi.

This is a follow-up to the recent changes of modeling extracts for
recurrences and consolidates live-out extract creation for fixed-order
recurrences at a single place: addLiveOutsForFirstOrderRecurrences.

It is also in preparation of replacing VPLiveOut with VPIRInstructions
wrapping the original scalar phis.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 130 ++++++++++++++++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   8 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  94 -------------
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |   9 +-
 ...-order-recurrence-sink-replicate-region.ll |   3 +-
 .../instruction-only-used-outside-of-loop.ll  |  20 ++-
 .../pr55167-fold-tail-live-out.ll             |   2 +-
 .../LoopVectorize/select-cmp-multiuse.ll      |   4 +-
 .../LoopVectorize/vplan-printing.ll           |  11 +-
 9 files changed, 144 insertions(+), 137 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f526808..61b29295d07b5e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8452,7 +8452,104 @@ static void addUsersInExitBlock(
            return P && Inductions.contains(P);
          })))
       continue;
-    Plan.addLiveOut(&ExitPhi, V);
+
+    auto MiddleVPBB =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+    VPBuilder B(MiddleVPBB);
+    if (auto *Terminator = MiddleVPBB->getTerminator()) {
+      auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+      assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+             "Condition expected in MiddleVPBB");
+      B.setInsertPoint(Condition ? Condition : Terminator);
+    }
+
+    VPValue *Ext;
+    if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
+            V->getDefiningRecipe())) {
+      // This is the second phase of vectorizing first-order recurrences. An
+      // overview of the transformation is described below. Suppose we have the
+      // following loop with some use after the loop of the last a[i-1],
+      //
+      //   for (int i = 0; i < n; ++i) {
+      //     t = a[i - 1];
+      //     b[i] = a[i] - t;
+      //   }
+      //   use t;
+      //
+      // There is a first-order recurrence on "a". For this loop, the shorthand
+      // scalar IR looks like:
+      //
+      //   scalar.ph:
+      //     s_init = a[-1]
+      //     br scalar.body
+      //
+      //   scalar.body:
+      //     i = phi [0, scalar.ph], [i+1, scalar.body]
+      //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+      //     s2 = a[i]
+      //     b[i] = s2 - s1
+      //     br cond, scalar.body, exit.block
+      //
+      //   exit.block:
+      //     use = lcssa.phi [s1, scalar.body]
+      //
+      // In this example, s1 is a recurrence because it's value depends on the
+      // previous iteration. In the first phase of vectorization, we created a
+      // vector phi v1 for s1. We now complete the vectorization and produce the
+      // shorthand vector IR shown below (for VF = 4, UF = 1).
+      //
+      //   vector.ph:
+      //     v_init = vector(..., ..., ..., a[-1])
+      //     br vector.body
+      //
+      //   vector.body
+      //     i = phi [0, vector.ph], [i+4, vector.body]
+      //     v1 = phi [v_init, vector.ph], [v2, vector.body]
+      //     v2 = a[i, i+1, i+2, i+3];
+      //     v3 = vector(v1(3), v2(0, 1, 2))
+      //     b[i, i+1, i+2, i+3] = v2 - v3
+      //     br cond, vector.body, middle.block
+      //
+      //   middle.block:
+      //     s_penultimate = v2(2) = v3(3)
+      //     s_resume = v2(3)
+      //     br cond, scalar.ph, exit.block
+      //
+      //   scalar.ph:
+      //     s_init' = phi [s_resume, middle.block], [s_init, otherwise]
+      //     br scalar.body
+      //
+      //   scalar.body:
+      //     i = phi [0, scalar.ph], [i+1, scalar.body]
+      //     s1 = phi [s_init', scalar.ph], [s2, scalar.body]
+      //     s2 = a[i]
+      //     b[i] = s2 - s1
+      //     br cond, scalar.body, exit.block
+      //
+      //   exit.block:
+      //     lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
+      //
+      // After execution completes the vector loop, we extract the next value of
+      // the recurrence (x) to use as the initial value in the scalar loop. This
+      // is modeled by ExtractFromEnd.
+      //
+      // Extract the penultimate value of the recurrence and update VPLiveOut
+      // users of the recurrence splice. Note that the extract of the final
+      // value used to resume in the scalar loop is created earlier during VPlan
+      // construction.
+      Ext =
+          B.createNaryOp(VPInstruction::ExtractFromEnd,
+                         {FOR->getBackedgeValue(),
+                          Plan.getOrAddLiveIn(ConstantInt::get(
+                              IntegerType::get(ExitBB->getContext(), 32), 2))},
+                         {}, "vector.recur.extract.for.phi");
+    } else {
+      Ext = B.createNaryOp(
+          VPInstruction::ExtractFromEnd,
+          {V, Plan.getOrAddLiveIn(ConstantInt::get(
+                  IntegerType::get(ExitBB->getContext(), 32), 1))});
+    }
+    Plan.addLiveOut(&ExitPhi, Ext);
   }
 }
 
@@ -8660,6 +8757,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // After here, VPBB should not be used.
   VPBB = nullptr;
 
+  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
+         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
+         "entry block must be set to a VPRegionBlock having a non-empty entry "
+         "VPBasicBlock");
+  RecipeBuilder.fixHeaderPhis();
+
+  addLiveOutsForFirstOrderRecurrences(*Plan);
+
   if (CM.requiresScalarEpilogue(Range)) {
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
@@ -8668,13 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
                         Legal->getInductionVars());
 
-  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
-         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
-         "entry block must be set to a VPRegionBlock having a non-empty entry "
-         "VPBasicBlock");
-  RecipeBuilder.fixHeaderPhis();
-
-  addLiveOutsForFirstOrderRecurrences(*Plan);
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8884,10 +8982,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     for (unsigned I = 0; I != Worklist.size(); ++I) {
       VPSingleDefRecipe *Cur = Worklist[I];
       for (VPUser *U : Cur->users()) {
-        auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
-        if (!UserRecipe) {
-          assert(isa<VPLiveOut>(U) &&
-                 "U must either be a VPSingleDef or VPLiveOut");
+        auto *UserRecipe = cast<VPSingleDefRecipe>(U);
+        if (!UserRecipe->getParent()->getParent()) {
+          assert(cast<VPInstruction>(U) &&
+                 cast<VPInstruction>(U)->getOpcode() ==
+                     VPInstruction::ExtractFromEnd &&
+                 "U must be an ExtractFromEnd VPInstruction");
           continue;
         }
         Worklist.insert(UserRecipe);
@@ -9105,8 +9205,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
     OrigExitingVPV->replaceUsesWithIf(
-        FinalReductionResult,
-        [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
+        FinalReductionResult, [](VPUser &User, unsigned) {
+          auto *R = dyn_cast<VPInstruction>(&User);
+          return R && R->getOpcode() == VPInstruction::ExtractFromEnd;
+        });
   }
 
   VPlanTransforms::clearReductionWrapFlags(*Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d3..798d178e5a963d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -194,9 +194,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
 
 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
   VPValue *ExitValue = getOperand(0);
-  auto Lane = vputils::isUniformAfterVectorization(ExitValue)
-                  ? VPLane::getFirstLane()
-                  : VPLane::getLastLaneForVF(State.VF);
   VPBasicBlock *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
@@ -207,10 +204,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
                        ? MiddleVPBB
                        : ExitingVPBB;
   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
-  // Set insertion point in PredBB in case an extract needs to be generated.
-  // TODO: Model extracts explicitly.
-  State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
-  Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+  Value *V = State.get(ExitValue, VPIteration(0, 0));
   if (Phi->getBasicBlockIndex(PredBB) != -1)
     Phi->setIncomingValueForBlock(PredBB, V);
   else
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e311..967939c4854c6d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -826,20 +826,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
       RecurrencePhis.push_back(FOR);
 
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
-  VPBuilder MiddleBuilder;
-  // Set insert point so new recipes are inserted before terminator and
-  // condition, if there is either the former or both.
-  if (auto *Term =
-          dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator())) {
-    if (auto *Cmp = dyn_cast<VPInstruction>(Term->getOperand(0)))
-      MiddleBuilder.setInsertPoint(Cmp);
-    else
-      MiddleBuilder.setInsertPoint(Term);
-  } else
-    MiddleBuilder.setInsertPoint(MiddleVPBB);
-
   for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
     SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
     VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -872,86 +858,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
     // Set the first operand of RecurSplice to FOR again, after replacing
     // all users.
     RecurSplice->setOperand(0, FOR);
-
-    // This is the second phase of vectorizing first-order recurrences. An
-    // overview of the transformation is described below. Suppose we have the
-    // following loop with some use after the loop of the last a[i-1],
-    //
-    //   for (int i = 0; i < n; ++i) {
-    //     t = a[i - 1];
-    //     b[i] = a[i] - t;
-    //   }
-    //   use t;
-    //
-    // There is a first-order recurrence on "a". For this loop, the shorthand
-    // scalar IR looks like:
-    //
-    //   scalar.ph:
-    //     s_init = a[-1]
-    //     br scalar.body
-    //
-    //   scalar.body:
-    //     i = phi [0, scalar.ph], [i+1, scalar.body]
-    //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
-    //     s2 = a[i]
-    //     b[i] = s2 - s1
-    //     br cond, scalar.body, exit.block
-    //
-    //   exit.block:
-    //     use = lcssa.phi [s1, scalar.body]
-    //
-    // In this example, s1 is a recurrence because it's value depends on the
-    // previous iteration. In the first phase of vectorization, we created a
-    // vector phi v1 for s1. We now complete the vectorization and produce the
-    // shorthand vector IR shown below (for VF = 4, UF = 1).
-    //
-    //   vector.ph:
-    //     v_init = vector(..., ..., ..., a[-1])
-    //     br vector.body
-    //
-    //   vector.body
-    //     i = phi [0, vector.ph], [i+4, vector.body]
-    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
-    //     v2 = a[i, i+1, i+2, i+3];
-    //     v3 = vector(v1(3), v2(0, 1, 2))
-    //     b[i, i+1, i+2, i+3] = v2 - v3
-    //     br cond, vector.body, middle.block
-    //
-    //   middle.block:
-    //     s_penultimate = v2(2) = v3(3)
-    //     s_resume = v2(3)
-    //     br cond, scalar.ph, exit.block
-    //
-    //   scalar.ph:
-    //     s_init' = phi [s_resume, middle.block], [s_init, otherwise]
-    //     br scalar.body
-    //
-    //   scalar.body:
-    //     i = phi [0, scalar.ph], [i+1, scalar.body]
-    //     s1 = phi [s_init', scalar.ph], [s2, scalar.body]
-    //     s2 = a[i]
-    //     b[i] = s2 - s1
-    //     br cond, scalar.body, exit.block
-    //
-    //   exit.block:
-    //     lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
-    //
-    // After execution completes the vector loop, we extract the next value of
-    // the recurrence (x) to use as the initial value in the scalar loop. This
-    // is modeled by ExtractFromEnd.
-    Type *IntTy = Plan.getCanonicalIV()->getScalarType();
-
-    // Extract the penultimate value of the recurrence and update VPLiveOut
-    // users of the recurrence splice. Note that the extract of the final value
-    // used to resume in the scalar loop is created earlier during VPlan
-    // construction.
-    auto *Penultimate = cast<VPInstruction>(MiddleBuilder.createNaryOp(
-        VPInstruction::ExtractFromEnd,
-        {FOR->getBackedgeValue(),
-         Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
-        {}, "vector.recur.extract.for.phi"));
-    RecurSplice->replaceUsesWithIf(
-        Penultimate, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
   }
   return true;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 16db6cf828af8a..f14ffe854a3a6b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -55,6 +55,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: middle.block:
 ; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
 ; IF-EVL-INLOOP-NEXT:   EMIT branch-on-cond ir<true>
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
@@ -64,7 +65,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
-; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; IF-EVL-INLOOP-NEXT: }
 ;
 
@@ -93,6 +94,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: middle.block:
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; NO-VP-OUTLOOP-NEXT:   EMIT branch-on-cond vp<[[BOC]]>
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -103,7 +105,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
-; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; NO-VP-OUTLOOP-NEXT: }
 ;
 
@@ -132,6 +134,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: middle.block:
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; NO-VP-INLOOP-NEXT:   EMIT branch-on-cond vp<[[BOC]]>
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -142,7 +145,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: scalar.ph:
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
-; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; NO-VP-INLOOP-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 06fbeafba31c01..9e49cf6b42c6b0 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -220,6 +220,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:  EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
 ; CHECK-NEXT:   EMIT branch-on-cond ir<true>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
@@ -230,8 +231,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]>
 ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
+; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index 5f5cd78dc2d30c..fcc6b7376d408b 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -34,7 +34,7 @@ define i32 @one_direct_branch(ptr %src) {
 ; CHECK-NEXT:    [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[XOR_LCSSA]]
@@ -205,16 +205,14 @@ define i32 @optimizable_trunc_used_outside() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -226,7 +224,7 @@ define i32 @optimizable_trunc_used_outside() {
 ; CHECK-NEXT:    [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[IV_TRUNC_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 72f8cf22cafa78..b79525bc3e440d 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -34,8 +34,8 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 182, [[MIDDLE_BLOCK]] ], [ 6, [[BB:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
index 9eb90099214e1c..b88e597e6bc8e8 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
@@ -916,13 +916,13 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
 ; CHECK-VF4-IC1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4-IC1-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK-VF4-IC1:       middle.block:
-; CHECK-VF4-IC1-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
 ; CHECK-VF4-IC1-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
 ; CHECK-VF4-IC1-NEXT:    [[TMP10:%.*]] = freeze i1 [[TMP9]]
 ; CHECK-VF4-IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i1 false, i1 true
 ; CHECK-VF4-IC1-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; CHECK-VF4-IC1-NEXT:    [[TMP12:%.*]] = freeze i1 [[TMP11]]
 ; CHECK-VF4-IC1-NEXT:    [[RDX_SELECT2:%.*]] = select i1 [[TMP12]], i1 true, i1 false
+; CHECK-VF4-IC1-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
 ; CHECK-VF4-IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4-IC1-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-VF4-IC1:       scalar.ph:
@@ -986,7 +986,6 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
 ; CHECK-VF4-IC2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4-IC2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK-VF4-IC2:       middle.block:
-; CHECK-VF4-IC2-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
 ; CHECK-VF4-IC2-NEXT:    [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]]
 ; CHECK-VF4-IC2-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]])
 ; CHECK-VF4-IC2-NEXT:    [[TMP17:%.*]] = freeze i1 [[TMP16]]
@@ -995,6 +994,7 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
 ; CHECK-VF4-IC2-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]])
 ; CHECK-VF4-IC2-NEXT:    [[TMP19:%.*]] = freeze i1 [[TMP18]]
 ; CHECK-VF4-IC2-NEXT:    [[RDX_SELECT6:%.*]] = select i1 [[TMP19]], i1 true, i1 false
+; CHECK-VF4-IC2-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
 ; CHECK-VF4-IC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4-IC2-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-VF4-IC2:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 3a664de748d2d7..f18ed825a6b886 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -154,6 +154,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
+; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -164,7 +165,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]>
+; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -435,6 +436,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]>
+; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -445,7 +447,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]>
+; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]>
 ; CHECK-NEXT:}
 
 entry:
@@ -654,6 +656,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -664,7 +667,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %lcssa = ir<%add>
+; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -1036,8 +1039,8 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT:   EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT:   Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
 ; CHECK-NEXT:   Live-out i16 %for.1 = vp<[[RESUME_P]]>
+; CHECK-NEXT:   Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
 ; CHECK-NEXT: }
 ;
 entry:

>From 540b9c5de195002470389e47dcfbe01c5e22a540 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 26 Jul 2024 12:25:29 +0100
Subject: [PATCH 2/7] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f3cc7059623571..89ea7bc5636734 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8773,7 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
                         Legal->getInductionVars());
 
-
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
   // bring the VPlan to its final state.

>From 0284675e468ef130fcf8d45b6715d2c6d3829706 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 22 Jul 2024 10:47:54 +0100
Subject: [PATCH 3/7] [VPlan] Add VPIRInstruction, use for exit block
 live-outs.

Add a new VPIRInstruction recipe to wrap existing IR instructions not to
be modified during execution, execept for PHIs. For PHIs, a single VPValue
operand is allowed, and it is used to add a new incoming value for the
single predecessor VPBB. Expect PHIs, VPIRInstructions cannot have any
operands.

Depends on https://github.com/llvm/llvm-project/pull/100658.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 239 +++++++++---------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  14 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  40 +++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  29 +++
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 +
 .../RISCV/riscv-vector-reverse.ll             |   4 +
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |   9 +-
 ...-order-recurrence-sink-replicate-region.ll |   2 +-
 llvm/test/Transforms/LoopVectorize/pr36983.ll |   2 +-
 llvm/test/Transforms/LoopVectorize/pr45259.ll |   5 +-
 .../vplan-printing-before-execute.ll          |   2 +
 .../LoopVectorize/vplan-printing.ll           |  13 +-
 12 files changed, 227 insertions(+), 133 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 89ea7bc5636734..fabddbbce139a0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8434,122 +8434,133 @@ static void addUsersInExitBlock(
   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
     return;
 
-  // Introduce VPUsers modeling the exit values.
-  for (PHINode &ExitPhi : ExitBB->phis()) {
-    Value *IncomingValue =
-        ExitPhi.getIncomingValueForBlock(ExitingBB);
-    VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
-    // Exit values for inductions are computed and updated outside of VPlan and
-    // independent of induction recipes.
-    // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
-    // live-outs.
-    if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
-         !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
-        isa<VPWidenPointerInductionRecipe>(V) ||
-        (isa<Instruction>(IncomingValue) &&
-         any_of(IncomingValue->users(), [&Inductions](User *U) {
-           auto *P = dyn_cast<PHINode>(U);
-           return P && Inductions.contains(P);
-         })))
+  auto *MiddleBlock = Plan.getVectorLoopRegion()->getSingleSuccessor();
+  for (VPIRBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPIRBasicBlock>(
+           vp_depth_first_shallow(MiddleBlock))) {
+    if (VPBB->getIRBasicBlock() != ExitBB)
       continue;
 
-    auto MiddleVPBB =
-        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
-    VPBuilder B(MiddleVPBB);
-    if (auto *Terminator = MiddleVPBB->getTerminator()) {
-      auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
-      assert((!Condition || Condition->getParent() == MiddleVPBB) &&
-             "Condition expected in MiddleVPBB");
-      B.setInsertPoint(Condition ? Condition : Terminator);
-    }
+    for (auto &R : *VPBB) {
+      auto *IR = dyn_cast<VPIRInstruction>(&R);
+      if (!IR)
+        continue;
+      auto *ExitPhi = dyn_cast<PHINode>(&IR->getInstruction());
+      if (!ExitPhi)
+        break;
+      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+      VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
+      // Exit values for inductions are computed and updated outside of VPlan
+      // and independent of induction recipes.
+      // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
+      // live-outs.
+      if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+           !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+          isa<VPWidenPointerInductionRecipe>(V) ||
+          (isa<Instruction>(IncomingValue) &&
+           any_of(IncomingValue->users(), [&Inductions](User *U) {
+             auto *P = dyn_cast<PHINode>(U);
+             return P && Inductions.contains(P);
+           })))
+        continue;
 
-    VPValue *Ext;
-    if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
-            V->getDefiningRecipe())) {
-      // This is the second phase of vectorizing first-order recurrences. An
-      // overview of the transformation is described below. Suppose we have the
-      // following loop with some use after the loop of the last a[i-1],
-      //
-      //   for (int i = 0; i < n; ++i) {
-      //     t = a[i - 1];
-      //     b[i] = a[i] - t;
-      //   }
-      //   use t;
-      //
-      // There is a first-order recurrence on "a". For this loop, the shorthand
-      // scalar IR looks like:
-      //
-      //   scalar.ph:
-      //     s_init = a[-1]
-      //     br scalar.body
-      //
-      //   scalar.body:
-      //     i = phi [0, scalar.ph], [i+1, scalar.body]
-      //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
-      //     s2 = a[i]
-      //     b[i] = s2 - s1
-      //     br cond, scalar.body, exit.block
-      //
-      //   exit.block:
-      //     use = lcssa.phi [s1, scalar.body]
-      //
-      // In this example, s1 is a recurrence because it's value depends on the
-      // previous iteration. In the first phase of vectorization, we created a
-      // vector phi v1 for s1. We now complete the vectorization and produce the
-      // shorthand vector IR shown below (for VF = 4, UF = 1).
-      //
-      //   vector.ph:
-      //     v_init = vector(..., ..., ..., a[-1])
-      //     br vector.body
-      //
-      //   vector.body
-      //     i = phi [0, vector.ph], [i+4, vector.body]
-      //     v1 = phi [v_init, vector.ph], [v2, vector.body]
-      //     v2 = a[i, i+1, i+2, i+3];
-      //     v3 = vector(v1(3), v2(0, 1, 2))
-      //     b[i, i+1, i+2, i+3] = v2 - v3
-      //     br cond, vector.body, middle.block
-      //
-      //   middle.block:
-      //     s_penultimate = v2(2) = v3(3)
-      //     s_resume = v2(3)
-      //     br cond, scalar.ph, exit.block
-      //
-      //   scalar.ph:
-      //     s_init' = phi [s_resume, middle.block], [s_init, otherwise]
-      //     br scalar.body
-      //
-      //   scalar.body:
-      //     i = phi [0, scalar.ph], [i+1, scalar.body]
-      //     s1 = phi [s_init', scalar.ph], [s2, scalar.body]
-      //     s2 = a[i]
-      //     b[i] = s2 - s1
-      //     br cond, scalar.body, exit.block
-      //
-      //   exit.block:
-      //     lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
-      //
-      // After execution completes the vector loop, we extract the next value of
-      // the recurrence (x) to use as the initial value in the scalar loop. This
-      // is modeled by ExtractFromEnd.
-      //
-      // Extract the penultimate value of the recurrence and update VPLiveOut
-      // users of the recurrence splice. Note that the extract of the final
-      // value used to resume in the scalar loop is created earlier during VPlan
-      // construction.
-      Ext =
-          B.createNaryOp(VPInstruction::ExtractFromEnd,
-                         {FOR->getBackedgeValue(),
-                          Plan.getOrAddLiveIn(ConstantInt::get(
-                              IntegerType::get(ExitBB->getContext(), 32), 2))},
-                         {}, "vector.recur.extract.for.phi");
-    } else {
-      Ext = B.createNaryOp(
-          VPInstruction::ExtractFromEnd,
-          {V, Plan.getOrAddLiveIn(ConstantInt::get(
-                  IntegerType::get(ExitBB->getContext(), 32), 1))});
+      auto MiddleVPBB =
+          cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+      VPBuilder B(MiddleVPBB);
+      if (auto *Terminator = MiddleVPBB->getTerminator()) {
+        auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+        assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+               "Condition expected in MiddleVPBB");
+        B.setInsertPoint(Condition ? Condition : Terminator);
+      }
+
+      VPValue *Ext;
+      if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
+              V->getDefiningRecipe())) {
+        // This is the second phase of vectorizing first-order recurrences. An
+        // overview of the transformation is described below. Suppose we have
+        // the following loop with some use after the loop of the last a[i-1],
+        //
+        //   for (int i = 0; i < n; ++i) {
+        //     t = a[i - 1];
+        //     b[i] = a[i] - t;
+        //   }
+        //   use t;
+        //
+        // There is a first-order recurrence on "a". For this loop, the
+        // shorthand scalar IR looks like:
+        //
+        //   scalar.ph:
+        //     s_init = a[-1]
+        //     br scalar.body
+        //
+        //   scalar.body:
+        //     i = phi [0, scalar.ph], [i+1, scalar.body]
+        //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+        //     s2 = a[i]
+        //     b[i] = s2 - s1
+        //     br cond, scalar.body, exit.block
+        //
+        //   exit.block:
+        //     use = lcssa.phi [s1, scalar.body]
+        //
+        // In this example, s1 is a recurrence because it's value depends on the
+        // previous iteration. In the first phase of vectorization, we created a
+        // vector phi v1 for s1. We now complete the vectorization and produce
+        // the shorthand vector IR shown below (for VF = 4, UF = 1).
+        //
+        //   vector.ph:
+        //     v_init = vector(..., ..., ..., a[-1])
+        //     br vector.body
+        //
+        //   vector.body
+        //     i = phi [0, vector.ph], [i+4, vector.body]
+        //     v1 = phi [v_init, vector.ph], [v2, vector.body]
+        //     v2 = a[i, i+1, i+2, i+3];
+        //     v3 = vector(v1(3), v2(0, 1, 2))
+        //     b[i, i+1, i+2, i+3] = v2 - v3
+        //     br cond, vector.body, middle.block
+        //
+        //   middle.block:
+        //     s_penultimate = v2(2) = v3(3)
+        //     s_resume = v2(3)
+        //     br cond, scalar.ph, exit.block
+        //
+        //   scalar.ph:
+        //     s_init' = phi [s_resume, middle.block], [s_init, otherwise]
+        //     br scalar.body
+        //
+        //   scalar.body:
+        //     i = phi [0, scalar.ph], [i+1, scalar.body]
+        //     s1 = phi [s_init', scalar.ph], [s2, scalar.body]
+        //     s2 = a[i]
+        //     b[i] = s2 - s1
+        //     br cond, scalar.body, exit.block
+        //
+        //   exit.block:
+        //     lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
+        //
+        // After execution completes the vector loop, we extract the next value
+        // of the recurrence (x) to use as the initial value in the scalar loop.
+        // This is modeled by ExtractFromEnd.
+        //
+        // Extract the penultimate value of the recurrence and update VPLiveOut
+        // users of the recurrence splice. Note that the extract of the final
+        // value used to resume in the scalar loop is created earlier during
+        // VPlan construction.
+        Ext = B.createNaryOp(
+            VPInstruction::ExtractFromEnd,
+            {FOR->getBackedgeValue(),
+             Plan.getOrAddLiveIn(ConstantInt::get(
+                 IntegerType::get(ExitBB->getContext(), 32), 2))},
+            {}, "vector.recur.extract.for.phi");
+      } else {
+        Ext = B.createNaryOp(
+            VPInstruction::ExtractFromEnd,
+            {V, Plan.getOrAddLiveIn(ConstantInt::get(
+                    IntegerType::get(ExitBB->getContext(), 32), 1))});
+      }
+      IR->addOperand(Ext);
     }
-    Plan.addLiveOut(&ExitPhi, Ext);
   }
 }
 
@@ -10156,7 +10167,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         // directly in VPlan.
         EpilogILV.setTripCount(MainILV.getTripCount());
         for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
-          auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
+          auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
+          if (!ExpandR)
+            continue;
           auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
           ExpandR->replaceAllUsesWith(ExpandedVal);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 58de6256900f09..b16527386521b3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -855,10 +855,18 @@ VPlan::~VPlan() {
     delete BackedgeTakenCount;
 }
 
+static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) {
+  auto *VPIRBB = new VPIRBasicBlock(BB);
+  for (Instruction &I :
+       make_range(BB->begin(), BB->getTerminator()->getIterator()))
+    VPIRBB->appendRecipe(new VPIRInstruction(I));
+  return VPIRBB;
+}
+
 VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
                                    bool RequiresScalarEpilogueCheck,
                                    bool TailFolded, Loop *TheLoop) {
-  VPIRBasicBlock *Entry = new VPIRBasicBlock(TheLoop->getLoopPreheader());
+  VPIRBasicBlock *Entry = createVPIRBasicBlockFor(TheLoop->getLoopPreheader());
   VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
   auto Plan = std::make_unique<VPlan>(Entry, VecPreheader);
   Plan->TripCount =
@@ -890,7 +898,7 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
-  auto *VPExitBlock = new VPIRBasicBlock(IRExitBlock);
+  auto *VPExitBlock = createVPIRBasicBlockFor(IRExitBlock);
   // The connection order corresponds to the operands of the conditional branch.
   VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
   VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -957,7 +965,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 /// predecessor, which is rewired to the new VPIRBasicBlock. All successors of
 /// VPBB, if any, are rewired to the new VPIRBasicBlock.
 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
-  VPIRBasicBlock *IRMiddleVPBB = new VPIRBasicBlock(IRBB);
+  VPIRBasicBlock *IRMiddleVPBB = createVPIRBasicBlockFor(IRBB);
   for (auto &R : make_early_inc_range(*VPBB))
     R.moveBefore(*IRMiddleVPBB, IRMiddleVPBB->end());
   VPBlockBase *PredVPBB = VPBB->getSinglePredecessor();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c9da5e5d38a6b8..2100734eead1bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -932,6 +932,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPScalarCastSC:
       return true;
     case VPRecipeBase::VPInterleaveSC:
+    case VPRecipeBase::VPIRInstructionSC:
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
@@ -1399,6 +1400,45 @@ class VPInstruction : public VPRecipeWithIRFlags {
   bool isSingleScalar() const;
 };
 
+/// A recipe to wrap on original IR instruction not to be modified during
+/// execution, execept for PHIs. For PHIs, a single VPValue operand is allowed,
+/// and it is used to add a new incoming value for the single predecessor VPBB.
+/// Expect PHIs, VPIRInstructions cannot have any operands.
+class VPIRInstruction : public VPRecipeBase {
+  Instruction &I;
+
+public:
+  VPIRInstruction(Instruction &I)
+      : VPRecipeBase(VPDef::VPIRInstructionSC, ArrayRef<VPValue *>()), I(I) {}
+
+  ~VPIRInstruction() override = default;
+
+  VP_CLASSOF_IMPL(VPDef::VPIRInstructionSC)
+
+  VPIRInstruction *clone() override {
+    auto *R = new VPIRInstruction(I);
+    for (auto *Op : operands())
+      R->addOperand(Op);
+    return R;
+  }
+
+  void execute(VPTransformState &State) override;
+
+  Instruction &getInstruction() { return I; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  bool usesScalars(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+};
+
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
 /// opcode and operands of the recipe. This recipe covers most of the
 /// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 798d178e5a963d..780eeff5a2b7b9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -856,6 +856,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPIRInstruction::execute(VPTransformState &State) {
+  assert(isa<VPIRBasicBlock>(getParent()) &&
+         "VPIRInstructions can only be placed in VPIRBasicBlocks");
+
+  if (getNumOperands() == 1 && isa<PHINode>(&I)) {
+    VPValue *ExitValue = getOperand(0);
+    auto Lane = vputils::isUniformAfterVectorization(ExitValue)
+                    ? VPLane::getFirstLane()
+                    : VPLane::getLastLaneForVF(State.VF);
+    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
+    // Set insertion point in PredBB in case an extract needs to be generated.
+    // TODO: Model extracts explicitly.
+    State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
+    Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+    auto *Phi = cast<PHINode>(&I);
+    Phi->addIncoming(V, PredBB);
+  }
+
+  State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
+                            VPSlotTracker &SlotTracker) const {
+  O << Indent << "IR " << I;
+}
+#endif
+
 void VPWidenCallRecipe::execute(VPTransformState &State) {
   assert(State.VF.isVector() && "not widening");
   Function *CalledScalarFn = getCalledScalarFunction();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 452c977106a773..33c221aa946f1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -339,6 +339,7 @@ class VPDef {
     VPBranchOnMaskSC,
     VPDerivedIVSC,
     VPExpandSCEVSC,
+    VPIRInstructionSC,
     VPInstructionSC,
     VPInterleaveSC,
     VPReductionEVLSC,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 3a14842580425b..97809e50d784ae 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -60,6 +60,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  vp<%2> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -144,6 +145,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  vp<%2> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -265,6 +267,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  vp<%2> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -349,6 +352,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  vp<%2> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index f14ffe854a3a6b..803a5e6ecb733e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -60,12 +60,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
+; IF-EVL-INLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ]
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
 ; IF-EVL-INLOOP-NEXT: No successors
-; IF-EVL-INLOOP-EMPTY:
-; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; IF-EVL-INLOOP-NEXT: }
 ;
 
@@ -100,12 +99,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: ir-bb<for.end>:
+; NO-VP-OUTLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ]
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
 ; NO-VP-OUTLOOP-NEXT: No successors
-; NO-VP-OUTLOOP-EMPTY:
-; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; NO-VP-OUTLOOP-NEXT: }
 ;
 
@@ -140,12 +138,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: ir-bb<for.end>:
+; NO-VP-INLOOP-NEXT:   IR %add.lcssa = phi i32 [ %add, %for.body ]
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
 ; NO-VP-INLOOP-NEXT: No successors
-; NO-VP-INLOOP-EMPTY:
-; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; NO-VP-INLOOP-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 9e49cf6b42c6b0..993b4b2a948e03 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -225,6 +225,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT:   IR %res = phi i32 [ %and.red.next, %loop ]
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -232,7 +233,6 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
-; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/pr36983.ll b/llvm/test/Transforms/LoopVectorize/pr36983.ll
index f4da4f355eb0d1..7e38d60b6f5819 100644
--- a/llvm/test/Transforms/LoopVectorize/pr36983.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr36983.ll
@@ -4,7 +4,7 @@
 
 ; CHECK-LABEL: bb1.bb3_crit_edge:
 ; CHECK: %_tmp133.lcssa1 = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ]
-; CHECK: %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi1, %middle.block ]
 
 define void @f1() {
 bb2.lr.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index dcc8f3f2f9d8fe..008971697775e4 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -14,11 +14,12 @@ define i8 @widget(ptr %arr, i8 %t9) {
 ; CHECK-NEXT:    br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]]
 ; CHECK:       for.preheader:
 ; CHECK-NEXT:    [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ]
-; CHECK-NEXT:    [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA2]] to i32
+; CHECK-NEXT:    [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index e4984f52ee6ff9..431d14be45857b 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -14,6 +14,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT:   IR %and = and i64 %N, 15
 ; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -55,6 +56,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT:   IR %and = and i64 %N, 15
 ; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index f18ed825a6b886..9b7051630347e4 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -160,12 +160,11 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.end>
+; CHECK-NEXT:  IR %red.next.lcssa = phi float [ %red.next, %for.body ]
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -442,12 +441,11 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.end>
+; CHECK-NEXT:   IR %muladd.lcssa = phi float [ %muladd, %for.body ]
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]>
 ; CHECK-NEXT:}
 
 entry:
@@ -574,6 +572,8 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT:   IR %div = udiv i64 %y, 492802768830814060
+; CHECK-NEXT:   IR %inc = add i64 %div, 1
 ; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060))<nuw><nsw> /u (1 + (%y /u 492802768830814060))<nuw><nsw>))<nuw><nsw>
 ; CHECK-NEXT:   EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060))<nuw><nsw>
 ; CHECK-NEXT: No successors
@@ -662,12 +662,11 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT:   IR %lcssa = phi i32 [ %add, %loop ]
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -1033,6 +1032,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT:   IR %for.1.lcssa = phi i16 [ %for.1, %loop ]
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -1040,7 +1040,6 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   Live-out i16 %for.1 = vp<[[RESUME_P]]>
-; CHECK-NEXT:   Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
 ; CHECK-NEXT: }
 ;
 entry:

>From de8674195cdc97847b6e262c1d1f38be039b9545 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 25 Aug 2024 14:54:42 +0100
Subject: [PATCH 4/7] !fixup address comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 80 ++++++++-----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++-
 ...-order-recurrence-sink-replicate-region.ll |  1 -
 .../LoopVectorize/vplan-printing.ll           |  1 -
 4 files changed, 41 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b51eea223d8021..e0f2046a263d59 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8592,7 +8592,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 // are modeled in VPlan. Some exiting values are not modeled explicitly yet and
 // won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe,
 // VPWidenPointerInductionRecipe and induction increments.
-static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
+static MapVector<VPIRInstruction *, VPValue *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
   auto MiddleVPBB =
@@ -8602,13 +8602,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
   // from scalar loop only.
   if (MiddleVPBB->getNumSuccessors() != 2)
     return {};
-  MapVector<PHINode *, VPValue *> ExitingValuesToFix;
-  BasicBlock *ExitBB =
-      cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
+  MapVector<VPIRInstruction *, VPValue *> ExitingValuesToFix;
+  VPBasicBlock *ExitVPBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
-  for (PHINode &ExitPhi : ExitBB->phis()) {
-    Value *IncomingValue =
-        ExitPhi.getIncomingValueForBlock(ExitingBB);
+  for (VPRecipeBase &R : *ExitVPBB) {
+    auto *IR = dyn_cast<VPIRInstruction>(&R);
+    if (!IR)
+      continue;
+    auto *ExitPhi = dyn_cast<PHINode>(&IR->getInstruction());
+    if (!ExitPhi)
+      break;
+    Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
     VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
     // Exit values for inductions are computed and updated outside of VPlan and
     // independent of induction recipes.
@@ -8623,16 +8627,15 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
            return P && Inductions.contains(P);
          })))
       continue;
-    ExitingValuesToFix.insert({&ExitPhi, V});
+    ExitingValuesToFix.insert({IR, V});
   }
   return ExitingValuesToFix;
 }
 
 // Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry
 // in \p ExitingValuesToFix.
-static void
-addUsersInExitBlock(VPlan &Plan,
-                    MapVector<PHINode *, VPValue *> &ExitingValuesToFix) {
+static void addUsersInExitBlock(
+    VPlan &Plan, MapVector<VPIRInstruction *, VPValue *> &ExitingValuesToFix) {
   if (ExitingValuesToFix.empty())
     return;
 
@@ -8650,12 +8653,12 @@ addUsersInExitBlock(VPlan &Plan,
   }
 
   // Introduce VPUsers modeling the exit values.
-  for (const auto &[ExitPhi, V] : ExitingValuesToFix) {
+  for (const auto &[IR, V] : ExitingValuesToFix) {
     VPValue *Ext = B.createNaryOp(
         VPInstruction::ExtractFromEnd,
         {V, Plan.getOrAddLiveIn(ConstantInt::get(
                 IntegerType::get(ExitBB->getContext(), 32), 1))});
-    Plan.addLiveOut(ExitPhi, Ext);
+    IR->addOperand(Ext);
   }
 }
 
@@ -8668,7 +8671,7 @@ addUsersInExitBlock(VPlan &Plan,
 /// 2. Feed the penultimate value of recurrences to their LCSSA phi users in
 ///    the original exit block using a VPLiveOut.
 static void addLiveOutsForFirstOrderRecurrences(
-    VPlan &Plan, MapVector<PHINode *, VPValue *> &ExitingValuesToFix) {
+    VPlan &Plan, MapVector<VPIRInstruction *, VPValue *> &ExitingValuesToFix) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
 
   // Start by finding out if middle block branches to scalar preheader, which is
@@ -8802,17 +8805,14 @@ static void addLiveOutsForFirstOrderRecurrences(
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
-    if (ExitingValuesToFix.empty())
-      continue;
-    for (User *U : FORPhi->users()) {
-      auto *UI = cast<Instruction>(U);
-      if (UI->getParent() != ExitBB)
+    for (const auto &[IR, V] : ExitingValuesToFix) {
+      if (V != FOR)
         continue;
       VPValue *Ext = MiddleBuilder.createNaryOp(
           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
           "vector.recur.extract.for.phi");
-      Plan.addLiveOut(cast<PHINode>(UI), Ext);
-      ExitingValuesToFix.erase(cast<PHINode>(UI));
+      IR->addOperand(Ext);
+      ExitingValuesToFix.erase(IR);
     }
   }
 }
@@ -8974,20 +8974,13 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock(
-      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+  MapVector<VPIRInstruction *, VPValue *> ExitingValuesToFix =
+      collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
+                              Legal->getInductionVars());
 
   addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix);
   addUsersInExitBlock(*Plan, ExitingValuesToFix);
 
-  if (CM.requiresScalarEpilogue(Range)) {
-    // No edge from the middle block to the unique exit block has been inserted
-    // and there is nothing to fix from vector loop; phis should have incoming
-    // from scalar loop only.
-  } else
-    addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
-                        Legal->getInductionVars());
-
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
   // bring the VPlan to its final state.
@@ -9198,13 +9191,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       VPSingleDefRecipe *Cur = Worklist[I];
       for (VPUser *U : Cur->users()) {
         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
-        if (!UserRecipe->getParent()->getParent()) {
-          assert(cast<VPInstruction>(U) &&
-                 cast<VPInstruction>(U)->getOpcode() ==
-                     VPInstruction::ExtractFromEnd &&
-/*        if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {*/
-          /*assert(match(U, m_Binary<VPInstruction::ExtractFromEnd>(*/
-                              /*m_VPValue(), m_VPValue())) &&*/
+        if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
+          assert(match(U, m_Binary<VPInstruction::ExtractFromEnd>(
+                              m_VPValue(), m_VPValue())) &&
                  "U must be an ExtractFromEnd VPInstruction");
           continue;
         }
@@ -9422,16 +9411,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     auto *FinalReductionResult = new VPInstruction(
         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
-    OrigExitingVPV->replaceUsesWithIf(
-        FinalReductionResult, [](VPUser &User, unsigned) {
-          auto *R = dyn_cast<VPInstruction>(&User);
-          return R && R->getOpcode() == VPInstruction::ExtractFromEnd;
-        });
-/*    OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,*/
-                                                               /*unsigned) {*/
-      /*return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue(),*/
-                                                                  /*m_VPValue()));*/
-    /*});*/
+    OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,
+                                                               unsigned) {
+      return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue(),
+                                                                  m_VPValue()));
+    });
   }
 
   VPlanTransforms::clearReductionWrapFlags(*Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 62d408e786a4f2..1be9a1af94e244 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -860,7 +860,9 @@ void VPIRInstruction::execute(VPTransformState &State) {
   assert(isa<VPIRBasicBlock>(getParent()) &&
          "VPIRInstructions can only be placed in VPIRBasicBlocks");
 
-  if (getNumOperands() == 1 && isa<PHINode>(&I)) {
+  assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
+         "Only PHINodes can have extra operands");
+  if (getNumOperands() == 1) {
     VPValue *ExitValue = getOperand(0);
     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                     ? VPLane::getFirstLane()
@@ -882,6 +884,12 @@ void VPIRInstruction::execute(VPTransformState &State) {
 void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
   O << Indent << "IR " << I;
+
+  if (getNumOperands() != 0) {
+    O << " (extra operands: ";
+    printOperands(O, SlotTracker);
+    O << ")";
+  }
 }
 #endif
 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index f1fb2f7f58e770..993b4b2a948e03 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -233,7 +233,6 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
-; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 22f3c23b47e602..9b7051630347e4 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -1040,7 +1040,6 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   Live-out i16 %for.1 = vp<[[RESUME_P]]>
-; CHECK-NEXT:   Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
 ; CHECK-NEXT: }
 ;
 entry:

>From aab14636f2146153c064a3597e9f60d5627669b9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 13 Sep 2024 21:56:14 +0100
Subject: [PATCH 5/7] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 29 ++++++++++---------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  8 ++---
 .../Transforms/Vectorize/VPlanVerifier.cpp    | 11 +++++++
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |  6 ++--
 ...-order-recurrence-sink-replicate-region.ll |  2 +-
 .../LoopVectorize/vplan-printing.ll           |  8 ++---
 6 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6232a7b1c16df8..57e4da65626956 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8638,10 +8638,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
 }
 
-// Collect (ExitPhi, ExitingValue) pairs phis in the original exit block that
-// are modeled in VPlan. Some exiting values are not modeled explicitly yet and
-// won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe,
-// VPWidenPointerInductionRecipe and induction increments.
+// Collect (VPIRInstruction, ExitingValue) pairs for phis in the original exit
+// block that are modeled in VPlan. Some exiting values are not modeled
+// explicitly yet and won't be included. Those are un-truncated
+// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
+// increments.
 static MapVector<VPIRInstruction *, VPValue *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
@@ -8656,10 +8657,10 @@ static MapVector<VPIRInstruction *, VPValue *> collectUsersInExitBlock(
   VPBasicBlock *ExitVPBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
   for (VPRecipeBase &R : *ExitVPBB) {
-    auto *IR = dyn_cast<VPIRInstruction>(&R);
-    if (!IR)
+    auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
+    if (!ExitIRI)
       continue;
-    auto *ExitPhi = dyn_cast<PHINode>(&IR->getInstruction());
+    auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
     if (!ExitPhi)
       break;
     Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
@@ -8678,7 +8679,7 @@ static MapVector<VPIRInstruction *, VPValue *> collectUsersInExitBlock(
            return P && Inductions.contains(P);
          })))
       continue;
-    ExitingValuesToFix.insert({IR, V});
+    ExitingValuesToFix.insert({ExitIRI, V});
   }
   return ExitingValuesToFix;
 }
@@ -8697,10 +8698,10 @@ static void addUsersInExitBlock(
   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 
   // Introduce VPUsers modeling the exit values.
-  for (const auto &[IR, V] : ExitingValuesToFix) {
+  for (const auto &[ExitIRI, V] : ExitingValuesToFix) {
     // Pass live-in values used by exit phis directly through to the live-out.
     if (V->isLiveIn()) {
-      IR->addOperand(V);
+      ExitIRI->addOperand(V);
       continue;
     }
 
@@ -8708,7 +8709,7 @@ static void addUsersInExitBlock(
         VPInstruction::ExtractFromEnd,
         {V, Plan.getOrAddLiveIn(ConstantInt::get(
                 IntegerType::get(ExitBB->getContext(), 32), 1))});
-    IR->addOperand(Ext);
+    ExitIRI->addOperand(Ext);
   }
 }
 
@@ -8846,14 +8847,14 @@ static void addLiveOutsForFirstOrderRecurrences(
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
-    for (const auto &[IR, V] : ExitingValuesToFix) {
+    for (const auto &[ExitIRI, V] : ExitingValuesToFix) {
       if (V != FOR)
         continue;
       VPValue *Ext = MiddleBuilder.createNaryOp(
           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
           "vector.recur.extract.for.phi");
-      IR->addOperand(Ext);
-      ExitingValuesToFix.erase(IR);
+      ExitIRI->addOperand(Ext);
+      ExitingValuesToFix.erase(ExitIRI);
     }
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8a3242e1648657..9068ccf519c55c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -868,9 +868,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 void VPIRInstruction::execute(VPTransformState &State) {
-  assert(isa<VPIRBasicBlock>(getParent()) &&
-         "VPIRInstructions can only be placed in VPIRBasicBlocks");
-
   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
          "Only PHINodes can have extra operands");
   if (getNumOperands() == 1) {
@@ -888,6 +885,8 @@ void VPIRInstruction::execute(VPTransformState &State) {
     Phi->addIncoming(V, PredBB);
   }
 
+  // Advance the insert point after the wrapped IR instruction. This allows
+  // interleaving VPIRInstructions and other recipes.
   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
 }
 
@@ -897,7 +896,8 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
   O << Indent << "IR " << I;
 
   if (getNumOperands() != 0) {
-    O << " (extra operands: ";
+    assert(getNumOperands() == 1 && "can have at most 1 operand");
+    O << " (extra operand: ";
     printOperands(O, SlotTracker);
     O << ")";
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index dfddb5b45f623d..ee6f035f1f494d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -126,6 +126,17 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
     RecipeNumbering[&R] = Cnt++;
 
   for (const VPRecipeBase &R : *VPBB) {
+    if (auto *IRI = dyn_cast<VPIRInstruction>(&R)) {
+      if (!isa<VPIRBasicBlock>(IRI->getParent())) {
+        errs() << "VPIRInstructions ";
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+        IRI->dump();
+        errs() << " ";
+#endif
+        errs() << "not in a VPIRBasicBlock!\n";
+        return false;
+      }
+    }
     for (const VPValue *V : R.definedValues()) {
       for (const VPUser *U : V->users()) {
         auto *UI = dyn_cast<VPRecipeBase>(U);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 803a5e6ecb733e..11405a1c911584 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -60,7 +60,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
-; IF-EVL-INLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ]
+; IF-EVL-INLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>)
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
@@ -99,7 +99,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: ir-bb<for.end>:
-; NO-VP-OUTLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ]
+; NO-VP-OUTLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>)
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
@@ -138,7 +138,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: ir-bb<for.end>:
-; NO-VP-INLOOP-NEXT:   IR %add.lcssa = phi i32 [ %add, %for.body ]
+; NO-VP-INLOOP-NEXT:   IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>)
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index b0533997e30a00..45545feffd3257 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -227,7 +227,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT:   IR %res = phi i32 [ %and.red.next, %loop ]
+; CHECK-NEXT:   IR %res = phi i32 [ %and.red.next, %loop ] (extra operand: vp<[[RED_EX]]>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index b0dc09e79f864e..26974c2307065a 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -161,7 +161,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.end>
-; CHECK-NEXT:  IR %red.next.lcssa = phi float [ %red.next, %for.body ]
+; CHECK-NEXT:  IR %red.next.lcssa = phi float [ %red.next, %for.body ] (extra operand: vp<[[RED_EX]]>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -443,7 +443,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.end>
-; CHECK-NEXT:   IR %muladd.lcssa = phi float [ %muladd, %for.body ]
+; CHECK-NEXT:   IR %muladd.lcssa = phi float [ %muladd, %for.body ] (extra operand: vp<[[RED_EX]]>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -666,7 +666,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT:   IR %lcssa = phi i32 [ %add, %loop ]
+; CHECK-NEXT:   IR %lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[EXIT]]>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -1036,7 +1036,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT:   IR %for.1.lcssa = phi i16 [ %for.1, %loop ]
+; CHECK-NEXT:   IR %for.1.lcssa = phi i16 [ %for.1, %loop ] (extra operand: vp<[[FOR_RESULT]]>)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph

>From 3dc696bad75a3b741fdfa24e2d48533b05177c82 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 14 Sep 2024 20:41:09 +0100
Subject: [PATCH 6/7] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 63 ++++++++-----------
 .../Transforms/Vectorize/VPlanVerifier.cpp    | 14 ++---
 2 files changed, 32 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2fea600a298828..feac26b6077623 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8630,12 +8630,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
 }
 
-// Collect (VPIRInstruction, ExitingValue) pairs for phis in the original exit
-// block that are modeled in VPlan. Some exiting values are not modeled
-// explicitly yet and won't be included. Those are un-truncated
-// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
-// increments.
-static MapVector<VPIRInstruction *, VPValue *> collectUsersInExitBlock(
+// Collect VPIRInstructions for phis in the original exit block that are modeled in VPlan and add the exiting VPValue as operand. Some exiting values are not modeled explicitly yet and won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction increments.
+static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
   auto *MiddleVPBB =
@@ -8645,7 +8641,7 @@ static MapVector<VPIRInstruction *, VPValue *> collectUsersInExitBlock(
   // from scalar loop only.
   if (MiddleVPBB->getNumSuccessors() != 2)
     return {};
-  MapVector<VPIRInstruction *, VPValue *> ExitingValuesToFix;
+  SetVector<VPIRInstruction *> ExitUsersToFix;
   VPBasicBlock *ExitVPBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
   for (VPRecipeBase &R : *ExitVPBB) {
@@ -8671,16 +8667,16 @@ static MapVector<VPIRInstruction *, VPValue *> collectUsersInExitBlock(
            return P && Inductions.contains(P);
          })))
       continue;
-    ExitingValuesToFix.insert({ExitIRI, V});
+    ExitUsersToFix.insert(ExitIRI);
+    ExitIRI->addOperand(V);
   }
-  return ExitingValuesToFix;
+  return ExitUsersToFix;
 }
 
-// Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry
-// in \p ExitingValuesToFix.
+// Add exit values to \p Plan. Extracts are added for each entry in \p ExitUsersToFix if needed and their operands are updated.
 static void addUsersInExitBlock(
-    VPlan &Plan, MapVector<VPIRInstruction *, VPValue *> &ExitingValuesToFix) {
-  if (ExitingValuesToFix.empty())
+    VPlan &Plan, const SetVector<VPIRInstruction *> &ExitUsersToFix) {
+  if (ExitUsersToFix.empty())
     return;
 
   auto *MiddleVPBB =
@@ -8689,19 +8685,18 @@ static void addUsersInExitBlock(
       cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 
-  // Introduce VPUsers modeling the exit values.
-  for (const auto &[ExitIRI, V] : ExitingValuesToFix) {
+  // Introduce extract for exiting values and update the VPIRInstructions modeling the corresponding LCSSA phis.
+  for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
+    VPValue *V = ExitIRI->getOperand(0);
     // Pass live-in values used by exit phis directly through to the live-out.
-    if (V->isLiveIn()) {
-      ExitIRI->addOperand(V);
+    if (V->isLiveIn())
       continue;
-    }
 
     VPValue *Ext = B.createNaryOp(
         VPInstruction::ExtractFromEnd,
         {V, Plan.getOrAddLiveIn(ConstantInt::get(
                 IntegerType::get(ExitBB->getContext(), 32), 1))});
-    ExitIRI->addOperand(Ext);
+    ExitIRI->setOperand(0, Ext);
   }
 }
 
@@ -8714,7 +8709,7 @@ static void addUsersInExitBlock(
 /// 2. Feed the penultimate value of recurrences to their LCSSA phi users in
 ///    the original exit block using a VPLiveOut.
 static void addLiveOutsForFirstOrderRecurrences(
-    VPlan &Plan, MapVector<VPIRInstruction *, VPValue *> &ExitingValuesToFix) {
+    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
 
   // Start by finding out if middle block branches to scalar preheader, which is
@@ -8731,14 +8726,14 @@ static void addLiveOutsForFirstOrderRecurrences(
     ExitBB =
         cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
     ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
-  } else if (ExitingValuesToFix.empty()) {
+  } else if (ExitUsersToFix.empty()) {
     ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
   } else {
     ExitBB = cast<VPIRBasicBlock>(MiddleVPBB->getSingleSuccessor())
                  ->getIRBasicBlock();
   }
   if (!ScalarPHVPBB) {
-    assert(ExitingValuesToFix.empty() &&
+    assert(ExitUsersToFix.empty() &&
            "missed inserting extracts for exiting values");
     return;
   }
@@ -8832,21 +8827,16 @@ static void addLiveOutsForFirstOrderRecurrences(
     auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
     Plan.addLiveOut(FORPhi, ResumePhiRecipe);
 
-    // Now create VPLiveOuts for users in the exit block.
-    // Extract the penultimate value of the recurrence and add VPLiveOut
-    // users of the recurrence splice.
-
-    // No edge from the middle block to the unique exit block has been inserted
-    // and there is nothing to fix from vector loop; phis should have incoming
-    // from scalar loop only.
-    for (const auto &[ExitIRI, V] : ExitingValuesToFix) {
-      if (V != FOR)
+    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
+    // Extract the penultimate value of the recurrence and use it as operand for the VPIRInstruction modeling the phi.
+    for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
+      if (ExitIRI->getOperand(0) != FOR)
         continue;
       VPValue *Ext = MiddleBuilder.createNaryOp(
           VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
           "vector.recur.extract.for.phi");
-      ExitIRI->addOperand(Ext);
-      ExitingValuesToFix.erase(ExitIRI);
+      ExitIRI->setOperand(0, Ext);
+      ExitUsersToFix.remove(ExitIRI);
     }
   }
 }
@@ -9008,12 +8998,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  MapVector<VPIRInstruction *, VPValue *> ExitingValuesToFix =
+  SetVector<VPIRInstruction *> ExitUsersToFix =
       collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
                               Legal->getInductionVars());
-
-  addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix);
-  addUsersInExitBlock(*Plan, ExitingValuesToFix);
+  addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+  addUsersInExitBlock(*Plan, ExitUsersToFix);
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index ee6f035f1f494d..0870671e67190b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -126,16 +126,14 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
     RecipeNumbering[&R] = Cnt++;
 
   for (const VPRecipeBase &R : *VPBB) {
-    if (auto *IRI = dyn_cast<VPIRInstruction>(&R)) {
-      if (!isa<VPIRBasicBlock>(IRI->getParent())) {
-        errs() << "VPIRInstructions ";
+    if (isa<VPIRInstruction>(&R) ^ isa<VPIRBasicBlock>(VPBB)) {
+      errs() << "VPIRInstructions ";
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-        IRI->dump();
-        errs() << " ";
+      R.dump();
+      errs() << " ";
 #endif
-        errs() << "not in a VPIRBasicBlock!\n";
-        return false;
-      }
+      errs() << "not in a VPIRBasicBlock!\n";
+      return false;
     }
     for (const VPValue *V : R.definedValues()) {
       for (const VPUser *U : V->users()) {

>From f66d27e5ae2c39d657c180555116e0427c24ef1a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 14 Sep 2024 20:45:18 +0100
Subject: [PATCH 7/7] !fixup formatting and switch case odering.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 25 ++++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  2 +-
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index feac26b6077623..e7ed6ad5d47460 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8630,7 +8630,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
 }
 
-// Collect VPIRInstructions for phis in the original exit block that are modeled in VPlan and add the exiting VPValue as operand. Some exiting values are not modeled explicitly yet and won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction increments.
+// Collect VPIRInstructions for phis in the original exit block that are modeled
+// in VPlan and add the exiting VPValue as operand. Some exiting values are not
+// modeled explicitly yet and won't be included. Those are un-truncated
+// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
+// increments.
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
@@ -8673,9 +8677,11 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
   return ExitUsersToFix;
 }
 
-// Add exit values to \p Plan. Extracts are added for each entry in \p ExitUsersToFix if needed and their operands are updated.
-static void addUsersInExitBlock(
-    VPlan &Plan, const SetVector<VPIRInstruction *> &ExitUsersToFix) {
+// Add exit values to \p Plan. Extracts are added for each entry in \p
+// ExitUsersToFix if needed and their operands are updated.
+static void
+addUsersInExitBlock(VPlan &Plan,
+                    const SetVector<VPIRInstruction *> &ExitUsersToFix) {
   if (ExitUsersToFix.empty())
     return;
 
@@ -8685,7 +8691,8 @@ static void addUsersInExitBlock(
       cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 
-  // Introduce extract for exiting values and update the VPIRInstructions modeling the corresponding LCSSA phis.
+  // Introduce extract for exiting values and update the VPIRInstructions
+  // modeling the corresponding LCSSA phis.
   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
     VPValue *V = ExitIRI->getOperand(0);
     // Pass live-in values used by exit phis directly through to the live-out.
@@ -8828,7 +8835,8 @@ static void addLiveOutsForFirstOrderRecurrences(
     Plan.addLiveOut(FORPhi, ResumePhiRecipe);
 
     // Now update VPIRInstructions modeling LCSSA phis in the exit block.
-    // Extract the penultimate value of the recurrence and use it as operand for the VPIRInstruction modeling the phi.
+    // Extract the penultimate value of the recurrence and use it as operand for
+    // the VPIRInstruction modeling the phi.
     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
       if (ExitIRI->getOperand(0) != FOR)
         continue;
@@ -8998,9 +9006,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  SetVector<VPIRInstruction *> ExitUsersToFix =
-      collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
-                              Legal->getInductionVars());
+  SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
+      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
   addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
   addUsersInExitBlock(*Plan, ExitUsersToFix);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3f51d367367889..cff3b7514857c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -936,9 +936,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPReductionPHISC:
     case VPRecipeBase::VPScalarCastSC:
       return true;
+    case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
-    case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:



More information about the llvm-commits mailing list