[llvm] [VPlan] Introduce explicit ExtractFromEnd recipes for live-outs. (PR #100658)

Thu Jul 25 14:42:51 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

Introduce explicit ExtractFromEnd recipes to extract the final values for live-outs instead of implicitly extracting in VPLiveOut::fixPhi.

This is a follow-up to the recent changes of modeling extracts for recurrences and consolidates live-out extract creation for fixed-order recurrences at a single place: addLiveOutsForFirstOrderRecurrences.

It is also in preparation of replacing VPLiveOut with VPIRInstructions wrapping the original scalar phis.

---

Patch is 26.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100658.diff


9 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+116-14) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+1-7) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (-94) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll (+6-3) 
- (modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll (+2-1) 
- (modified) llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll (+9-11) 
- (modified) llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/vplan-printing.ll (+7-4) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f52680..61b29295d07b5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8452,7 +8452,104 @@ static void addUsersInExitBlock(
            return P && Inductions.contains(P);
          })))
       continue;
-    Plan.addLiveOut(&ExitPhi, V);
+
+    auto MiddleVPBB =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+    VPBuilder B(MiddleVPBB);
+    if (auto *Terminator = MiddleVPBB->getTerminator()) {
+      auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+      assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+             "Condition expected in MiddleVPBB");
+      B.setInsertPoint(Condition ? Condition : Terminator);
+    }
+
+    VPValue *Ext;
+    if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
+            V->getDefiningRecipe())) {
+      // This is the second phase of vectorizing first-order recurrences. An
+      // overview of the transformation is described below. Suppose we have the
+      // following loop with some use after the loop of the last a[i-1],
+      //
+      //   for (int i = 0; i < n; ++i) {
+      //     t = a[i - 1];
+      //     b[i] = a[i] - t;
+      //   }
+      //   use t;
+      //
+      // There is a first-order recurrence on "a". For this loop, the shorthand
+      // scalar IR looks like:
+      //
+      //   scalar.ph:
+      //     s_init = a[-1]
+      //     br scalar.body
+      //
+      //   scalar.body:
+      //     i = phi [0, scalar.ph], [i+1, scalar.body]
+      //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+      //     s2 = a[i]
+      //     b[i] = s2 - s1
+      //     br cond, scalar.body, exit.block
+      //
+      //   exit.block:
+      //     use = lcssa.phi [s1, scalar.body]
+      //
+      // In this example, s1 is a recurrence because it's value depends on the
+      // previous iteration. In the first phase of vectorization, we created a
+      // vector phi v1 for s1. We now complete the vectorization and produce the
+      // shorthand vector IR shown below (for VF = 4, UF = 1).
+      //
+      //   vector.ph:
+      //     v_init = vector(..., ..., ..., a[-1])
+      //     br vector.body
+      //
+      //   vector.body
+      //     i = phi [0, vector.ph], [i+4, vector.body]
+      //     v1 = phi [v_init, vector.ph], [v2, vector.body]
+      //     v2 = a[i, i+1, i+2, i+3];
+      //     v3 = vector(v1(3), v2(0, 1, 2))
+      //     b[i, i+1, i+2, i+3] = v2 - v3
+      //     br cond, vector.body, middle.block
+      //
+      //   middle.block:
+      //     s_penultimate = v2(2) = v3(3)
+      //     s_resume = v2(3)
+      //     br cond, scalar.ph, exit.block
+      //
+      //   scalar.ph:
+      //     s_init' = phi [s_resume, middle.block], [s_init, otherwise]
+      //     br scalar.body
+      //
+      //   scalar.body:
+      //     i = phi [0, scalar.ph], [i+1, scalar.body]
+      //     s1 = phi [s_init', scalar.ph], [s2, scalar.body]
+      //     s2 = a[i]
+      //     b[i] = s2 - s1
+      //     br cond, scalar.body, exit.block
+      //
+      //   exit.block:
+      //     lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
+      //
+      // After execution completes the vector loop, we extract the next value of
+      // the recurrence (x) to use as the initial value in the scalar loop. This
+      // is modeled by ExtractFromEnd.
+      //
+      // Extract the penultimate value of the recurrence and update VPLiveOut
+      // users of the recurrence splice. Note that the extract of the final
+      // value used to resume in the scalar loop is created earlier during VPlan
+      // construction.
+      Ext =
+          B.createNaryOp(VPInstruction::ExtractFromEnd,
+                         {FOR->getBackedgeValue(),
+                          Plan.getOrAddLiveIn(ConstantInt::get(
+                              IntegerType::get(ExitBB->getContext(), 32), 2))},
+                         {}, "vector.recur.extract.for.phi");
+    } else {
+      Ext = B.createNaryOp(
+          VPInstruction::ExtractFromEnd,
+          {V, Plan.getOrAddLiveIn(ConstantInt::get(
+                  IntegerType::get(ExitBB->getContext(), 32), 1))});
+    }
+    Plan.addLiveOut(&ExitPhi, Ext);
   }
 }
 
@@ -8660,6 +8757,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // After here, VPBB should not be used.
   VPBB = nullptr;
 
+  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
+         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
+         "entry block must be set to a VPRegionBlock having a non-empty entry "
+         "VPBasicBlock");
+  RecipeBuilder.fixHeaderPhis();
+
+  addLiveOutsForFirstOrderRecurrences(*Plan);
+
   if (CM.requiresScalarEpilogue(Range)) {
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
@@ -8668,13 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
                         Legal->getInductionVars());
 
-  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
-         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
-         "entry block must be set to a VPRegionBlock having a non-empty entry "
-         "VPBasicBlock");
-  RecipeBuilder.fixHeaderPhis();
-
-  addLiveOutsForFirstOrderRecurrences(*Plan);
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8884,10 +8982,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     for (unsigned I = 0; I != Worklist.size(); ++I) {
       VPSingleDefRecipe *Cur = Worklist[I];
       for (VPUser *U : Cur->users()) {
-        auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
-        if (!UserRecipe) {
-          assert(isa<VPLiveOut>(U) &&
-                 "U must either be a VPSingleDef or VPLiveOut");
+        auto *UserRecipe = cast<VPSingleDefRecipe>(U);
+        if (!UserRecipe->getParent()->getParent()) {
+          assert(cast<VPInstruction>(U) &&
+                 cast<VPInstruction>(U)->getOpcode() ==
+                     VPInstruction::ExtractFromEnd &&
+                 "U must be an ExtractFromEnd VPInstruction");
           continue;
         }
         Worklist.insert(UserRecipe);
@@ -9105,8 +9205,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
     OrigExitingVPV->replaceUsesWithIf(
-        FinalReductionResult,
-        [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
+        FinalReductionResult, [](VPUser &User, unsigned) {
+          auto *R = dyn_cast<VPInstruction>(&User);
+          return R && R->getOpcode() == VPInstruction::ExtractFromEnd;
+        });
   }
 
   VPlanTransforms::clearReductionWrapFlags(*Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d..798d178e5a963 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -194,9 +194,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
 
 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
   VPValue *ExitValue = getOperand(0);
-  auto Lane = vputils::isUniformAfterVectorization(ExitValue)
-                  ? VPLane::getFirstLane()
-                  : VPLane::getLastLaneForVF(State.VF);
   VPBasicBlock *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
@@ -207,10 +204,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
                        ? MiddleVPBB
                        : ExitingVPBB;
   BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
-  // Set insertion point in PredBB in case an extract needs to be generated.
-  // TODO: Model extracts explicitly.
-  State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
-  Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+  Value *V = State.get(ExitValue, VPIteration(0, 0));
   if (Phi->getBasicBlockIndex(PredBB) != -1)
     Phi->setIncomingValueForBlock(PredBB, V);
   else
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..967939c4854c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -826,20 +826,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
       RecurrencePhis.push_back(FOR);
 
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
-  VPBuilder MiddleBuilder;
-  // Set insert point so new recipes are inserted before terminator and
-  // condition, if there is either the former or both.
-  if (auto *Term =
-          dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator())) {
-    if (auto *Cmp = dyn_cast<VPInstruction>(Term->getOperand(0)))
-      MiddleBuilder.setInsertPoint(Cmp);
-    else
-      MiddleBuilder.setInsertPoint(Term);
-  } else
-    MiddleBuilder.setInsertPoint(MiddleVPBB);
-
   for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
     SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
     VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -872,86 +858,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
     // Set the first operand of RecurSplice to FOR again, after replacing
     // all users.
     RecurSplice->setOperand(0, FOR);
-
-    // This is the second phase of vectorizing first-order recurrences. An
-    // overview of the transformation is described below. Suppose we have the
-    // following loop with some use after the loop of the last a[i-1],
-    //
-    //   for (int i = 0; i < n; ++i) {
-    //     t = a[i - 1];
-    //     b[i] = a[i] - t;
-    //   }
-    //   use t;
-    //
-    // There is a first-order recurrence on "a". For this loop, the shorthand
-    // scalar IR looks like:
-    //
-    //   scalar.ph:
-    //     s_init = a[-1]
-    //     br scalar.body
-    //
-    //   scalar.body:
-    //     i = phi [0, scalar.ph], [i+1, scalar.body]
-    //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
-    //     s2 = a[i]
-    //     b[i] = s2 - s1
-    //     br cond, scalar.body, exit.block
-    //
-    //   exit.block:
-    //     use = lcssa.phi [s1, scalar.body]
-    //
-    // In this example, s1 is a recurrence because it's value depends on the
-    // previous iteration. In the first phase of vectorization, we created a
-    // vector phi v1 for s1. We now complete the vectorization and produce the
-    // shorthand vector IR shown below (for VF = 4, UF = 1).
-    //
-    //   vector.ph:
-    //     v_init = vector(..., ..., ..., a[-1])
-    //     br vector.body
-    //
-    //   vector.body
-    //     i = phi [0, vector.ph], [i+4, vector.body]
-    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
-    //     v2 = a[i, i+1, i+2, i+3];
-    //     v3 = vector(v1(3), v2(0, 1, 2))
-    //     b[i, i+1, i+2, i+3] = v2 - v3
-    //     br cond, vector.body, middle.block
-    //
-    //   middle.block:
-    //     s_penultimate = v2(2) = v3(3)
-    //     s_resume = v2(3)
-    //     br cond, scalar.ph, exit.block
-    //
-    //   scalar.ph:
-    //     s_init' = phi [s_resume, middle.block], [s_init, otherwise]
-    //     br scalar.body
-    //
-    //   scalar.body:
-    //     i = phi [0, scalar.ph], [i+1, scalar.body]
-    //     s1 = phi [s_init', scalar.ph], [s2, scalar.body]
-    //     s2 = a[i]
-    //     b[i] = s2 - s1
-    //     br cond, scalar.body, exit.block
-    //
-    //   exit.block:
-    //     lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
-    //
-    // After execution completes the vector loop, we extract the next value of
-    // the recurrence (x) to use as the initial value in the scalar loop. This
-    // is modeled by ExtractFromEnd.
-    Type *IntTy = Plan.getCanonicalIV()->getScalarType();
-
-    // Extract the penultimate value of the recurrence and update VPLiveOut
-    // users of the recurrence splice. Note that the extract of the final value
-    // used to resume in the scalar loop is created earlier during VPlan
-    // construction.
-    auto *Penultimate = cast<VPInstruction>(MiddleBuilder.createNaryOp(
-        VPInstruction::ExtractFromEnd,
-        {FOR->getBackedgeValue(),
-         Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
-        {}, "vector.recur.extract.for.phi"));
-    RecurSplice->replaceUsesWithIf(
-        Penultimate, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
   }
   return true;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 16db6cf828af8..f14ffe854a3a6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -55,6 +55,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: middle.block:
 ; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
 ; IF-EVL-INLOOP-NEXT:   EMIT branch-on-cond ir<true>
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
@@ -64,7 +65,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
-; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; IF-EVL-INLOOP-NEXT: }
 ;
 
@@ -93,6 +94,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: middle.block:
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; NO-VP-OUTLOOP-NEXT:   EMIT branch-on-cond vp<[[BOC]]>
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -103,7 +105,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
-; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; NO-VP-OUTLOOP-NEXT: }
 ;
 
@@ -132,6 +134,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: middle.block:
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
 ; NO-VP-INLOOP-NEXT:   EMIT branch-on-cond vp<[[BOC]]>
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -142,7 +145,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: scalar.ph:
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
-; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
 ; NO-VP-INLOOP-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 06fbeafba31c0..9e49cf6b42c6b 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -220,6 +220,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:  EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
 ; CHECK-NEXT:   EMIT branch-on-cond ir<true>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
@@ -230,8 +231,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]>
 ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
+; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index 5f5cd78dc2d30..fcc6b7376d408 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -34,7 +34,7 @@ define i32 @one_direct_branch(ptr %src) {
 ; CHECK-NEXT:    [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[XOR_LCSSA]]
@@ -205,16 +205,14 @@ define i32 @optimizable_trunc_used_outside() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -226,7 +224,7 @@ define i32 @optimizable_trunc_used_outside() {
 ; CHECK-NEXT:    [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NE...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/100658