[llvm] [VPlan] Add VPIRInstruction, use for exit block live-outs. (PR #100735)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 26 06:01:59 PDT 2024
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/100735
Add a new VPIRInstruction recipe to wrap existing IR instructions not to
be modified during execution, execept for PHIs. For PHIs, a single VPValue
operand is allowed, and it is used to add a new incoming value for the
single predecessor VPBB. Expect PHIs, VPIRInstructions cannot have any
operands.
Depends on https://github.com/llvm/llvm-project/pull/100658.
>From 2f441bb3ae998745e03326cc2e59ea7b54439ec4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 23 Jul 2024 15:25:01 +0100
Subject: [PATCH 1/3] [VPlan] Introduce explicit ExtractFromEnd recipes for
live-outs.
Introduce explicit ExtractFromEnd recipes to extract the final values
for live-outs instead of implicitly extracting in VPLiveOut::fixPhi.
This is a follow-up to the recent changes of modeling extracts for
recurrences and consolidates live-out extract creation for fixed-order
recurrences at a single place: addLiveOutsForFirstOrderRecurrences.
It is also in preparation of replacing VPLiveOut with VPIRInstructions
wrapping the original scalar phis.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 130 ++++++++++++++++--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 94 -------------
.../RISCV/vplan-vp-intrinsics-reduction.ll | 9 +-
...-order-recurrence-sink-replicate-region.ll | 3 +-
.../instruction-only-used-outside-of-loop.ll | 20 ++-
.../pr55167-fold-tail-live-out.ll | 2 +-
.../LoopVectorize/select-cmp-multiuse.ll | 4 +-
.../LoopVectorize/vplan-printing.ll | 11 +-
9 files changed, 144 insertions(+), 137 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f52680..61b29295d07b5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8452,7 +8452,104 @@ static void addUsersInExitBlock(
return P && Inductions.contains(P);
})))
continue;
- Plan.addLiveOut(&ExitPhi, V);
+
+ auto MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ VPBuilder B(MiddleVPBB);
+ if (auto *Terminator = MiddleVPBB->getTerminator()) {
+ auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+ assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+ "Condition expected in MiddleVPBB");
+ B.setInsertPoint(Condition ? Condition : Terminator);
+ }
+
+ VPValue *Ext;
+ if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
+ V->getDefiningRecipe())) {
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop with some use after the loop of the last a[i-1],
+ //
+ // for (int i = 0; i < n; ++i) {
+ // t = a[i - 1];
+ // b[i] = a[i] - t;
+ // }
+ // use t;
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // use = lcssa.phi [s1, scalar.body]
+ //
+ // In this example, s1 is a recurrence because it's value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // vector phi v1 for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // s_penultimate = v2(2) = v3(3)
+ // s_resume = v2(3)
+ // br cond, scalar.ph, exit.block
+ //
+ // scalar.ph:
+ // s_init' = phi [s_resume, middle.block], [s_init, otherwise]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init', scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
+ //
+ // After execution completes the vector loop, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop. This
+ // is modeled by ExtractFromEnd.
+ //
+ // Extract the penultimate value of the recurrence and update VPLiveOut
+ // users of the recurrence splice. Note that the extract of the final
+ // value used to resume in the scalar loop is created earlier during VPlan
+ // construction.
+ Ext =
+ B.createNaryOp(VPInstruction::ExtractFromEnd,
+ {FOR->getBackedgeValue(),
+ Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(ExitBB->getContext(), 32), 2))},
+ {}, "vector.recur.extract.for.phi");
+ } else {
+ Ext = B.createNaryOp(
+ VPInstruction::ExtractFromEnd,
+ {V, Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(ExitBB->getContext(), 32), 1))});
+ }
+ Plan.addLiveOut(&ExitPhi, Ext);
}
}
@@ -8660,6 +8757,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// After here, VPBB should not be used.
VPBB = nullptr;
+ assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
+ !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
+ "entry block must be set to a VPRegionBlock having a non-empty entry "
+ "VPBasicBlock");
+ RecipeBuilder.fixHeaderPhis();
+
+ addLiveOutsForFirstOrderRecurrences(*Plan);
+
if (CM.requiresScalarEpilogue(Range)) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
@@ -8668,13 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
Legal->getInductionVars());
- assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
- !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
- "entry block must be set to a VPRegionBlock having a non-empty entry "
- "VPBasicBlock");
- RecipeBuilder.fixHeaderPhis();
-
- addLiveOutsForFirstOrderRecurrences(*Plan);
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8884,10 +8982,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (unsigned I = 0; I != Worklist.size(); ++I) {
VPSingleDefRecipe *Cur = Worklist[I];
for (VPUser *U : Cur->users()) {
- auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
- if (!UserRecipe) {
- assert(isa<VPLiveOut>(U) &&
- "U must either be a VPSingleDef or VPLiveOut");
+ auto *UserRecipe = cast<VPSingleDefRecipe>(U);
+ if (!UserRecipe->getParent()->getParent()) {
+ assert(cast<VPInstruction>(U) &&
+ cast<VPInstruction>(U)->getOpcode() ==
+ VPInstruction::ExtractFromEnd &&
+ "U must be an ExtractFromEnd VPInstruction");
continue;
}
Worklist.insert(UserRecipe);
@@ -9105,8 +9205,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
OrigExitingVPV->replaceUsesWithIf(
- FinalReductionResult,
- [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
+ FinalReductionResult, [](VPUser &User, unsigned) {
+ auto *R = dyn_cast<VPInstruction>(&User);
+ return R && R->getOpcode() == VPInstruction::ExtractFromEnd;
+ });
}
VPlanTransforms::clearReductionWrapFlags(*Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d..798d178e5a963 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -194,9 +194,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
VPValue *ExitValue = getOperand(0);
- auto Lane = vputils::isUniformAfterVectorization(ExitValue)
- ? VPLane::getFirstLane()
- : VPLane::getLastLaneForVF(State.VF);
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
@@ -207,10 +204,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
? MiddleVPBB
: ExitingVPBB;
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
- // Set insertion point in PredBB in case an extract needs to be generated.
- // TODO: Model extracts explicitly.
- State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
- Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+ Value *V = State.get(ExitValue, VPIteration(0, 0));
if (Phi->getBasicBlockIndex(PredBB) != -1)
Phi->setIncomingValueForBlock(PredBB, V);
else
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..967939c4854c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -826,20 +826,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
RecurrencePhis.push_back(FOR);
- VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- VPBuilder MiddleBuilder;
- // Set insert point so new recipes are inserted before terminator and
- // condition, if there is either the former or both.
- if (auto *Term =
- dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator())) {
- if (auto *Cmp = dyn_cast<VPInstruction>(Term->getOperand(0)))
- MiddleBuilder.setInsertPoint(Cmp);
- else
- MiddleBuilder.setInsertPoint(Term);
- } else
- MiddleBuilder.setInsertPoint(MiddleVPBB);
-
for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -872,86 +858,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
// Set the first operand of RecurSplice to FOR again, after replacing
// all users.
RecurSplice->setOperand(0, FOR);
-
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop with some use after the loop of the last a[i-1],
- //
- // for (int i = 0; i < n; ++i) {
- // t = a[i - 1];
- // b[i] = a[i] - t;
- // }
- // use t;
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // use = lcssa.phi [s1, scalar.body]
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // vector phi v1 for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // s_penultimate = v2(2) = v3(3)
- // s_resume = v2(3)
- // br cond, scalar.ph, exit.block
- //
- // scalar.ph:
- // s_init' = phi [s_resume, middle.block], [s_init, otherwise]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init', scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop. This
- // is modeled by ExtractFromEnd.
- Type *IntTy = Plan.getCanonicalIV()->getScalarType();
-
- // Extract the penultimate value of the recurrence and update VPLiveOut
- // users of the recurrence splice. Note that the extract of the final value
- // used to resume in the scalar loop is created earlier during VPlan
- // construction.
- auto *Penultimate = cast<VPInstruction>(MiddleBuilder.createNaryOp(
- VPInstruction::ExtractFromEnd,
- {FOR->getBackedgeValue(),
- Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
- {}, "vector.recur.extract.for.phi"));
- RecurSplice->replaceUsesWithIf(
- Penultimate, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
}
return true;
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 16db6cf828af8..f14ffe854a3a6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -55,6 +55,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: middle.block:
; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir<true>
; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; IF-EVL-INLOOP-EMPTY:
@@ -64,7 +65,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: scalar.ph:
; IF-EVL-INLOOP-NEXT: No successors
; IF-EVL-INLOOP-EMPTY:
-; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; IF-EVL-INLOOP-NEXT: }
;
@@ -93,6 +94,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: middle.block:
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -103,7 +105,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-VP-OUTLOOP-NEXT: No successors
; NO-VP-OUTLOOP-EMPTY:
-; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; NO-VP-OUTLOOP-NEXT: }
;
@@ -132,6 +134,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: middle.block:
; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1>
; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -142,7 +145,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: scalar.ph:
; NO-VP-INLOOP-NEXT: No successors
; NO-VP-INLOOP-EMPTY:
-; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]>
+; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; NO-VP-INLOOP-NEXT: }
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 06fbeafba31c0..9e49cf6b42c6b 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -220,6 +220,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]>
; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1>
+; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
; CHECK-NEXT: EMIT branch-on-cond ir<true>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
@@ -230,8 +231,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]>
; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
+; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]>
; CHECK-NEXT: }
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index 5f5cd78dc2d30..fcc6b7376d408 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -34,7 +34,7 @@ define i32 @one_direct_branch(ptr %src) {
; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ]
; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
@@ -205,16 +205,14 @@ define i32 @optimizable_trunc_used_outside() {
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -226,7 +224,7 @@ define i32 @optimizable_trunc_used_outside() {
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[IV_TRUNC_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 72f8cf22cafa7..b79525bc3e440 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -34,8 +34,8 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 182, [[MIDDLE_BLOCK]] ], [ 6, [[BB:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
index 9eb90099214e1..b88e597e6bc8e 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
@@ -916,13 +916,13 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
; CHECK-VF4-IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4-IC1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-VF4-IC1: middle.block:
-; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = freeze i1 [[TMP9]]
; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i1 false, i1 true
; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = freeze i1 [[TMP11]]
; CHECK-VF4-IC1-NEXT: [[RDX_SELECT2:%.*]] = select i1 [[TMP12]], i1 true, i1 false
+; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-VF4-IC1: scalar.ph:
@@ -986,7 +986,6 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
; CHECK-VF4-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-VF4-IC2: middle.block:
-; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]]
; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]])
; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]]
@@ -995,6 +994,7 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]])
; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]]
; CHECK-VF4-IC2-NEXT: [[RDX_SELECT6:%.*]] = select i1 [[TMP19]], i1 true, i1 false
+; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-VF4-IC2: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 3a664de748d2d..f18ed825a6b88 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -154,6 +154,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
+; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -164,7 +165,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: No successors
; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]>
+; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]>
; CHECK-NEXT: }
;
entry:
@@ -435,6 +436,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]>
+; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -445,7 +447,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: No successors
; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]>
+; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]>
; CHECK-NEXT:}
entry:
@@ -654,6 +656,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -664,7 +667,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: No successors
; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %lcssa = ir<%add>
+; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]>
; CHECK-NEXT: }
;
entry:
@@ -1036,8 +1039,8 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
; CHECK-NEXT: EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]>
+; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
; CHECK-NEXT: }
;
entry:
>From 540b9c5de195002470389e47dcfbe01c5e22a540 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 26 Jul 2024 12:25:29 +0100
Subject: [PATCH 2/3] !fixup fix formatting
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f3cc705962357..89ea7bc563673 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8773,7 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
Legal->getInductionVars());
-
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
>From 0284675e468ef130fcf8d45b6715d2c6d3829706 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 22 Jul 2024 10:47:54 +0100
Subject: [PATCH 3/3] [VPlan] Add VPIRInstruction, use for exit block
live-outs.
Add a new VPIRInstruction recipe to wrap existing IR instructions not to
be modified during execution, execept for PHIs. For PHIs, a single VPValue
operand is allowed, and it is used to add a new incoming value for the
single predecessor VPBB. Expect PHIs, VPIRInstructions cannot have any
operands.
Depends on https://github.com/llvm/llvm-project/pull/100658.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 239 +++++++++---------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 14 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 40 +++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 29 +++
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../RISCV/riscv-vector-reverse.ll | 4 +
.../RISCV/vplan-vp-intrinsics-reduction.ll | 9 +-
...-order-recurrence-sink-replicate-region.ll | 2 +-
llvm/test/Transforms/LoopVectorize/pr36983.ll | 2 +-
llvm/test/Transforms/LoopVectorize/pr45259.ll | 5 +-
.../vplan-printing-before-execute.ll | 2 +
.../LoopVectorize/vplan-printing.ll | 13 +-
12 files changed, 227 insertions(+), 133 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 89ea7bc563673..fabddbbce139a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8434,122 +8434,133 @@ static void addUsersInExitBlock(
if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
return;
- // Introduce VPUsers modeling the exit values.
- for (PHINode &ExitPhi : ExitBB->phis()) {
- Value *IncomingValue =
- ExitPhi.getIncomingValueForBlock(ExitingBB);
- VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
- // Exit values for inductions are computed and updated outside of VPlan and
- // independent of induction recipes.
- // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
- // live-outs.
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
- !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(V) ||
- (isa<Instruction>(IncomingValue) &&
- any_of(IncomingValue->users(), [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- })))
+ auto *MiddleBlock = Plan.getVectorLoopRegion()->getSingleSuccessor();
+ for (VPIRBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPIRBasicBlock>(
+ vp_depth_first_shallow(MiddleBlock))) {
+ if (VPBB->getIRBasicBlock() != ExitBB)
continue;
- auto MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- VPBuilder B(MiddleVPBB);
- if (auto *Terminator = MiddleVPBB->getTerminator()) {
- auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
- assert((!Condition || Condition->getParent() == MiddleVPBB) &&
- "Condition expected in MiddleVPBB");
- B.setInsertPoint(Condition ? Condition : Terminator);
- }
+ for (auto &R : *VPBB) {
+ auto *IR = dyn_cast<VPIRInstruction>(&R);
+ if (!IR)
+ continue;
+ auto *ExitPhi = dyn_cast<PHINode>(&IR->getInstruction());
+ if (!ExitPhi)
+ break;
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+ VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
+ // Exit values for inductions are computed and updated outside of VPlan
+ // and independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
+ // live-outs.
+ if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(V) ||
+ (isa<Instruction>(IncomingValue) &&
+ any_of(IncomingValue->users(), [&Inductions](User *U) {
+ auto *P = dyn_cast<PHINode>(U);
+ return P && Inductions.contains(P);
+ })))
+ continue;
- VPValue *Ext;
- if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
- V->getDefiningRecipe())) {
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop with some use after the loop of the last a[i-1],
- //
- // for (int i = 0; i < n; ++i) {
- // t = a[i - 1];
- // b[i] = a[i] - t;
- // }
- // use t;
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // use = lcssa.phi [s1, scalar.body]
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // vector phi v1 for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // s_penultimate = v2(2) = v3(3)
- // s_resume = v2(3)
- // br cond, scalar.ph, exit.block
- //
- // scalar.ph:
- // s_init' = phi [s_resume, middle.block], [s_init, otherwise]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init', scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, exit.block
- //
- // exit.block:
- // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop. This
- // is modeled by ExtractFromEnd.
- //
- // Extract the penultimate value of the recurrence and update VPLiveOut
- // users of the recurrence splice. Note that the extract of the final
- // value used to resume in the scalar loop is created earlier during VPlan
- // construction.
- Ext =
- B.createNaryOp(VPInstruction::ExtractFromEnd,
- {FOR->getBackedgeValue(),
- Plan.getOrAddLiveIn(ConstantInt::get(
- IntegerType::get(ExitBB->getContext(), 32), 2))},
- {}, "vector.recur.extract.for.phi");
- } else {
- Ext = B.createNaryOp(
- VPInstruction::ExtractFromEnd,
- {V, Plan.getOrAddLiveIn(ConstantInt::get(
- IntegerType::get(ExitBB->getContext(), 32), 1))});
+ auto MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ VPBuilder B(MiddleVPBB);
+ if (auto *Terminator = MiddleVPBB->getTerminator()) {
+ auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
+ assert((!Condition || Condition->getParent() == MiddleVPBB) &&
+ "Condition expected in MiddleVPBB");
+ B.setInsertPoint(Condition ? Condition : Terminator);
+ }
+
+ VPValue *Ext;
+ if (auto *FOR = dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(
+ V->getDefiningRecipe())) {
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have
+ // the following loop with some use after the loop of the last a[i-1],
+ //
+ // for (int i = 0; i < n; ++i) {
+ // t = a[i - 1];
+ // b[i] = a[i] - t;
+ // }
+ // use t;
+ //
+ // There is a first-order recurrence on "a". For this loop, the
+ // shorthand scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // use = lcssa.phi [s1, scalar.body]
+ //
+ // In this example, s1 is a recurrence because it's value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // vector phi v1 for s1. We now complete the vectorization and produce
+ // the shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // s_penultimate = v2(2) = v3(3)
+ // s_resume = v2(3)
+ // br cond, scalar.ph, exit.block
+ //
+ // scalar.ph:
+ // s_init' = phi [s_resume, middle.block], [s_init, otherwise]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init', scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, exit.block
+ //
+ // exit.block:
+ // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block]
+ //
+ // After execution completes the vector loop, we extract the next value
+ // of the recurrence (x) to use as the initial value in the scalar loop.
+ // This is modeled by ExtractFromEnd.
+ //
+ // Extract the penultimate value of the recurrence and update VPLiveOut
+ // users of the recurrence splice. Note that the extract of the final
+ // value used to resume in the scalar loop is created earlier during
+ // VPlan construction.
+ Ext = B.createNaryOp(
+ VPInstruction::ExtractFromEnd,
+ {FOR->getBackedgeValue(),
+ Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(ExitBB->getContext(), 32), 2))},
+ {}, "vector.recur.extract.for.phi");
+ } else {
+ Ext = B.createNaryOp(
+ VPInstruction::ExtractFromEnd,
+ {V, Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(ExitBB->getContext(), 32), 1))});
+ }
+ IR->addOperand(Ext);
}
- Plan.addLiveOut(&ExitPhi, Ext);
}
}
@@ -10156,7 +10167,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// directly in VPlan.
EpilogILV.setTripCount(MainILV.getTripCount());
for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
- auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
+ auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
+ if (!ExpandR)
+ continue;
auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
ExpandedSCEVs.find(ExpandR->getSCEV())->second);
ExpandR->replaceAllUsesWith(ExpandedVal);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 58de6256900f0..b16527386521b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -855,10 +855,18 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}
+static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) {
+ auto *VPIRBB = new VPIRBasicBlock(BB);
+ for (Instruction &I :
+ make_range(BB->begin(), BB->getTerminator()->getIterator()))
+ VPIRBB->appendRecipe(new VPIRInstruction(I));
+ return VPIRBB;
+}
+
VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
bool RequiresScalarEpilogueCheck,
bool TailFolded, Loop *TheLoop) {
- VPIRBasicBlock *Entry = new VPIRBasicBlock(TheLoop->getLoopPreheader());
+ VPIRBasicBlock *Entry = createVPIRBasicBlockFor(TheLoop->getLoopPreheader());
VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader);
Plan->TripCount =
@@ -890,7 +898,7 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- auto *VPExitBlock = new VPIRBasicBlock(IRExitBlock);
+ auto *VPExitBlock = createVPIRBasicBlockFor(IRExitBlock);
// The connection order corresponds to the operands of the conditional branch.
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -957,7 +965,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
/// predecessor, which is rewired to the new VPIRBasicBlock. All successors of
/// VPBB, if any, are rewired to the new VPIRBasicBlock.
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
- VPIRBasicBlock *IRMiddleVPBB = new VPIRBasicBlock(IRBB);
+ VPIRBasicBlock *IRMiddleVPBB = createVPIRBasicBlockFor(IRBB);
for (auto &R : make_early_inc_range(*VPBB))
R.moveBefore(*IRMiddleVPBB, IRMiddleVPBB->end());
VPBlockBase *PredVPBB = VPBB->getSinglePredecessor();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c9da5e5d38a6b..2100734eead1b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -932,6 +932,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPScalarCastSC:
return true;
case VPRecipeBase::VPInterleaveSC:
+ case VPRecipeBase::VPIRInstructionSC:
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
@@ -1399,6 +1400,45 @@ class VPInstruction : public VPRecipeWithIRFlags {
bool isSingleScalar() const;
};
+/// A recipe to wrap on original IR instruction not to be modified during
+/// execution, execept for PHIs. For PHIs, a single VPValue operand is allowed,
+/// and it is used to add a new incoming value for the single predecessor VPBB.
+/// Expect PHIs, VPIRInstructions cannot have any operands.
+class VPIRInstruction : public VPRecipeBase {
+ Instruction &I;
+
+public:
+ VPIRInstruction(Instruction &I)
+ : VPRecipeBase(VPDef::VPIRInstructionSC, ArrayRef<VPValue *>()), I(I) {}
+
+ ~VPIRInstruction() override = default;
+
+ VP_CLASSOF_IMPL(VPDef::VPIRInstructionSC)
+
+ VPIRInstruction *clone() override {
+ auto *R = new VPIRInstruction(I);
+ for (auto *Op : operands())
+ R->addOperand(Op);
+ return R;
+ }
+
+ void execute(VPTransformState &State) override;
+
+ Instruction &getInstruction() { return I; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ bool usesScalars(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+};
+
/// VPWidenRecipe is a recipe for producing a widened instruction using the
/// opcode and operands of the recipe. This recipe covers most of the
/// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 798d178e5a963..780eeff5a2b7b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -856,6 +856,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPIRInstruction::execute(VPTransformState &State) {
+ assert(isa<VPIRBasicBlock>(getParent()) &&
+ "VPIRInstructions can only be placed in VPIRBasicBlocks");
+
+ if (getNumOperands() == 1 && isa<PHINode>(&I)) {
+ VPValue *ExitValue = getOperand(0);
+ auto Lane = vputils::isUniformAfterVectorization(ExitValue)
+ ? VPLane::getFirstLane()
+ : VPLane::getLastLaneForVF(State.VF);
+ auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+ BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
+ // Set insertion point in PredBB in case an extract needs to be generated.
+ // TODO: Model extracts explicitly.
+ State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
+ Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+ auto *Phi = cast<PHINode>(&I);
+ Phi->addIncoming(V, PredBB);
+ }
+
+ State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "IR " << I;
+}
+#endif
+
void VPWidenCallRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
Function *CalledScalarFn = getCalledScalarFunction();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 452c977106a77..33c221aa946f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -339,6 +339,7 @@ class VPDef {
VPBranchOnMaskSC,
VPDerivedIVSC,
VPExpandSCEVSC,
+ VPIRInstructionSC,
VPInstructionSC,
VPInterleaveSC,
VPReductionEVLSC,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 3a14842580425..97809e50d784a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -60,6 +60,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vp<%2> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT: IR %0 = zext i32 %n to i64
; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
@@ -144,6 +145,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vp<%2> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT: IR %0 = zext i32 %n to i64
; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
@@ -265,6 +267,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vp<%2> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT: IR %0 = zext i32 %n to i64
; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
@@ -349,6 +352,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: vp<%2> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
+; CHECK-NEXT: IR %0 = zext i32 %n to i64
; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index f14ffe854a3a6..803a5e6ecb733 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -60,12 +60,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
+; IF-EVL-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ]
; IF-EVL-INLOOP-NEXT: No successors
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: scalar.ph:
; IF-EVL-INLOOP-NEXT: No successors
-; IF-EVL-INLOOP-EMPTY:
-; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; IF-EVL-INLOOP-NEXT: }
;
@@ -100,12 +99,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: ir-bb<for.end>:
+; NO-VP-OUTLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ]
; NO-VP-OUTLOOP-NEXT: No successors
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-VP-OUTLOOP-NEXT: No successors
-; NO-VP-OUTLOOP-EMPTY:
-; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; NO-VP-OUTLOOP-NEXT: }
;
@@ -140,12 +138,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: ir-bb<for.end>:
+; NO-VP-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ]
; NO-VP-INLOOP-NEXT: No successors
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: scalar.ph:
; NO-VP-INLOOP-NEXT: No successors
-; NO-VP-INLOOP-EMPTY:
-; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]>
; NO-VP-INLOOP-NEXT: }
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 9e49cf6b42c6b..993b4b2a948e0 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -225,6 +225,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: IR %res = phi i32 [ %and.red.next, %loop ]
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
@@ -232,7 +233,6 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
-; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]>
; CHECK-NEXT: }
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/pr36983.ll b/llvm/test/Transforms/LoopVectorize/pr36983.ll
index f4da4f355eb0d..7e38d60b6f581 100644
--- a/llvm/test/Transforms/LoopVectorize/pr36983.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr36983.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: bb1.bb3_crit_edge:
; CHECK: %_tmp133.lcssa1 = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ]
-; CHECK: %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi1, %middle.block ]
define void @f1() {
bb2.lr.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index dcc8f3f2f9d8f..008971697775e 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -14,11 +14,12 @@ define i8 @widget(ptr %arr, i8 %t9) {
; CHECK-NEXT: br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]]
; CHECK: for.preheader:
; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ]
-; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA2]] to i32
+; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index e4984f52ee6ff..431d14be45857 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -14,6 +14,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %and = and i64 %N, 15
; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
@@ -55,6 +56,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %and = and i64 %N, 15
; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
; CHECK-NEXT: No successors
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index f18ed825a6b88..9b7051630347e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -160,12 +160,11 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.end>
+; CHECK-NEXT: IR %red.next.lcssa = phi float [ %red.next, %for.body ]
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]>
; CHECK-NEXT: }
;
entry:
@@ -442,12 +441,11 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.end>
+; CHECK-NEXT: IR %muladd.lcssa = phi float [ %muladd, %for.body ]
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]>
; CHECK-NEXT:}
entry:
@@ -574,6 +572,8 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060
+; CHECK-NEXT: IR %inc = add i64 %div, 1
; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060))<nuw><nsw> /u (1 + (%y /u 492802768830814060))<nuw><nsw>))<nuw><nsw>
; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060))<nuw><nsw>
; CHECK-NEXT: No successors
@@ -662,12 +662,11 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: IR %lcssa = phi i32 [ %add, %loop ]
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]>
; CHECK-NEXT: }
;
entry:
@@ -1033,6 +1032,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: IR %for.1.lcssa = phi i16 [ %for.1, %loop ]
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
@@ -1040,7 +1040,6 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]>
-; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]>
; CHECK-NEXT: }
;
entry:
More information about the llvm-commits
mailing list