[llvm] [VPlan] Update final exit value via VPlan. (PR #112147)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 5 03:51:19 PST 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/112147
From 0f66ee14c952c25dac8a5542994f62f2bb905ef5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 31 Dec 2024 12:08:43 +0000
Subject: [PATCH 1/4] [LV] Add test with FP induction and increment operands
swapped.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 14 ++
.../LoopVectorize/X86/induction-step.ll | 6 +-
.../LoopVectorize/iv_outside_user.ll | 120 ++++++++++++++++++
3 files changed, 136 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7ef5295bb12763..5b75f6b26b6c5f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9311,6 +9311,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
+ // Update wide induction increments to use the same step as the corresponding
+ // wide induction. This enables detecting induction increments directly in
+ // VPlan and removes redundant splats.
+ for (const auto &[Phi, ID] : Legal->getInductionVars()) {
+ auto *IVInc = cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+ if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
+ continue;
+ VPWidenInductionRecipe *WideIV =
+ cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
+ VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
+ R->setOperand(1, WideIV->getStepValue());
+ }
+
if (auto *UncountableExitingBlock =
Legal->getUncountableEarlyExitingBlock()) {
VPlanTransforms::handleUncountableEarlyExit(
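A note on the canonicalization in the hunk above: it only fires for increments of the form add(phi, step), i.e. an add whose first operand is the induction phi. A minimal IR sketch of the shape that gets rewritten versus the shape that is skipped (hypothetical example, not taken from the patch):

  define void @inc_shapes(i16 %off, i64 %n) {
  entry:
    br label %loop
  loop:
    %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
    %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
    ; Rewritten: opcode is add and operand 0 is the phi, so the recipe's
    ; step operand is replaced with the wide IV's step.
    %iv.next = add i16 %iv, %off
    ; A swapped form like "add i16 %off, %iv" (or a sub) fails the guard
    ; IVInc->getOperand(0) != Phi || getOpcode() != Add and is left alone.
    %i.next = add i64 %i, 1
    %ec = icmp eq i64 %i.next, %n
    br i1 %ec, label %exit, label %loop
  exit:
    ret void
  }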
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
index 6aac11a579719b..f6a9767c7f87d5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
@@ -21,16 +21,14 @@ define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]]
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index e9f67036faf2b1..66fdbc6a98bd0e 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -860,6 +860,126 @@ exit:
ret float %add
}
+define float @fp_postinc_use_fadd_ops_swapped(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fadd_ops_swapped(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT: [[ENTRY:.*]]:
+; VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC: [[VECTOR_PH]]:
+; VEC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT: [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; VEC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[DOTSPLAT2]]
+; VEC-NEXT: [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT: br label %[[VECTOR_BODY:.*]]
+; VEC: [[VECTOR_BODY]]:
+; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC: [[MIDDLE_BLOCK]]:
+; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC: [[SCALAR_PH]]:
+; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT: br label %[[LOOP:.*]]
+; VEC: [[LOOP]]:
+; VEC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT: [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT: [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT: store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT: [[ADD]] = fadd fast float [[FPINC]], [[FP_IV]]
+; VEC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC: [[EXIT]]:
+; VEC-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT: ret float [[ADD_LCSSA]]
+;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fadd_ops_swapped(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT: [[ENTRY:.*]]:
+; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE: [[VECTOR_PH]]:
+; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT: [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE: [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT: [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT: store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT: store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE: [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE: [[SCALAR_PH]]:
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
+; INTERLEAVE: [[LOOP]]:
+; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT: [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT: store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT: [[ADD]] = fadd fast float [[FPINC]], [[FP_IV]]
+; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE: [[EXIT]]:
+; INTERLEAVE-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT: ret float [[ADD_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+ %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+ store float %fp.iv, ptr %gep.A, align 4
+ %add = fadd fast float %fpinc, %fp.iv
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret float %add
+}
+
define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
; VEC-LABEL: define float @fp_postinc_use_fsub(
; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
From 2764023e9678d7eda6cbe049a5434bf4be359de7 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 31 Dec 2024 14:01:48 +0000
Subject: [PATCH 2/4] Step
---
.../Transforms/Vectorize/LoopVectorize.cpp | 71 +++++++++++++++----
llvm/lib/Transforms/Vectorize/VPlan.h | 9 +++
.../Transforms/Vectorize/VPlanPatternMatch.h | 21 +++++-
.../LoopVectorize/X86/induction-step.ll | 3 +-
4 files changed, 87 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5b75f6b26b6c5f..b55e82f217818c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8953,14 +8953,65 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
}
}
+static bool isIVUse(VPValue *Incoming) {
+ VPRecipeBase *IncomingDef = Incoming->getDefiningRecipe();
+ if (!IncomingDef)
+ return false;
+ auto *WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef);
+ if (WideIV) {
+ return isa<VPWidenPointerInductionRecipe>(WideIV) || !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
+ }
+
+ if (IncomingDef->getNumOperands() != 2)
+ return false;
+ WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef->getOperand(0));
+ if (!WideIV)
+ WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef->getOperand(1));
+ if (!WideIV)
+ return false;
+
+ using namespace VPlanPatternMatch;
+ auto &ID = WideIV->getInductionDescriptor();
+ switch (ID.getInductionOpcode()) {
+ case Instruction::Add:
+ return match(Incoming,
+ m_c_Binary<Instruction::Add>(
+ m_VPValue(), m_Specific(WideIV->getStepValue())));
+ case Instruction::FAdd:
+ return match(Incoming,
+ m_c_Binary<Instruction::FAdd>(
+ m_VPValue(), m_Specific(WideIV->getStepValue())));
+ case Instruction::FSub:
+ return match(Incoming,
+ m_Binary<Instruction::FSub>(
+ m_VPValue(), m_Specific(WideIV->getStepValue())));
+ case Instruction::Sub: {
+ VPValue *Step;
+ return match(Incoming,
+ m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) &&
+ Step->isLiveIn() && WideIV->getStepValue()->isLiveIn() &&
+ (cast<ConstantInt>(Step->getLiveInIRValue())->getValue() +
+ cast<ConstantInt>(WideIV->getStepValue()->getLiveInIRValue())
+ ->getValue())
+ .isZero();
+ }
+ default:
+ return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+ match(
+ Incoming,
+ m_GetElementPtr(m_VPValue(), m_Specific(WideIV->getStepValue())));
+ }
+ llvm_unreachable("should have been covered by switch above");
+}
+
// Collect VPIRInstructions for phis in the exit blocks that are modeled
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
// modeled explicitly yet and won't be included. Those are un-truncated
// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
// increments.
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
- Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+ Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan
+ ) {
auto *MiddleVPBB = Plan.getMiddleBlock();
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
@@ -8985,18 +9036,8 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
// Exit values for inductions are computed and updated outside of VPlan
// and independent of induction recipes.
// TODO: Compute induction exit values in VPlan.
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
- !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(V) ||
- (isa<Instruction>(IncomingValue) &&
- OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
- any_of(IncomingValue->users(), [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- }))) {
- if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
- continue;
- }
+ if (isIVUse(V) && ExitVPBB->getSinglePredecessor() == MiddleVPBB)
+ continue;
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
}
@@ -9332,7 +9373,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
}
addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+ OrigLoop, RecipeBuilder, *Plan);
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
reportVectorizationFailure(
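The Instruction::Sub case of isIVUse above is worth a worked example: a decrement is only recognized when the literal subtrahend and the induction's recorded step cancel exactly. Hypothetical IR sketch:

  define i64 @countdown(i64 %n) {
  entry:
    br label %loop
  loop:
    %iv = phi i64 [ %n, %entry ], [ %iv.next, %loop ]
    ; The induction descriptor records step -1 for %iv; the Step matched
    ; by m_Binary<Instruction::Sub> here is 1, and 1 + (-1) == 0, so
    ; %iv.next is accepted as the increment of %iv.
    %iv.next = sub i64 %iv, 1
    %ec = icmp eq i64 %iv.next, 0
    br i1 %ec, label %exit, label %loop
  exit:
    %res = phi i64 [ %iv.next, %loop ] ; exit user of the IV increment
    ret i64 %res
  }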
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 88f3f672d3aa38..1be57d23f19cf7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2095,6 +2095,15 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
R->getVPDefID() == VPDef::VPWidenPointerInductionSC;
}
+ static inline bool classof(const VPValue *V) {
+ auto *R = V->getDefiningRecipe();
+ return R && classof(R);
+ }
+
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return classof(static_cast<const VPRecipeBase *>(R));
+ }
+
virtual void execute(VPTransformState &State) override = 0;
/// Returns the step value of the induction.
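The new classof overloads make isa/dyn_cast work directly on a VPValue by forwarding to its defining recipe, which is what isIVUse in this patch relies on when inspecting operands. A short sketch (assumes the VPlan internal headers, not self-contained):

  // Operands are VPValue *; with the VPValue classof added above,
  // dyn_cast dispatches through VPValue::getDefiningRecipe() itself,
  // so no manual unwrapping of the defining recipe is needed.
  if (auto *WideIV =
          dyn_cast<VPWidenInductionRecipe>(IncomingDef->getOperand(0)))
    /* use WideIV->getStepValue() ... */;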
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ec3c203a61b383..4866426ad88486 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -139,7 +139,8 @@ struct MatchRecipeAndOpcode<Opcode, RecipeTy> {
if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
std::is_same<RecipeTy, VPWidenSelectRecipe>::value ||
- std::is_same<RecipeTy, VPDerivedIVRecipe>::value)
+ std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
+ std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
return DefR;
else
return DefR && DefR->getOpcode() == Opcode;
@@ -309,6 +310,12 @@ m_Binary(const Op0_t &Op0, const Op1_t &Op1) {
return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1);
}
+template <unsigned Opcode, typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>
+m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) {
+ return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1);
+}
+
template <typename Op0_t, typename Op1_t>
inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul>
m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
@@ -339,6 +346,18 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t>
+using GEPLikeRecipe_match =
+ BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false,
+ VPWidenRecipe, VPReplicateRecipe, VPWidenGEPRecipe,
+ VPInstruction>;
+
+template <typename Op0_t, typename Op1_t>
+inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0,
+ const Op1_t &Op1) {
+ return GEPLikeRecipe_match<Op0_t, Op1_t>(Op0, Op1);
+}
+
template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
using AllTernaryRecipe_match =
Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, Opcode, false,
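The two new matchers mirror LLVM's IR-level PatternMatch helpers: m_c_Binary sets the Commutative flag so the operand patterns may match in either order, and m_GetElementPtr accepts any of the GEP-producing recipe kinds listed in GEPLikeRecipe_match. Usage sketch, mirroring isIVUse earlier in this patch:

  using namespace VPlanPatternMatch;
  // Matches both add(%x, %step) and add(%step, %x).
  bool IsAddInc = match(
      Incoming, m_c_Binary<Instruction::Add>(
                    m_VPValue(), m_Specific(WideIV->getStepValue())));
  // Matches a widened, replicated or VPInstruction GEP whose index
  // operand is the induction step.
  bool IsPtrInc = match(
      Incoming,
      m_GetElementPtr(m_VPValue(), m_Specific(WideIV->getStepValue())));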
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
index f6a9767c7f87d5..1dd2692ba6822d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
@@ -115,6 +115,7 @@ define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -131,7 +132,7 @@ define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i16 [[SUB_LCSSA]]
;
entry:
From 844aa2a7995766d7e685bd4b8d03c58575b898ec Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 11 Oct 2024 14:11:42 +0100
Subject: [PATCH 3/4] [VPlan] Update final exit value via VPlan.
Model updating IV users directly in VPlan, replacing fixupIVUsers.
Depends on https://github.com/llvm/llvm-project/pull/110004,
https://github.com/llvm/llvm-project/pull/109975 and
https://github.com/llvm/llvm-project/pull/112145.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 240 +++++++-----------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 24 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 5 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
.../RISCV/riscv-vector-reverse.ll | 2 -
5 files changed, 118 insertions(+), 155 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b55e82f217818c..6c0b47a427f688 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -543,11 +543,6 @@ class InnerLoopVectorizer {
protected:
friend class LoopVectorizationPlanner;
- /// Set up the values of the IVs correctly when exiting the vector loop.
- virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
- Value *VectorTripCount, BasicBlock *MiddleBlock,
- VPTransformState &State);
-
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -785,10 +780,6 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;
-
- void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
- Value *VectorTripCount, BasicBlock *MiddleBlock,
- VPTransformState &State) override {};
};
// A specialized derived class of inner loop vectorizer that performs
@@ -2775,97 +2766,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
return LoopVectorPreHeader;
}
-// Fix up external users of the induction variable. At this point, we are
-// in LCSSA form, with all external PHIs that use the IV having one input value,
-// coming from the remainder loop. We need those PHIs to also have a correct
-// value for the IV when arriving directly from the middle block.
-void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
- const InductionDescriptor &II,
- Value *VectorTripCount,
- BasicBlock *MiddleBlock,
- VPTransformState &State) {
- // There are two kinds of external IV usages - those that use the value
- // computed in the last iteration (the PHI) and those that use the penultimate
- // value (the value that feeds into the phi from the loop latch).
- // We allow both, but they, obviously, have different values.
-
- DenseMap<Value *, Value *> MissingVals;
-
- Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
- OrigLoop->getLoopPreheader()))
- ->getIncomingValueForBlock(MiddleBlock);
-
- // An external user of the last iteration's value should see the value that
- // the remainder loop uses to initialize its own IV.
- Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
- for (User *U : PostInc->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (!OrigLoop->contains(UI)) {
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
- MissingVals[UI] = EndValue;
- }
- }
-
- // An external user of the penultimate value need to see EndValue - Step.
- // The simplest way to get this is to recompute it from the constituent SCEVs,
- // that is Start + (Step * (CRD - 1)).
- for (User *U : OrigPhi->users()) {
- auto *UI = cast<Instruction>(U);
- if (!OrigLoop->contains(UI)) {
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
- IRBuilder<> B(MiddleBlock->getTerminator());
-
- // Fast-math-flags propagate from the original induction instruction.
- if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
- B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
-
- VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
- assert(StepVPV && "step must have been expanded during VPlan execution");
- Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
- : State.get(StepVPV, VPLane(0));
- Value *Escape = nullptr;
- if (EndValue->getType()->isIntegerTy())
- Escape = B.CreateSub(EndValue, Step);
- else if (EndValue->getType()->isPointerTy())
- Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
- else {
- assert(EndValue->getType()->isFloatingPointTy() &&
- "Unexpected induction type");
- Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
- Instruction::FAdd
- ? Instruction::FSub
- : Instruction::FAdd,
- EndValue, Step);
- }
- Escape->setName("ind.escape");
- MissingVals[UI] = Escape;
- }
- }
-
- assert((MissingVals.empty() ||
- all_of(MissingVals,
- [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
- return all_of(
- predecessors(cast<Instruction>(P.first)->getParent()),
- [MiddleBlock, this](BasicBlock *Pred) {
- return Pred == MiddleBlock ||
- Pred == OrigLoop->getLoopLatch();
- });
- })) &&
- "Expected escaping values from latch/middle.block only");
-
- for (auto &I : MissingVals) {
- PHINode *PHI = cast<PHINode>(I.first);
- // One corner case we have to handle is two IVs "chasing" each-other,
- // that is %IV2 = phi [...], [ %IV1, %latch ]
- // In this case, if IV1 has an external use, we need to avoid adding both
- // "last value of IV1" and "penultimate value of IV2". So, verify that we
- // don't already have an incoming value for the middle block.
- if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
- PHI->addIncoming(I.second, MiddleBlock);
- }
-}
-
namespace {
struct CSEDenseMapInfo {
@@ -2994,24 +2894,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
- // No edge from the middle block to the unique exit block has been inserted
- // and there is nothing to fix from vector loop; phis should have incoming
- // from scalar loop only.
- } else {
- // TODO: Check in VPlan to see if IV users need fixing instead of checking
- // the cost model.
-
- // If we inserted an edge from the middle block to the unique exit block,
- // update uses outside the loop (phis) to account for the newly inserted
- // edge.
-
- // Fix-up external users of the induction variables.
- for (const auto &Entry : Legal->getInductionVars())
- fixupIVUsers(Entry.first, Entry.second,
- getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
- }
-
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
@@ -8866,11 +8748,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction.
-static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
- VPBuilder &VectorPHBuilder,
- VPBuilder &ScalarPHBuilder,
- VPTypeAnalysis &TypeInfo,
- VPValue *VectorTC) {
+static VPValue *addResumePhiRecipeForInduction(
+ VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
+ VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC,
+ DenseMap<VPValue *, VPValue *> &EndValues) {
auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
// Truncated wide inductions resume from the last lane of their vector value
// in the last vector iteration which is handled elsewhere.
@@ -8895,6 +8776,7 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
ScalarTypeOfWideIV);
}
+ EndValues[WideIV] = EndValue;
auto *ResumePhiRecipe =
ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
WideIV->getDebugLoc(), "bc.resume.val");
@@ -8904,7 +8786,9 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header.
-static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
+static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
+ Loop *OrigLoop,
+ DenseMap<VPValue *, VPValue *> &EndValues) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
@@ -8924,7 +8808,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
- &Plan.getVectorTripCount())) {
+ &Plan.getVectorTripCount(), EndValues)) {
ScalarPhiIRI->addOperand(ResumePhi);
continue;
}
@@ -9009,9 +8893,9 @@ static bool isIVUse(VPValue *Incoming) {
// modeled explicitly yet and won't be included. Those are un-truncated
// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
// increments.
-static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
- Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan
- ) {
+static SetVector<VPIRInstruction *>
+collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
+ VPlan &Plan) {
auto *MiddleVPBB = Plan.getMiddleBlock();
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
@@ -9033,11 +8917,6 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
}
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
- // Exit values for inductions are computed and updated outside of VPlan
- // and independent of induction recipes.
- // TODO: Compute induction exit values in VPlan.
- if (isIVUse(V) && ExitVPBB->getSinglePredecessor() == MiddleVPBB)
- continue;
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
}
@@ -9046,17 +8925,86 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
return ExitUsersToFix;
}
+/// If \p Incoming is a user of a non-truncated induction, create recipes to
+/// compute the final value and update the user \p ExitIRI.
+static bool addInductionEndValue(
+ VPlan &Plan, VPIRInstruction *ExitIRI, VPValue *Incoming,
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
+ DenseMap<VPValue *, VPValue *> &EndValues, VPTypeAnalysis &TypeInfo) {
+ if ((isa<VPWidenIntOrFpInductionRecipe>(Incoming) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(Incoming)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(Incoming) ||
+ (isa<Instruction>(Incoming->getUnderlyingValue()) &&
+ any_of(cast<Instruction>(Incoming->getUnderlyingValue())->users(),
+ [&Inductions](User *U) {
+ auto *P = dyn_cast<PHINode>(U);
+ return P && Inductions.contains(P);
+ }))) {
+ VPValue *IV;
+ if (auto *WideIV =
+ dyn_cast<VPWidenInductionRecipe>(Incoming->getDefiningRecipe()))
+ IV = WideIV;
+ else if (auto *WideIV =
+ dyn_cast<VPWidenInductionRecipe>(Incoming->getDefiningRecipe()
+ ->getOperand(0)
+ ->getDefiningRecipe()))
+ IV = WideIV;
+ else
+ IV = Incoming->getDefiningRecipe()->getOperand(1);
+ // Skip phi nodes already updated. This can be the case if 2 induction
+ // phis chase each other.
+ VPValue *EndValue = EndValues[IV];
+ if (any_of(cast<VPRecipeBase>(Incoming->getDefiningRecipe())->operands(),
+ IsaPred<VPWidenIntOrFpInductionRecipe,
+ VPWidenPointerInductionRecipe>)) {
+ ExitIRI->setOperand(0, EndValue);
+ return true;
+ }
+
+ VPBuilder B(Plan.getMiddleBlock()->getTerminator());
+ VPValue *Escape = nullptr;
+ auto *WideIV = cast<VPWidenInductionRecipe>(IV->getDefiningRecipe());
+ VPValue *Step = WideIV->getStepValue();
+ Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
+ if (ScalarTy->isIntegerTy())
+ Escape =
+ B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
+ else if (ScalarTy->isPointerTy())
+ Escape = B.createPtrAdd(
+ EndValue,
+ B.createNaryOp(Instruction::Sub,
+ {Plan.getOrAddLiveIn(ConstantInt::get(
+ Step->getLiveInIRValue()->getType(), 0)),
+ Step}),
+ {}, "ind.escape");
+ else if (ScalarTy->isFloatingPointTy()) {
+ const auto &ID = WideIV->getInductionDescriptor();
+ Escape = B.createNaryOp(
+ ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
+ ? Instruction::FSub
+ : Instruction::FAdd,
+ {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
+ } else {
+ llvm_unreachable("all possible induction types must be handled");
+ }
+ ExitIRI->setOperand(0, Escape);
+ return true;
+ }
+ return false;
+}
// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated. Returns true if all
// exit users can be handled, otherwise return false.
-static bool
-addUsersInExitBlocks(VPlan &Plan,
- const SetVector<VPIRInstruction *> &ExitUsersToFix) {
+static bool addUsersInExitBlocks(
+ VPlan &Plan, const SetVector<VPIRInstruction *> &ExitUsersToFix,
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
+ DenseMap<VPValue *, VPValue *> &EndValues) {
if (ExitUsersToFix.empty())
return true;
auto *MiddleVPBB = Plan.getMiddleBlock();
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
// Introduce extract for exiting values and update the VPIRInstructions
// modeling the corresponding LCSSA phis.
@@ -9072,11 +9020,16 @@ addUsersInExitBlocks(VPlan &Plan,
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
return false;
+ VPValue *Incoming = ExitIRI->getOperand(0);
+ if (addInductionEndValue(Plan, ExitIRI, Incoming, Inductions, EndValues,
+ TypeInfo))
+ continue;
+
LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
{Op, Plan.getOrAddLiveIn(ConstantInt::get(
IntegerType::get(Ctx, 32), 1))});
- ExitIRI->setOperand(Idx, Ext);
+ ExitIRI->setOperand(0, Ext);
}
}
return true;
@@ -9371,11 +9324,13 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPlanTransforms::handleUncountableEarlyExit(
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
}
- addScalarResumePhis(RecipeBuilder, *Plan);
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
- OrigLoop, RecipeBuilder, *Plan);
+ DenseMap<VPValue *, VPValue *> EndValues;
+ addScalarResumePhis(RecipeBuilder, *Plan, OrigLoop, EndValues);
+ SetVector<VPIRInstruction *> ExitUsersToFix =
+ collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
+ if (!addUsersInExitBlocks(*Plan, ExitUsersToFix,
+ Legal->getInductionVars(), EndValues)) {
reportVectorizationFailure(
"Some exit values in loop with uncountable exit not supported yet",
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
@@ -9502,7 +9457,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
}
- addScalarResumePhis(RecipeBuilder, *Plan);
+ DenseMap<VPValue *, VPValue *> EndValues;
+ addScalarResumePhis(RecipeBuilder, *Plan, OrigLoop, EndValues);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
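The fixupIVUsers logic removed earlier in this file distinguished two kinds of escaping IV values, and the VPlan-based replacement has to preserve both: a user of the post-increment sees the final value, while a user of the phi sees the penultimate one. In IR terms (hypothetical sketch):

  define i64 @escapes(i64 %n) {
  entry:
    br label %loop
  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %iv.next = add i64 %iv, 1
    %ec = icmp eq i64 %iv.next, %n
    br i1 %ec, label %exit, label %loop
  exit:
    %last = phi i64 [ %iv.next, %loop ]   ; final value, i.e. %n
    %penult = phi i64 [ %iv, %loop ]      ; penultimate value, %n - 1,
                                          ; i.e. EndValue - Step
    %sum = add i64 %last, %penult
    ret i64 %sum
  }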
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 06c36396a17f38..bd8c2e28ab36bc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -311,16 +311,20 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
LastLane = 0;
}
- auto *LastInst = cast<Instruction>(get(Def, LastLane));
- // Set the insert point after the last scalarized instruction or after the
- // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
- // will directly follow the scalar definitions.
+ auto *LastDef = get(Def, LastLane);
auto OldIP = Builder.saveIP();
- auto NewIP =
- isa<PHINode>(LastInst)
- ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
- : std::next(BasicBlock::iterator(LastInst));
- Builder.SetInsertPoint(&*NewIP);
+ if (auto *LastInst = dyn_cast<Instruction>(LastDef)) {
+ // TODO: Remove once VPDerivedIVRecipe can be simplified, which requires
+ // vector trip count being modeled in VPlan.
+ // Set the insert point after the last scalarized instruction or after the
+ // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
+ // will directly follow the scalar definitions.
+ auto NewIP =
+ isa<PHINode>(LastInst)
+ ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
+ : std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+ }
// However, if we are vectorizing, we need to construct the vector values.
// If the value is known to be uniform after vectorization, we can just
@@ -335,7 +339,7 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
} else {
// Initialize packing with insertelements to start from undef.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
+ Value *Undef = PoisonValue::get(VectorType::get(LastDef->getType(), VF));
set(Def, Undef);
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
packScalarIntoVectorValue(Def, Lane);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1be57d23f19cf7..a109bc23507990 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1416,6 +1416,11 @@ class VPIRInstruction : public VPRecipeBase {
"Op must be an operand of the recipe");
return true;
}
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
};
/// VPWidenRecipe is a recipe for producing a widened instruction using the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 77c08839dbfa95..3514a5a9d4467d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -843,7 +843,7 @@ void VPIRInstruction::execute(VPTransformState &State) {
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
- State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
+ State.Builder.SetInsertPoint(&*PredBB->getTerminator());
Value *V = State.get(ExitValue, VPLane(Lane));
auto *Phi = cast<PHINode>(&I);
// If there is no existing block for PredBB in the phi, add a new incoming
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 951d833fa941e8..f630f4f21e065f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -230,7 +230,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: LV: Loop does not require scalar epilogue
;
entry:
%cmp7 = icmp sgt i32 %n, 0
@@ -480,7 +479,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: LV: Loop does not require scalar epilogue
;
entry:
%cmp7 = icmp sgt i32 %n, 0
From 4b3ff6332b04b6170b01d7965365423876310485 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 30 Dec 2024 13:47:04 +0000
Subject: [PATCH 4/4] !fixup Turn into VPlan transform
---
.../Transforms/Vectorize/LoopVectorize.cpp | 163 +++---------------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 6 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 121 +++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
.../single_early_exit_live_outs.ll | 5 +-
6 files changed, 153 insertions(+), 148 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6c0b47a427f688..421d915bf55cab 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8748,10 +8748,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction.
-static VPValue *addResumePhiRecipeForInduction(
+static VPInstruction *addResumePhiRecipeForInduction(
VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
- VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC,
- DenseMap<VPValue *, VPValue *> &EndValues) {
+ VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
// Truncated wide inductions resume from the last lane of their vector value
// in the last vector iteration which is handled elsewhere.
@@ -8776,7 +8775,6 @@ static VPValue *addResumePhiRecipeForInduction(
ScalarTypeOfWideIV);
}
- EndValues[WideIV] = EndValue;
auto *ResumePhiRecipe =
ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
WideIV->getDebugLoc(), "bc.resume.val");
@@ -8785,10 +8783,10 @@ static VPValue *addResumePhiRecipeForInduction(
/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
-/// original phis in the scalar header.
+/// original phis in the scalar header. End values for inductions are added to
+/// \p IVEndValues.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
- Loop *OrigLoop,
- DenseMap<VPValue *, VPValue *> &EndValues) {
+ DenseMap<VPValue *, VPValue *> &IVEndValues) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
@@ -8806,9 +8804,10 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
- if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
+ if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
- &Plan.getVectorTripCount(), EndValues)) {
+ &Plan.getVectorTripCount())) {
+ IVEndValues[WideIVR] = ResumePhi->getOperand(0);
ScalarPhiIRI->addOperand(ResumePhi);
continue;
}
@@ -8837,57 +8836,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
}
}
-static bool isIVUse(VPValue *Incoming) {
- VPRecipeBase *IncomingDef = Incoming->getDefiningRecipe();
- if (!IncomingDef)
- return false;
- auto *WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef);
- if (WideIV) {
- return isa<VPWidenPointerInductionRecipe>(WideIV) || !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
- }
-
- if (IncomingDef->getNumOperands() != 2)
- return false;
- WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef->getOperand(0));
- if (!WideIV)
- WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef->getOperand(1));
- if (!WideIV)
- return false;
-
- using namespace VPlanPatternMatch;
- auto &ID = WideIV->getInductionDescriptor();
- switch (ID.getInductionOpcode()) {
- case Instruction::Add:
- return match(Incoming,
- m_c_Binary<Instruction::Add>(
- m_VPValue(), m_Specific(WideIV->getStepValue())));
- case Instruction::FAdd:
- return match(Incoming,
- m_c_Binary<Instruction::FAdd>(
- m_VPValue(), m_Specific(WideIV->getStepValue())));
- case Instruction::FSub:
- return match(Incoming,
- m_Binary<Instruction::FSub>(
- m_VPValue(), m_Specific(WideIV->getStepValue())));
- case Instruction::Sub: {
- VPValue *Step;
- return match(Incoming,
- m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) &&
- Step->isLiveIn() && WideIV->getStepValue()->isLiveIn() &&
- (cast<ConstantInt>(Step->getLiveInIRValue())->getValue() +
- cast<ConstantInt>(WideIV->getStepValue()->getLiveInIRValue())
- ->getValue())
- .isZero();
- }
- default:
- return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
- match(
- Incoming,
- m_GetElementPtr(m_VPValue(), m_Specific(WideIV->getStepValue())));
- }
- llvm_unreachable("should have been covered by switch above");
-}
-
// Collect VPIRInstructions for phis in the exit blocks that are modeled
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
// modeled explicitly yet and won't be included. Those are un-truncated
@@ -8925,80 +8873,13 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
return ExitUsersToFix;
}
-/// If \p Incoming is a user of a non-truncated induction, create recipes to
-/// compute the final value and update the user \p ExitIRI.
-static bool addInductionEndValue(
- VPlan &Plan, VPIRInstruction *ExitIRI, VPValue *Incoming,
- const MapVector<PHINode *, InductionDescriptor> &Inductions,
- DenseMap<VPValue *, VPValue *> &EndValues, VPTypeAnalysis &TypeInfo) {
- if ((isa<VPWidenIntOrFpInductionRecipe>(Incoming) &&
- !cast<VPWidenIntOrFpInductionRecipe>(Incoming)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(Incoming) ||
- (isa<Instruction>(Incoming->getUnderlyingValue()) &&
- any_of(cast<Instruction>(Incoming->getUnderlyingValue())->users(),
- [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- }))) {
- VPValue *IV;
- if (auto *WideIV =
- dyn_cast<VPWidenInductionRecipe>(Incoming->getDefiningRecipe()))
- IV = WideIV;
- else if (auto *WideIV =
- dyn_cast<VPWidenInductionRecipe>(Incoming->getDefiningRecipe()
- ->getOperand(0)
- ->getDefiningRecipe()))
- IV = WideIV;
- else
- IV = Incoming->getDefiningRecipe()->getOperand(1);
- // Skip phi nodes already updated. This can be the case if 2 induction
- // phis chase each other.
- VPValue *EndValue = EndValues[IV];
- if (any_of(cast<VPRecipeBase>(Incoming->getDefiningRecipe())->operands(),
- IsaPred<VPWidenIntOrFpInductionRecipe,
- VPWidenPointerInductionRecipe>)) {
- ExitIRI->setOperand(0, EndValue);
- return true;
- }
-
- VPBuilder B(Plan.getMiddleBlock()->getTerminator());
- VPValue *Escape = nullptr;
- auto *WideIV = cast<VPWidenInductionRecipe>(IV->getDefiningRecipe());
- VPValue *Step = WideIV->getStepValue();
- Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
- if (ScalarTy->isIntegerTy())
- Escape =
- B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
- else if (ScalarTy->isPointerTy())
- Escape = B.createPtrAdd(
- EndValue,
- B.createNaryOp(Instruction::Sub,
- {Plan.getOrAddLiveIn(ConstantInt::get(
- Step->getLiveInIRValue()->getType(), 0)),
- Step}),
- {}, "ind.escape");
- else if (ScalarTy->isFloatingPointTy()) {
- const auto &ID = WideIV->getInductionDescriptor();
- Escape = B.createNaryOp(
- ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
- ? Instruction::FSub
- : Instruction::FAdd,
- {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
- } else {
- llvm_unreachable("all possible induction types must be handled");
- }
- ExitIRI->setOperand(0, Escape);
- return true;
- }
- return false;
-}
// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated. Returns true if all
// exit users can be handled, otherwise return false.
-static bool addUsersInExitBlocks(
- VPlan &Plan, const SetVector<VPIRInstruction *> &ExitUsersToFix,
- const MapVector<PHINode *, InductionDescriptor> &Inductions,
- DenseMap<VPValue *, VPValue *> &EndValues) {
+static bool
+addUsersInExitBlocks(VPlan &Plan,
+ const SetVector<VPIRInstruction *> &ExitUsersToFix,
+ DenseMap<VPValue *, VPValue *> &IVEndValues) {
if (ExitUsersToFix.empty())
return true;
@@ -9020,16 +8901,11 @@ static bool addUsersInExitBlocks(
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
return false;
- VPValue *Incoming = ExitIRI->getOperand(0);
- if (addInductionEndValue(Plan, ExitIRI, Incoming, Inductions, EndValues,
- TypeInfo))
- continue;
-
LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
{Op, Plan.getOrAddLiveIn(ConstantInt::get(
IntegerType::get(Ctx, 32), 1))});
- ExitIRI->setOperand(0, Ext);
+ ExitIRI->setOperand(Idx, Ext);
}
}
return true;
@@ -9324,13 +9200,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPlanTransforms::handleUncountableEarlyExit(
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
}
- DenseMap<VPValue *, VPValue *> EndValues;
- addScalarResumePhis(RecipeBuilder, *Plan, OrigLoop, EndValues);
+ DenseMap<VPValue *, VPValue *> IVEndValues;
+ addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
SetVector<VPIRInstruction *> ExitUsersToFix =
collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- if (!addUsersInExitBlocks(*Plan, ExitUsersToFix,
- Legal->getInductionVars(), EndValues)) {
+ if (!addUsersInExitBlocks(*Plan, ExitUsersToFix, IVEndValues)) {
reportVectorizationFailure(
"Some exit values in loop with uncountable exit not supported yet",
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
@@ -9409,6 +9284,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
WithoutRuntimeCheck);
}
+
+ VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
return Plan;
}
@@ -9457,8 +9334,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
}
- DenseMap<VPValue *, VPValue *> EndValues;
- addScalarResumePhis(RecipeBuilder, *Plan, OrigLoop, EndValues);
+ DenseMap<VPValue *, VPValue *> IVEndValues;
+ addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index bd8c2e28ab36bc..5514f494f18262 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -311,9 +311,9 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
LastLane = 0;
}
- auto *LastDef = get(Def, LastLane);
+ auto *LastValue = get(Def, LastLane);
auto OldIP = Builder.saveIP();
- if (auto *LastInst = dyn_cast<Instruction>(LastDef)) {
+ if (auto *LastInst = dyn_cast<Instruction>(LastValue)) {
// TODO: Remove once VPDerivedIVRecipe can be simplified, which requires
// vector trip count being modeled in VPlan.
// Set the insert point after the last scalarized instruction or after the
@@ -339,7 +339,7 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
} else {
// Initialize packing with insertelements to start from undef.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- Value *Undef = PoisonValue::get(VectorType::get(LastDef->getType(), VF));
+ Value *Undef = PoisonValue::get(VectorType::get(LastValue->getType(), VF));
set(Def, Undef);
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
packScalarIntoVectorValue(Def, Lane);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3514a5a9d4467d..77c08839dbfa95 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -843,7 +843,7 @@ void VPIRInstruction::execute(VPTransformState &State) {
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
- State.Builder.SetInsertPoint(&*PredBB->getTerminator());
+ State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
Value *V = State.get(ExitValue, VPLane(Lane));
auto *Phi = cast<PHINode>(&I);
// If there is no existing block for PredBB in the phi, add a new incoming
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8ac2bd5160c267..e713d297cb5281 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -624,6 +624,127 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
}
}
+static VPWidenInductionRecipe *isIVUse(VPValue *Incoming) {
+ auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Incoming);
+ if (WideIV) {
+ auto *WideIntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+ if (WideIntOrFpIV && WideIntOrFpIV->getTruncInst())
+ return nullptr;
+ return WideIV;
+ }
+
+ VPRecipeBase *IncomingDef = Incoming->getDefiningRecipe();
+ if (!IncomingDef || IncomingDef->getNumOperands() != 2)
+ return nullptr;
+
+ WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef->getOperand(0));
+ if (!WideIV)
+ WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingDef->getOperand(1));
+ if (!WideIV)
+ return nullptr;
+
+ auto IsWideIVInc = [&]() {
+ using namespace VPlanPatternMatch;
+ auto &ID = WideIV->getInductionDescriptor();
+ switch (ID.getInductionOpcode()) {
+ case Instruction::Add:
+ return match(Incoming,
+ m_c_Binary<Instruction::Add>(
+ m_VPValue(), m_Specific(WideIV->getStepValue())));
+ case Instruction::FAdd:
+ return match(Incoming,
+ m_c_Binary<Instruction::FAdd>(
+ m_VPValue(), m_Specific(WideIV->getStepValue())));
+ case Instruction::FSub:
+ return match(Incoming,
+ m_Binary<Instruction::FSub>(
+ m_VPValue(), m_Specific(WideIV->getStepValue())));
+ case Instruction::Sub: {
+ VPValue *Step;
+ return match(Incoming,
+ m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) &&
+ Step->isLiveIn() && WideIV->getStepValue()->isLiveIn() &&
+ (cast<ConstantInt>(Step->getLiveInIRValue())->getValue() +
+ cast<ConstantInt>(WideIV->getStepValue()->getLiveInIRValue())
+ ->getValue())
+ .isZero();
+ }
+ default:
+ return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+ match(Incoming,
+ m_GetElementPtr(m_VPValue(),
+ m_Specific(WideIV->getStepValue())));
+ }
+ llvm_unreachable("should have been covered by switch above");
+ };
+ return IsWideIVInc() ? WideIV : nullptr;
+}
+
+void VPlanTransforms::optimizeInductionExitUsers(
+ VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
+ using namespace VPlanPatternMatch;
+ SmallVector<VPIRBasicBlock *> ExitVPBBs(Plan.getExitBlocks());
+ if (ExitVPBBs.size() != 1)
+ return;
+
+ VPIRBasicBlock *ExitVPBB = ExitVPBBs[0];
+ VPBlockBase *PredVPBB = ExitVPBB->getSinglePredecessor();
+ if (!PredVPBB)
+ return;
+ assert(PredVPBB == Plan.getMiddleBlock() &&
+ "predecessor must be the middle block");
+
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+ VPBuilder B(Plan.getMiddleBlock()->getTerminator());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ auto *ExitIRI = cast<VPIRInstruction>(&R);
+ if (!isa<PHINode>(ExitIRI->getInstruction()))
+ break;
+
+ VPValue *Incoming;
+ if (!match(ExitIRI->getOperand(0),
+ m_VPInstruction<VPInstruction::ExtractFromEnd>(
+ m_VPValue(Incoming), m_SpecificInt(1))))
+ continue;
+
+ auto *WideIV = isIVUse(Incoming);
+ if (!WideIV)
+ continue;
+ VPValue *EndValue = EndValues.lookup(WideIV);
+ if (!EndValue)
+ continue;
+
+ if (Incoming != WideIV) {
+ ExitIRI->setOperand(0, EndValue);
+ continue;
+ }
+
+ VPValue *Escape = nullptr;
+ VPValue *Step = WideIV->getStepValue();
+ Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
+ if (ScalarTy->isIntegerTy()) {
+ Escape =
+ B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
+ } else if (ScalarTy->isPointerTy()) {
+ auto *Zero = Plan.getOrAddLiveIn(
+ ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
+ Escape = B.createPtrAdd(EndValue,
+ B.createNaryOp(Instruction::Sub, {Zero, Step}),
+ {}, "ind.escape");
+ } else if (ScalarTy->isFloatingPointTy()) {
+ const auto &ID = WideIV->getInductionDescriptor();
+ Escape = B.createNaryOp(
+ ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
+ ? Instruction::FSub
+ : Instruction::FAdd,
+ {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
+ } else {
+ llvm_unreachable("all possible induction types must be handled");
+ }
+ ExitIRI->setOperand(0, Escape);
+ }
+}
+
/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
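optimizeInductionExitUsers above materializes the penultimate ("ind.escape") value in the middle block by stepping the end value back once. Written out per induction type, the emitted IR looks roughly like this (value names hypothetical):

  ; integer IV: EndValue - Step
  %ind.escape = sub i64 %end.value, %step
  ; pointer IV: EndValue + (0 - Step), emitted as a byte-based ptradd
  %step.neg = sub i64 0, %step
  %ind.escape.ptr = getelementptr i8, ptr %end.value, i64 %step.neg
  ; FP IV: flip the induction binop (FAdd <-> FSub), reusing its
  ; fast-math flags
  %ind.escape.fp = fsub fast float %end.value.fp, %step.fp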
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index fddde868911665..9a1a90b3eef293 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -137,6 +137,10 @@ struct VPlanTransforms {
/// Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToConcreteRecipes(VPlan &Plan);
+
+ static void
+ optimizeInductionExitUsers(VPlan &Plan,
+ DenseMap<VPValue *, VPValue *> &EndValues);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index 085438aa80f246..6e542bd873b8c3 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -532,6 +532,7 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
@@ -545,11 +546,13 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
@@ -570,7 +573,7 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ 67, [[MIDDLE_SPLIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
entry: