[llvm] This is a draft for enabling opt-in tail-folding on vectorized epilogue. (PR #181401)
Hassnaa Hamdi via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 13 10:59:49 PST 2026
https://github.com/hassnaaHamdi created https://github.com/llvm/llvm-project/pull/181401
Enable tail-folding on the vectorized epilogue, so that we can have:
- a vectorized (unpredicated) main loop, followed by
- a tail-folded vectorized epilogue loop.
>From cafcaf890b7dcd3c2073eab826ab713bde5de1b2 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Tue, 27 Jan 2026 11:40:08 +0000
Subject: [PATCH] Epilogue tail folding draft
---
.../Vectorize/LoopVectorizationLegality.h | 20 +-
llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 2 +-
.../Vectorize/LoopVectorizationLegality.cpp | 2 +-
.../Vectorize/LoopVectorizationPlanner.h | 32 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 457 ++++++++++++++----
llvm/lib/Transforms/Vectorize/VPlan.cpp | 18 +-
.../Vectorize/VPlanConstruction.cpp | 11 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 3 +
.../Transforms/Vectorize/VPlanTransforms.h | 3 +-
.../AArch64/sve-epilog-tail-folded-vect.ll | 393 +++++++++++++++
10 files changed, 826 insertions(+), 115 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index f82fc588639dd..8b39f4d3a2bdd 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -430,9 +430,12 @@ class LoopVectorizationLegality {
return LAI->getDepChecker().getStoreLoadForwardSafeDistanceInBits();
}
- /// Returns true if vector representation of the instruction \p I
- /// requires mask.
- bool isMaskRequired(const Instruction *I) const {
+ /// Returns true if instruction \p I requires a mask for vectorization.
+ /// This accounts for both control flow masking (conditionally executed
+ /// blocks) and tail-folding masking (predicated loop vectorization).
+ bool isMaskRequired(const Instruction *I, bool TailFolded) const {
+ if (TailFolded)
+ return TailFoldedMaskedOp.contains(I);
return MaskedOp.contains(I);
}
@@ -709,9 +712,16 @@ class LoopVectorizationLegality {
AssumptionCache *AC;
/// While vectorizing these instructions we have to generate a
- /// call to the appropriate masked intrinsic or drop them in case of
- /// conditional assumes.
+ /// call to the appropriate masked intrinsic or drop them.
+ /// In order to differentiate between control flow introduced at the source
+ /// level and that introduced by the loop vectoriser during tail-folding, we
+ /// keep two lists:
+ /// 1) MaskedOp - instructions that need masking because they are in a
+ /// conditionally executed block.
+ /// 2) TailFoldedMaskedOp - instructions that need masking because of tail-
+ /// folding.
SmallPtrSet<const Instruction *, 8> MaskedOp;
+ SmallPtrSet<const Instruction *, 8> TailFoldedMaskedOp;
/// Contains all identified histogram operations, which are sequences of
/// load -> update -> store instructions where multiple lanes in a vector
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 6472e1771ec73..b0eb4dc961d28 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -152,7 +152,7 @@ void llvm::DeleteDeadBlocks(ArrayRef <BasicBlock *> BBs, DomTreeUpdater *DTU,
#ifndef NDEBUG
// Make sure that all predecessors of each dead block is also dead.
SmallPtrSet<BasicBlock *, 4> Dead(llvm::from_range, BBs);
- assert(Dead.size() == BBs.size() && "Duplicating blocks?");
+// assert(Dead.size() == BBs.size() && "Duplicating blocks?");
for (auto *BB : Dead)
for (BasicBlock *Pred : predecessors(BB))
assert(Dead.count(Pred) && "All predecessors must be dead!");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e57e0cf636501..835723362882e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -2156,7 +2156,7 @@ void LoopVectorizationLegality::prepareToFoldTailByMasking() {
// Mark all blocks for predication, including those that ordinarily do not
// need predication such as the header block.
for (BasicBlock *BB : TheLoop->blocks()) {
- [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
+ [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, TailFoldedMaskedOp);
assert(R && "Must be able to predicate block when tail-folding.");
}
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..39f8ffbb2a053 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -496,6 +496,7 @@ class LoopVectorizationPlanner {
/// The profitability analysis.
LoopVectorizationCostModel &CM;
+ LoopVectorizationCostModel *EpilogueTailFoldedCM;
/// The interleaved access analysis.
InterleavedAccessInfo &IAI;
@@ -507,6 +508,7 @@ class LoopVectorizationPlanner {
OptimizationRemarkEmitter *ORE;
SmallVector<VPlanPtr, 4> VPlans;
+ SmallVector<VPlanPtr, 4> EpilogueTailFoldedPlans;
/// Profitable vector factors.
SmallVector<VectorizationFactor, 8> ProfitableVFs;
@@ -538,7 +540,21 @@ class LoopVectorizationPlanner {
PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints,
OptimizationRemarkEmitter *ORE)
: OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
- IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
+ EpilogueTailFoldedCM(nullptr), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
+
+ void setEpilogueTailFoldingCM(LoopVectorizationCostModel *Cost) {
+ EpilogueTailFoldedCM = Cost;
+ }
+
+ LoopVectorizationCostModel *getEpilogueTailFoldingCM() const {
+ return EpilogueTailFoldedCM;
+ }
+
+ bool isEpilogueTailFolded() const;
+
+ void disableEpilogueTailFolding() {
+ EpilogueTailFoldedCM = nullptr;
+ }
/// Build VPlans for the specified \p UserVF and \p UserIC if they are
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
@@ -551,7 +567,7 @@ class LoopVectorizationPlanner {
/// Return the VPlan for \p VF. At the moment, there is always a single VPlan
/// for each VF.
- VPlan &getPlanFor(ElementCount VF) const;
+ VPlan &getPlanFor(ElementCount VF, bool ForEpilogue = false) const;
/// Compute and return the most profitable vectorization factor. Also collect
/// all profitable VFs in ProfitableVFs.
@@ -586,9 +602,9 @@ class LoopVectorizationPlanner {
/// Look through the existing plans and return true if we have one with
/// vectorization factor \p VF.
- bool hasPlanWithVF(ElementCount VF) const {
- return any_of(VPlans,
- [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
+ bool hasPlanWithVF(ElementCount VF, bool ForEpilogue = false) const {
+ return any_of((ForEpilogue && isEpilogueTailFolded()) ? EpilogueTailFoldedPlans : VPlans,
+ [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
}
/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
@@ -648,7 +664,8 @@ class LoopVectorizationPlanner {
/// set the largest included VF to the maximum VF for which no plan could be
/// built. Each VPlan is built starting from a copy of \p InitialPlan, which
/// is a plain CFG VPlan wrapping the original scalar loop.
- VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
+ VPlanPtr tryToBuildVPlanWithVPRecipes(LoopVectorizationCostModel *Cost,
+ VPlanPtr InitialPlan, VFRange &Range,
LoopVersioning *LVer);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
@@ -660,7 +677,8 @@ class LoopVectorizationPlanner {
/// ComputeReductionResult depending on the reduction) in
/// the middle block. Selects are introduced for reductions between the phi
/// and users outside the vector region when folding the tail.
- void addReductionResultComputation(VPlanPtr &Plan,
+ void addReductionResultComputation(LoopVectorizationCostModel *Cost,
+ VPlanPtr &Plan,
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 499c5a31421ed..6755f8a818812 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -207,7 +207,8 @@ namespace PreferPredicateTy {
enum Option {
ScalarEpilogue = 0,
PredicateElseScalarEpilogue,
- PredicateOrDontVectorize
+ PredicateOrDontVectorize,
+ PredicatedEpilogue
};
} // namespace PreferPredicateTy
@@ -227,7 +228,11 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
"predicate-dont-vectorize",
"prefers tail-folding, don't attempt vectorization if "
- "tail-folding fails.")));
+ "tail-folding fails."),
+ clEnumValN(PreferPredicateTy::PredicatedEpilogue,
+ "predicated-epilogue",
+ "prefers predicated vector epilogues, falling back on "
+ "scalar epilogues if it fails.")));
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"force-tail-folding-style", cl::desc("Force the tail folding style"),
@@ -642,10 +647,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
- ElementCount MinProfitableTripCount, unsigned UnrollFactor)
+ ElementCount MinProfitableTripCount, unsigned UnrollFactor,
+ bool isEpilogueTailFolded)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
UnrollFactor, CM, Checks, Plan),
- EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
+ EPI(EPI), MinProfitableTripCount(MinProfitableTripCount),
+ isEpilogueTailFolded(isEpilogueTailFolded) {}
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -657,6 +664,7 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
protected:
ElementCount MinProfitableTripCount;
+ bool isEpilogueTailFolded;
};
/// A specialized derived class of inner loop vectorizer that performs
@@ -670,10 +678,12 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
AssumptionCache *AC,
EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationCostModel *CM,
- GeneratedRTChecks &Check, VPlan &Plan)
+ GeneratedRTChecks &Check, VPlan &Plan,
+ bool isEpilogueTailFolded)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
Check, Plan, EPI.MainLoopVF,
- EPI.MainLoopVF, EPI.MainLoopUF) {}
+ EPI.MainLoopVF, EPI.MainLoopUF,
+ isEpilogueTailFolded) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (i.e., the first pass of VPlan execution).
BasicBlock *createVectorizedLoopSkeleton() final;
@@ -708,10 +718,12 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
AssumptionCache *AC,
EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationCostModel *CM,
- GeneratedRTChecks &Checks, VPlan &Plan)
+ GeneratedRTChecks &Checks, VPlan &Plan,
+ bool isEpilogueTailFolded)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
Checks, Plan, EPI.EpilogueVF,
- EPI.EpilogueVF, EPI.EpilogueUF) {}
+ EPI.EpilogueVF, EPI.EpilogueUF,
+ isEpilogueTailFolded) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
BasicBlock *createVectorizedLoopSkeleton() final;
@@ -861,8 +873,8 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedLowTripLoop,
// Loop hint predicate indicating an epilogue is undesired.
- CM_ScalarEpilogueNotNeededUsePredicate,
-
+ CM_ScalarEpilogueNotNeededUsePredicatedBody,
+ CM_ScalarEpilogueNotNeededUsePredicatedEpilogue,
// Directive indicating we must either tail fold or not vectorize
CM_ScalarEpilogueNotAllowedUsePredicate
};
@@ -966,6 +978,13 @@ class LoopVectorizationCostModel {
return MinBWs;
}
+ void copyMinimalBitwidths(const MapVector<Instruction *, uint64_t> &BWs) {
+ MinBWs.clear();
+ for (auto &BW : BWs) {
+ MinBWs.insert(BW);
+ }
+ }
+
/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
@@ -1234,6 +1253,10 @@ class LoopVectorizationCostModel {
/// \p VF is the vectorization factor that will be used to vectorize \p I.
bool isScalarWithPredication(Instruction *I, ElementCount VF);
+ /// Wrapper function for LoopVectorizationLegality::isMaskRequired,
+ /// that passes the Instruction \p I and if we fold tail.
+ bool isMaskRequired(Instruction *I) const;
+
/// Returns true if \p I is an instruction that needs to be predicated
/// at runtime. The result is independent of the predication mechanism.
/// Superset of instructions that return true for isScalarWithPredication.
@@ -1312,12 +1335,16 @@ class LoopVectorizationCostModel {
/// Returns true if a scalar epilogue is not allowed due to optsize or a
/// loop hint annotation.
bool isScalarEpilogueAllowed() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+ // We may have requested the creation of a predicated vector epilogue, but
+ // the cost model may still decide it's not worth it and should fall back
+ // on a scalar epilogue.
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
}
/// Returns true if tail-folding is preferred over a scalar epilogue.
bool preferPredicatedLoop() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
+ return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedBody ||
ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
}
@@ -1334,6 +1361,8 @@ class LoopVectorizationCostModel {
/// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ // TODO: Should probably have separate style for the main and epilogue.
+
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
if (!Legal->canFoldTailByMasking()) {
ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
@@ -1360,7 +1389,7 @@ class LoopVectorizationCostModel {
// If for some reason EVL mode is unsupported, fallback to a scalar epilogue
// if it's allowed, or DataWithoutLaneMask otherwise.
if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
- ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedBody)
ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
else
ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
@@ -1375,6 +1404,7 @@ class LoopVectorizationCostModel {
}
/// Returns true if all loop blocks should be masked to fold tail loop.
+ /// TODO: Distinguish according to predicated body or epilogue.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
// IVUpdateMayOverflow flag in getTailFoldingStyle.
@@ -1405,6 +1435,7 @@ class LoopVectorizationCostModel {
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
+ /// TODO: Distinguish according to predicated body or epilogue.
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
@@ -2381,6 +2412,7 @@ Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
} else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
!isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+ // TODO: What about the corresponding check in the epilogue loop?
// vscale is not necessarily a power-of-2, which means we cannot guarantee
// an overflow to zero when updating induction variables and so an
// additional overflow check is required before entering the vector loop.
@@ -2772,6 +2804,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// If tail-folding is applied, the primary induction variable will be used
// to feed a vector compare.
+ /// TODO: Distinguish according to predicated body or epilogue.
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;
@@ -2864,12 +2897,17 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
}
}
+bool LoopVectorizationCostModel::isMaskRequired(Instruction *I) const {
+ return Legal->isMaskRequired(I, foldTailByMasking());
+}
+
// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// TODO: We can use the loop-preheader as context point here and get
// context sensitive reasoning for isSafeToSpeculativelyExecute.
+ bool PredicatedLoop = foldTailByMasking();
if (isSafeToSpeculativelyExecute(I) ||
- (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
+ (isa<LoadInst, StoreInst, CallInst>(I) && !isMaskRequired(I)) ||
isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
return false;
@@ -2879,7 +2917,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
return true;
// If we're not folding the tail by masking, predication is unnecessary.
- if (!foldTailByMasking())
+ /// TODO: Distinguish according to predicated body or epilogue.
+ if (!PredicatedLoop)
return false;
// All that remain are instructions with side-effects originally executed in
@@ -2894,7 +2933,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
case Instruction::Call:
// Side-effects of a Call are assumed to be non-invariant, needing a
// (fold-tail) mask.
- assert(Legal->isMaskRequired(I) &&
+ assert(isMaskRequired(I) &&
"should have returned earlier for calls not needing a mask");
return true;
case Instruction::Load:
@@ -3041,8 +3080,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// (either a gap at the end of a load-access that may result in a speculative
// load, or any gaps in a store-access).
bool PredicatedAccessRequiresMasking =
- blockNeedsPredicationForAnyReason(I->getParent()) &&
- Legal->isMaskRequired(I);
+ blockNeedsPredicationForAnyReason(I->getParent()) && isMaskRequired(I);
bool LoadAccessWithGapsRequiresEpilogMasking =
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
!isScalarEpilogueAllowed();
@@ -3454,6 +3492,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
}
+// TODO: What does FoldTailByMasking mean here? Should it only be true
+// when predicating the main vector body or for the epilogue too?
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
bool FoldTailByMasking) {
@@ -3613,12 +3653,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false);
case CM_ScalarEpilogueNotAllowedUsePredicate:
[[fallthrough]];
- case CM_ScalarEpilogueNotNeededUsePredicate:
+ case CM_ScalarEpilogueNotNeededUsePredicatedBody:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n");
break;
+ case CM_ScalarEpilogueNotNeededUsePredicatedEpilogue:
+ break;
case CM_ScalarEpilogueNotAllowedLowTripLoop:
// fallthrough as a special case of OptForSize
case CM_ScalarEpilogueNotAllowedOptSize:
@@ -3648,9 +3690,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// none were taken so far.
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
-
+ bool TailFoldedIntoMainBody =
+ ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
FixedScalableVFPair MaxFactors =
- computeFeasibleMaxVF(MaxTC, UserVF, UserIC, true);
+ computeFeasibleMaxVF(MaxTC, UserVF, UserIC, TailFoldedIntoMainBody);
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -3723,6 +3766,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
+ // We don't want to set the tail-folding style on this cost model when we're
+ // only using predicated vector epilogues.
+ if (!TailFoldedIntoMainBody) {
+ LLVM_DEBUG(dbgs() << "LV: Using unpredicated vector body with predicated "
+ "vector epilogue.\n");
+ return MaxFactors;
+ }
+
// If we don't know the precise trip count, or if the trip count that we
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
@@ -3730,6 +3781,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
setTailFoldingStyles(ContainsScalableVF, UserIC);
if (foldTailByMasking()) {
+ // TODO: Should probably disallow predicated vector epilogues with EVL.
if (foldTailWithEVL()) {
LLVM_DEBUG(
dbgs()
@@ -3748,7 +3800,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedBody ||
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedEpilogue) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
@@ -3845,6 +3898,8 @@ ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
return VF;
}
+// Looks like FoldTailByMasking is mostly useful for a single predicated
+// body.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
@@ -4418,11 +4473,16 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// with vectorization factors larger than a certain value.
// Allow the target to opt out entirely.
+ // How does this interact with the option to create predicated vector
+ // epilogues?
if (!TTI.preferEpilogueVectorization())
return false;
// We also consider epilogue vectorization unprofitable for targets that don't
// consider interleaving beneficial (eg. MVE).
+
+ // TODO: This shouldn't be a restriction if the target prefers predicated
+ // vector epilogues.
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
@@ -4435,11 +4495,15 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
const ElementCount MainLoopVF, unsigned IC) {
VectorizationFactor Result = VectorizationFactor::Disabled();
+ // How does this interact with forcing predicated vector epilogues?
if (!EnableEpilogueVectorization) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
return Result;
}
+ // We should probably avoid creating the predicated plans much earlier if a
+ // scalar epilogue isn't allowed.
+ // Deliberately using unpredicated cost model here - is this right?
if (!CM.isScalarEpilogueAllowed()) {
LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
"epilogue is allowed.\n");
@@ -4457,7 +4521,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
if (EpilogueVectorizationForceVF > 1) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
- if (hasPlanWithVF(ForcedEC))
+ if (hasPlanWithVF(ForcedEC, /*ForEpilogue*/ true))
return {ForcedEC, 0, 0};
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
@@ -4471,17 +4535,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}
- if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
+ if (!EpilogueTailFoldedCM &&
+ !CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
"this loop\n");
return Result;
}
-
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
+
+ // FIXME: This doesn't consider interleaving, does it matter?
+ unsigned Multiplier = EpilogueTailFoldedCM ? IC : 1;
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
- estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
+ estimateElementCount(MainLoopVF * Multiplier, CM.getVScaleForTuning()));
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
@@ -4513,6 +4580,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
if (MainLoopVF.isFixed()) {
+ // TODO: extend to support scalable VFs.
MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {
@@ -4527,15 +4595,17 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
};
for (auto &NextVF : ProfitableVFs) {
// Skip candidate VFs without a corresponding VPlan.
- if (!hasPlanWithVF(NextVF.Width))
+ if (!hasPlanWithVF(NextVF.Width,
+ /*ForEpilogue*/ true))
continue;
// Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
// vectors) or > the VF of the main loop (fixed vectors).
+ // FIXME: Why are we using isKnownGE for scalable vectors?
if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
(NextVF.Width.isScalable() &&
- ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
+ ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
(!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
continue;
@@ -4545,6 +4615,12 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// TODO: We should also consider comparing against a scalable
// RemainingIterations when SCEV be able to evaluate non-canonical
// vscale-based expressions.
+
+ // TODO: For predicated vector epilogues it's ok for the epilogue VF
+ // width to be > remaining iterations. In fact, it can be a good
+ // thing if it allows us to remove the loop by eliminating the backedge
+ // branch. We just want to make sure we're using enough lanes in
+ // the vector for it to be worthwhile.
if (!ScalableRemIter) {
// Handle the case where NextVF and RemainingIterations are in different
// numerical spaces.
@@ -4556,9 +4632,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
continue;
}
+ // TODO: We really need to add support for calculating a MaxTripCount
+ // as this will be useful for predicated vector epilogues.
+
+ // TODO: Need to update last flag passed to isMoreProfitable.
if (Result.Width.isScalar() ||
- isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
- /*IsEpilogue*/ true))
+ isMoreProfitable(NextVF, Result, MaxTripCount,
+ EpilogueTailFoldedCM ? !EpilogueTailFoldedCM->foldTailByMasking() :
+ !CM.foldTailByMasking(), /*IsEpilogue*/ true))
Result = NextVF;
}
@@ -5323,7 +5404,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
- if (Legal->isMaskRequired(I)) {
+ if (isMaskRequired(I)) {
unsigned IID = I->getOpcode() == Instruction::Load
? Intrinsic::masked_load
: Intrinsic::masked_store;
@@ -5392,8 +5473,8 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
: Intrinsic::masked_scatter;
return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
TTI.getMemIntrinsicInstrCost(
- MemIntrinsicCostAttributes(IID, VectorTy, Ptr,
- Legal->isMaskRequired(I), Alignment, I),
+ MemIntrinsicCostAttributes(IID, VectorTy, Ptr, isMaskRequired(I),
+ Alignment, I),
CostKind);
}
@@ -5423,12 +5504,11 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
(isa<StoreInst>(I) && !Group->isFull());
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
- Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
- UseMaskForGaps);
+ Group->getAlign(), AS, CostKind, isMaskRequired(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
- assert(!Legal->isMaskRequired(I) &&
+ assert(!isMaskRequired(I) &&
"Reverse masked interleaved access not supported.");
Cost += Group->getNumMembers() *
TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
@@ -5724,6 +5804,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// stores. Note that even with tail folding we know that at least
// one lane is active (i.e. generalized predication is not possible
// here), and the logic below depends on this fact.
+ // TODO: Needs to reason about predicated main body vs epilogue.
if (!foldTailByMasking())
return true;
@@ -5976,7 +6057,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
continue;
}
- bool MaskRequired = Legal->isMaskRequired(CI);
+ bool MaskRequired = isMaskRequired(CI);
// Compute corresponding vector type for return value and arguments.
Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
@@ -6796,11 +6877,17 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
+bool LoopVectorizationPlanner::isEpilogueTailFolded() const {
+ return EpilogueTailFoldedCM && EpilogueTailFoldedCM->foldTailByMasking();
+}
+
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
+ // computeMaxVF -> computeFeasibleMaxVF -> MinBWs = computeMinimumValueSizes
+ // These need copying to EpilogueTailFoldedCM.
FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return;
@@ -6819,12 +6906,48 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}
- if (CM.foldTailByMasking())
+ // TODO: Use information from computeMaxVF to know if we should still use a
+ // predicated epilogue.
+ if (EpilogueTailFoldedCM) {
+    LLVM_DEBUG(dbgs() << "LV: Preparing predicated version of the cost model "
+                         "for the vector epilogue.\n");
+ EpilogueTailFoldedCM->collectValuesToIgnore();
+ EpilogueTailFoldedCM->collectElementTypesForWidening();
+    EpilogueTailFoldedCM->setTailFoldingStyles(
+        MaxFactors.ScalableVF.isNonZero(), UserIC);
+ // If the max VF is likely to be 2, then there probably isn't much point
+ // generating a predicated vector epilogue.
+ unsigned EstimatedMaxVF = MaxFactors.FixedVF.getFixedValue();
+ if (MaxFactors.ScalableVF.isNonZero()) {
+      unsigned EstimatedMaxScalableVF =
+          estimateElementCount(MaxFactors.ScalableVF, CM.getVScaleForTuning());
+ EstimatedMaxVF = std::max(EstimatedMaxVF, EstimatedMaxScalableVF);
+ }
+    if (EpilogueTailFoldedCM->foldTailByMasking() &&
+        !EpilogueTailFoldedCM->InterleaveInfo.hasGroups() &&
+        EstimatedMaxVF > 2) {
+ EpilogueTailFoldedCM->copyMinimalBitwidths(CM.getMinimalBitwidths());
+
+ // Invalidate interleave groups if all blocks of loop will be predicated.
+      if (EpilogueTailFoldedCM->blockNeedsPredicationForAnyReason(
+              OrigLoop->getHeader()) &&
+          !useMaskedInterleavedAccesses(TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+ "which requires masked-interleaved support.\n");
+ if (EpilogueTailFoldedCM->InterleaveInfo.invalidateGroups())
+ // Invalidating interleave groups also requires invalidating all decisions
+ // based on them, which includes widening decisions and uniform and scalar
+ // values.
+ EpilogueTailFoldedCM->invalidateCostModelingDecisions();
+ }
+    } else {
+      EpilogueTailFoldedCM = nullptr;
+    }
+ }
+
+ // TODO: Does this only apply for predicated main body?
+  if (CM.foldTailByMasking() ||
+      (EpilogueTailFoldedCM && EpilogueTailFoldedCM->foldTailByMasking()))
Legal->prepareToFoldTailByMasking();
ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
if (UserVF) {
+    assert(!EpilogueTailFoldedCM &&
+           "tail-folded vector epilogue not supported with a user-provided VF");
if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
reportVectorizationInfo(
"UserVF ignored because it may be larger than the maximal safe VF",
@@ -6861,6 +6984,14 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectNonVectorizedAndSetWideningDecisions(VF);
}
+ if (EpilogueTailFoldedCM) {
+ EpilogueTailFoldedCM->collectInLoopReductions();
+ for (const auto &VF : VFCandidates) {
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ EpilogueTailFoldedCM->collectNonVectorizedAndSetWideningDecisions(VF);
+ }
+ }
+
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
@@ -7290,6 +7421,56 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
}
}
+if (EpilogueTailFoldedCM) {
+ // Find profitable VFs for vector epilogue.
+ ProfitableVFs.clear();
+
+ for (auto &P : EpilogueTailFoldedPlans) {
+ ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
+ P->vectorFactors().end());
+
+ SmallVector<VPRegisterUsage, 8> RUs;
+ if (EpilogueTailFoldedCM->useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
+ EpilogueTailFoldedCM->useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+ RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, EpilogueTailFoldedCM->ValuesToIgnore);
+
+ for (unsigned I = 0; I < VFs.size(); I++) {
+ ElementCount VF = VFs[I];
+ if (VF.isScalar())
+ continue;
+ if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not considering vector loop of width " << VF
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
+ if (EpilogueTailFoldedCM->OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not considering vector loop of width " << VF
+ << " because it would cause replicated blocks to be generated,"
+ << " which isn't allowed when optimizing for size.\n");
+ continue;
+ }
+
+ InstructionCost Cost = cost(*P, VF);
+ VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+
+ if (EpilogueTailFoldedCM->shouldConsiderRegPressureForVF(VF) &&
+ RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
+ LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
+ << VF << " because it uses too many registers\n");
+ continue;
+ }
+
+ // If profitable add it to ProfitableVF list.
+ if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
+ ProfitableVFs.push_back(CurrentFactor);
+ }
+ }
+}
+
#ifndef NDEBUG
// Select the optimal vectorization factor according to the legacy cost-model.
// This is now only used to verify the decisions by the new VPlan-based
@@ -7638,10 +7819,17 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
}
- BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+ if (ForEpilogue && isEpilogueTailFolded) {
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, VectorPH, Builder.getFalse());
+ ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+ } else {
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
+ if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+ setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
+ ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+ }
// When vectorizing the main loop, its trip-count check is placed in a new
// block, whereas the overall trip-count check is placed in the VPlan entry
@@ -7660,8 +7848,13 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
/// entry block to the epilogue VPlan. The minimum iteration check is being
/// represented in VPlan.
BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
- BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
- BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
+ BasicBlock *OriginalScalarPH = nullptr;
+  if (isEpilogueTailFolded) {
+    OriginalScalarPH = OrigLoop->getLoopPreheader();
+  } else {
+    BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
+    OriginalScalarPH = NewScalarPH->getSinglePredecessor();
+  }
OriginalScalarPH->setName("vec.epilog.iter.check");
VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
VPBasicBlock *OldEntry = Plan.getEntry();
@@ -7719,7 +7912,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
// If a mask is not required, drop it - use unmasked version for safe loads.
// TODO: Determine if mask is needed in VPlan.
- VPValue *Mask = Legal->isMaskRequired(I) ? VPI->getMask() : nullptr;
+ VPValue *Mask = CM.isMaskRequired(I) ? VPI->getMask() : nullptr;
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -7986,7 +8179,7 @@ VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
// In case of predicated execution (due to tail-folding, or conditional
// execution, or both), pass the relevant mask.
- if (Legal->isMaskRequired(HI->Store))
+ if (CM.isMaskRequired(HI->Store))
HGramOps.push_back(VPI->getMask());
return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
@@ -8139,7 +8332,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
- if (auto Plan = tryToBuildVPlanWithVPRecipes(
+ if (auto Plan = tryToBuildVPlanWithVPRecipes(&CM,
std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
// Now optimize the initial VPlan.
VPlanTransforms::hoistPredicatedLoads(*Plan, PSE, OrigLoop);
@@ -8158,10 +8351,30 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
VF = SubRange.End;
}
+
+ if (!isEpilogueTailFolded())
+ return;
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+ VFRange SubRange = {VF, MaxVFTimes2};
+    if (auto Plan = tryToBuildVPlanWithVPRecipes(
+            EpilogueTailFoldedCM,
+            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
+ bool HasScalarVF = Plan->hasScalarVFOnly();
+ // Now optimize the initial VPlan.
+ if (!HasScalarVF)
+ RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths,
+ *Plan, EpilogueTailFoldedCM->getMinimalBitwidths());
+ RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
+      assert(!EpilogueTailFoldedCM->foldTailWithEVL() &&
+             "EVL tail folding is not supported for the vector epilogue");
+ assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
+ EpilogueTailFoldedPlans.push_back(std::move(Plan));
+ }
+ VF = SubRange.End;
+ }
}
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
- VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
+ LoopVectorizationCostModel *Cost, VPlanPtr Plan, VFRange &Range,
+ LoopVersioning *LVer) {
using namespace llvm::VPlanPatternMatch;
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8173,14 +8386,16 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
bool RequiresScalarEpilogueCheck =
LoopVectorizationPlanner::getDecisionAndClampRange(
- [this](ElementCount VF) {
- return !CM.requiresScalarEpilogue(VF.isVector());
+ [Cost](ElementCount VF) {
+ return !Cost->requiresScalarEpilogue(VF.isVector());
},
Range);
+
+ // TODO: foldTailByMasking needs to return different answers depending upon
+ // whether it's for the main body or the vector epilogue.
VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
- CM.foldTailByMasking());
-
+ Cost->foldTailByMasking());
VPlanTransforms::createLoopRegions(*Plan);
// Don't use getDecisionAndClampRange here, because we don't know the UF
@@ -8189,9 +8404,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Consider using getDecisionAndClampRange here to split up VPlans.
bool IVUpdateMayOverflow = false;
for (ElementCount VF : Range)
- IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
+ IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost, VF);
- TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+ // TODO: Is the answer the same for both vector main body and epilogue?
+ TailFoldingStyle Style = Cost->getTailFoldingStyle(IVUpdateMayOverflow);
// Use NUW for the induction increment if we proved that it won't overflow in
// the vector loop or when not folding the tail. In the later case, we know
// that the canonical induction increment will not overflow as the vector trip
@@ -8218,9 +8434,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// placeholders for its members' Recipes which we'll be replacing with a
// single VPInterleaveRecipe.
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
- auto ApplyIG = [IG, this](ElementCount VF) -> bool {
+ auto ApplyIG = [IG, Cost](ElementCount VF) -> bool {
bool Result = (VF.isVector() && // Query is illegal for VF == 1
- CM.getWideningDecision(IG->getInsertPos(), VF) ==
+ Cost->getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the interleave factors must be <= 8 since we
// require the (de)interleaveN intrinsics instead of shufflevectors.
@@ -8236,13 +8452,13 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// ---------------------------------------------------------------------------
// Predicate and linearize the top-level loop region.
// ---------------------------------------------------------------------------
- VPlanTransforms::introduceMasksAndLinearize(*Plan, CM.foldTailByMasking());
+ VPlanTransforms::introduceMasksAndLinearize(*Plan, Cost->foldTailByMasking());
// ---------------------------------------------------------------------------
// Construct wide recipes and apply predication for original scalar
// VPInstructions in the loop.
// ---------------------------------------------------------------------------
- VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);
+ VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, *Cost, Builder);
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -8337,7 +8553,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------
- addReductionResultComputation(Plan, RecipeBuilder, Range.Start);
+ addReductionResultComputation(Cost, Plan, RecipeBuilder, Range.Start);
// Optimize FindIV reductions to use sentinel-based approach when possible.
RUN_VPLAN_PASS(VPlanTransforms::optimizeFindIVReductions, *Plan, PSE,
@@ -8361,9 +8577,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
- if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
- OrigLoop);
+ if (!Cost->foldTailWithEVL()) {
+ VPCostContext CostCtx(Cost->TTI, *Cost->TLI, *Plan, *Cost, Cost->CostKind,
+ Cost->PSE, OrigLoop);
RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
Range);
RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
@@ -8378,7 +8594,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
- InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
+ InterleaveGroups, RecipeBuilder, Cost->isScalarEpilogueAllowed());
// Replace VPValues for known constant strides.
RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
@@ -8453,7 +8669,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
}
void LoopVectorizationPlanner::addReductionResultComputation(
- VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
+ LoopVectorizationCostModel *Cost, VPlanPtr &Plan,
+ VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
using namespace VPlanPatternMatch;
VPTypeAnalysis TypeInfo(*Plan);
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
@@ -8716,7 +8933,8 @@ void LoopVectorizationPlanner::addMinimumIterationCheck(
static ScalarEpilogueLowering getScalarEpilogueLowering(
Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
+ LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI,
+ bool ForEpilogue = false) {
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
if (F->hasOptSize() ||
@@ -8725,11 +8943,13 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
// 2) If set, obey the directives
if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+    if (ForEpilogue &&
+        PreferPredicateOverEpilogue == PreferPredicateTy::PredicatedEpilogue)
+      return CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
switch (PreferPredicateOverEpilogue) {
case PreferPredicateTy::ScalarEpilogue:
return CM_ScalarEpilogueAllowed;
case PreferPredicateTy::PredicateElseScalarEpilogue:
- return CM_ScalarEpilogueNotNeededUsePredicate;
+ return CM_ScalarEpilogueNotNeededUsePredicatedBody;
case PreferPredicateTy::PredicateOrDontVectorize:
return CM_ScalarEpilogueNotAllowedUsePredicate;
};
@@ -8738,7 +8958,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
// 3) If set, obey the hints
switch (Hints.getPredicate()) {
case LoopVectorizeHints::FK_Enabled:
- return CM_ScalarEpilogueNotNeededUsePredicate;
+ return CM_ScalarEpilogueNotNeededUsePredicatedBody;
case LoopVectorizeHints::FK_Disabled:
return CM_ScalarEpilogueAllowed;
};
@@ -8746,7 +8966,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
// 4) if the TTI hook indicates this is profitable, request predication.
TailFoldingInfo TFI(TLI, &LVL, IAI);
if (TTI->preferPredicateOverEpilogue(&TFI))
- return CM_ScalarEpilogueNotNeededUsePredicate;
+ return CM_ScalarEpilogueNotNeededUsePredicatedBody;
return CM_ScalarEpilogueAllowed;
}
@@ -9017,7 +9237,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
for (VPRecipeBase &R :
EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
+ if (isa<VPCanonicalIVPHIRecipe>(&R) || isa<VPActiveLaneMaskPHIRecipe>(&R))
continue;
EpiWidenedPhis.insert(
cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
@@ -9120,8 +9340,9 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
// FIXME: Improve modeling for canonical IV start values in the epilogue
// loop.
using namespace llvm::PatternMatch;
- PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
- for (Value *Inc : EPResumeVal->incoming_values()) {
+ PHINode *IVResumeVal = &*L->getLoopPreheader()->phis().begin();
+
+ for (Value *Inc : IVResumeVal->incoming_values()) {
if (match(Inc, m_SpecificInt(0)))
continue;
assert(!EPI.VectorTripCount &&
@@ -9134,20 +9355,22 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
// TODO: We should not choose VF * UF so the main vector loop is known to
// be dead.
if (!EPI.VectorTripCount) {
- assert(EPResumeVal->getNumIncomingValues() > 0 &&
- all_of(EPResumeVal->incoming_values(),
+ assert(IVResumeVal->getNumIncomingValues() > 0 &&
+ all_of(IVResumeVal->incoming_values(),
[](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
"all incoming values must be 0");
- EPI.VectorTripCount = EPResumeVal->getOperand(0);
+ EPI.VectorTripCount = IVResumeVal->getOperand(0);
}
- VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+ VPValue *VPV = Plan.getOrAddLiveIn(IVResumeVal);
assert(all_of(IV->users(),
[](const VPUser *U) {
return isa<VPScalarIVStepsRecipe>(U) ||
isa<VPDerivedIVRecipe>(U) ||
cast<VPRecipeBase>(U)->isScalarCast() ||
cast<VPInstruction>(U)->getOpcode() ==
- Instruction::Add;
+ Instruction::Add ||
+ cast<VPInstruction>(U)->getOpcode() ==
+ VPInstruction::CanonicalIVIncrementForPart;
}) &&
"the canonical IV should only be used by its increment or "
"ScalarIVSteps when resetting the start value");
@@ -9163,6 +9386,15 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
// handled above.
for (VPRecipeBase &R : drop_begin(Header->phis())) {
Value *ResumeV = nullptr;
+ if (isa<VPActiveLaneMaskPHIRecipe>(&R)) {
+ // Needs extracting from the start value ActiveLaneMask instruction.
+      auto *ALM = cast<VPInstruction>(
+          cast<VPActiveLaneMaskPHIRecipe>(&R)->getOperand(0));
+      assert(ALM->getOpcode() == VPInstruction::ActiveLaneMask &&
+             "resume value must come from an ActiveLaneMask instruction");
+ assert(IVResumeVal && "must have a resume value for the canonical IV");
+ VPValue *VPV = Plan.getOrAddLiveIn(IVResumeVal);
+ ALM->setOperand(0, VPV);
+ continue;
+ }
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
// Find the reduction result by searching users of the phi or its backedge
@@ -9281,7 +9513,8 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
Plan, EPI.TripCount, EPI.VectorTripCount,
CM.requiresScalarEpilogue(EPI.EpilogueVF.isVector()), EPI.EpilogueVF,
- EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
+ EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE,
+ CM.foldTailByMasking());
return InstsToMove;
}
@@ -9313,7 +9546,8 @@ static Value *createInductionAdditionalBypassValues(
return EndValueFromAdditionalBypass;
}
-static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
+static void fixScalarResumeValuesFromBypass(LoopVectorizationPlanner &LVP,
+ BasicBlock *BypassBlock, Loop *L,
VPlan &BestEpiPlan,
LoopVectorizationLegality &LVL,
const SCEV2ValueTy &ExpandedSCEVs,
@@ -9340,13 +9574,15 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
// Fix induction resume values from the additional bypass block.
IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
- for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
- auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
- Value *V = createInductionAdditionalBypassValues(
- IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
- LVL.getPrimaryInduction());
- // TODO: Directly add as extra operand to the VPResumePHI recipe.
- Inc->setIncomingValueForBlock(BypassBlock, V);
+ if (!LVP.isEpilogueTailFolded()) {
+ for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ Value *V = createInductionAdditionalBypassValues(
+ IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
+ LVL.getPrimaryInduction());
+ // TODO: Directly add as extra operand to the VPResumePHI recipe.
+ Inc->setIncomingValueForBlock(BypassBlock, V);
+ }
}
}
@@ -9356,6 +9592,7 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
// InstsToMove contains instructions that need to be moved to the preheader of
// the epilogue vector loop.
static void connectEpilogueVectorLoop(
+ LoopVectorizationPlanner &LVP,
VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
DominatorTree *DT, LoopVectorizationLegality &LVL,
DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
@@ -9437,11 +9674,11 @@ static void connectEpilogueVectorLoop(
auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
for (auto *I : InstsToMove)
I->moveBefore(IP);
-
+
// VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
// after executing the main loop. We need to update the resume values of
// inductions and reductions during epilogue vectorization.
- fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
+ fixScalarResumeValuesFromBypass(LVP, VecEpilogueIterationCountCheck, L, EpiPlan,
LVL, ExpandedSCEVs, EPI.VectorTripCount);
}
@@ -9576,7 +9813,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// with runtime checks. It's more effective to let
// `isOutsideLoopWorkProfitable` determine if vectorization is
// beneficial for the loop.
- if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
+ if (SEL != CM_ScalarEpilogueNotNeededUsePredicatedBody &&
+ SEL != CM_ScalarEpilogueNotNeededUsePredicatedEpilogue)
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
}
}
@@ -9630,6 +9868,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
GetBFI, F, &Hints, IAI, OptForSize);
+ ScalarEpilogueLowering EpilogueTailFoldedSEL =
+ getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, &IAI,
+ /*ForEpilogue*/ true);
+  LoopVectorizationCostModel EpilogueTailFoldedCM(
+      EpilogueTailFoldedSEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, GetBFI,
+      F, &Hints, IAI, OptForSize);
+
// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
@@ -9640,6 +9884,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
UserIC = 1;
+ // What about predicated hint?
+  if (EpilogueTailFoldedSEL ==
+          CM_ScalarEpilogueNotNeededUsePredicatedEpilogue &&
+      !CM.requiresScalarEpilogue(true)) {
+ bool HasReductions = !LVL.getReductionVars().empty();
+ bool HasSelectCmpReductions =
+ HasReductions &&
+ any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isMinMaxRecurrenceKind(RK);
+ });
+    if (!HasSelectCmpReductions)
+      LVP.setEpilogueTailFoldingCM(&EpilogueTailFoldedCM);
+ }
+
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);
VectorizationFactor VF = LVP.computeBestVF();
@@ -9712,7 +9974,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
"Ignoring user-specified interleave count due to possibly "
"unsafe dependencies in the loop."};
InterleaveLoop = false;
- } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
+ } else if (!LVP.hasPlanWithVF(VF.Width) &&
+ UserIC > 1) {
// Tell the user interleaving was avoided up-front, despite being explicitly
// requested.
LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
@@ -9835,27 +10098,30 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
- VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+ VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width, /*ForEpilogue*/ true);
BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
BestEpiPlan);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
- Checks, *BestMainPlan);
+ Checks, *BestMainPlan,
+ LVP.isEpilogueTailFolded());
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, false);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
- EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
- Checks, BestEpiPlan);
+    EpilogueVectorizerEpilogueLoop EpilogILV(
+        L, PSE, LI, DT, TTI, AC, EPI,
+        LVP.isEpilogueTailFolded() ? LVP.getEpilogueTailFoldingCM() : &CM,
+        Checks, BestEpiPlan, LVP.isEpilogueTailFolded());
SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
- BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
+        BestEpiPlan, L, ExpandedSCEVs, EPI,
+        LVP.isEpilogueTailFolded() ? *LVP.getEpilogueTailFoldingCM() : CM,
+        *PSE.getSE());
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
true);
- connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
+    if (!LVP.isEpilogueTailFolded())
+      connectEpilogueVectorLoop(LVP, BestEpiPlan, L, EPI, DT, LVL,
+                                ExpandedSCEVs, Checks, InstsToMove);
++LoopsEpilogueVectorized;
} else {
@@ -9872,6 +10138,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
++LoopsVectorized;
}
+  LLVM_DEBUG(dbgs() << "LV: function after vectorization:\n"; F->dump());
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
"DT not preserved correctly");
assert(!verifyFunction(*F, &dbgs()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 9b22c399d7acf..a0dc620206e6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1590,13 +1590,16 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
}
}
-VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
- assert(count_if(VPlans,
+VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF,
+ bool ForEpilogue) const {
+ bool UseEpilogueTailFoldedPlans = ForEpilogue && isEpilogueTailFolded();
+ assert(count_if(UseEpilogueTailFoldedPlans ? EpilogueTailFoldedPlans : VPlans,
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
1 &&
"Multiple VPlans for VF.");
- for (const VPlanPtr &Plan : VPlans) {
+ for (const VPlanPtr &Plan :
+ UseEpilogueTailFoldedPlans ? EpilogueTailFoldedPlans : VPlans) {
if (Plan->hasVF(VF))
return *Plan.get();
}
@@ -1745,6 +1748,15 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
Plan->printDOT(O);
else
Plan->print(O);
+
+ if (EpilogueTailFoldedPlans.empty())
+ return;
+ O << "LV: Printing out predicated plans\n";
+ for (const auto &Plan : EpilogueTailFoldedPlans)
+ if (PrintVPlansInDotFormat)
+ Plan->printDOT(O);
+ else
+ Plan->print(O);
}
#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index da1035847020a..315120bf0ed1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1120,10 +1120,17 @@ void VPlanTransforms::addMinimumIterationCheck(
void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
VPlan &Plan, Value *TripCount, Value *VectorTripCount,
bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
- unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE) {
+ unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE,
+ bool EpilogueTailFolded) {
// Add the minimum iteration check for the epilogue vector loop.
- VPValue *TC = Plan.getOrAddLiveIn(TripCount);
VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry()));
+ if (EpilogueTailFolded) {
+ // Always branch to epilogue vector loop (always false condition)
+ Builder.createNaryOp(VPInstruction::BranchOnCond, Plan.getFalse());
+ return;
+ }
+
+ VPValue *TC = Plan.getOrAddLiveIn(TripCount);
VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount(
TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW));
VPValue *Count = Builder.createSub(TC, Plan.getOrAddLiveIn(VectorTripCount),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index da3afe7ce6d03..75c39fe67b142 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2673,6 +2673,9 @@ void VPlanTransforms::truncateToMinimalBitwidths(
// cannot use RAUW after creating a new truncate, as this would could make
// other uses have different types for their operands, making them invalidly
// typed.
+  if (MinBWs.empty())
+    return;
DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
VPTypeAnalysis TypeInfo(Plan);
VPBasicBlock *PH = Plan.getVectorPreheader();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5fc68c2df145f..7e657965b235b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -165,7 +165,8 @@ struct VPlanTransforms {
static void addMinimumVectorEpilogueIterationCheck(
VPlan &Plan, Value *TripCount, Value *VectorTripCount,
bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
- unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE);
+ unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE,
+ bool EpilogueTailFolded);
/// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
/// flat CFG into a hierarchical CFG.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll
new file mode 100644
index 0000000000000..a3e1bb2f3c522
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll
@@ -0,0 +1,393 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^for.body:" --filter-out-after "^loop:"
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 \
+; RUN: -prefer-predicate-over-epilogue=predicated-epilogue -debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK-VF8
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED
+
+target triple = "aarch64-linux-gnu"
+
+define void @main_vf_vscale_x_16(ptr %A) #0 {
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_16(
+; CHECK-VF8-NEXT: iter.check:
+; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8: vector.main.loop.iter.check:
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP3]]
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP6]], align 1
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8: vec.epilog.iter.check:
+; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8: vec.epilog.ph:
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 1024)
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8: vec.epilog.vector.body:
+; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> splat (i8 1), ptr align 1 [[TMP8]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 8
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX_NEXT2]], i64 1024)
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = xor i1 [[TMP9]], true
+; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF8: vec.epilog.middle.block:
+; CHECK-VF8-NEXT: br label [[EXIT]]
+; CHECK-VF8: exit:
+; CHECK-VF8-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %A, i64 %iv
+ store i8 1, ptr %arrayidx, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, 1024
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_2_no_epi_iteration(
+; CHECK-VF8-NEXT: iter.check:
+; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8: vector.main.loop.iter.check:
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 [[TMP3]]
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8: vec.epilog.iter.check:
+; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8: vec.epilog.ph:
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 1024)
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8: vec.epilog.vector.body:
+; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> splat (i64 1), ptr align 1 [[TMP8]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 8
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX_NEXT2]], i64 1024)
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = xor i1 [[TMP9]], true
+; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8: vec.epilog.middle.block:
+; CHECK-VF8-NEXT: br label [[EXIT]]
+; CHECK-VF8: exit:
+; CHECK-VF8-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv
+ store i64 1, ptr %arrayidx, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, 1024
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+
+define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) {
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_2(
+; CHECK-VF8-NEXT: iter.check:
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8: vector.main.loop.iter.check:
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 [[TMP3]]
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8: vec.epilog.iter.check:
+; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8: vec.epilog.ph:
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = sub i64 [[N]], 8
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 8
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 [[N]])
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8: vec.epilog.vector.body:
+; CHECK-VF8-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX2]]
+; CHECK-VF8-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> splat (i64 1), ptr align 1 [[TMP11]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX2]], 8
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX2]], i64 [[TMP10]])
+; CHECK-VF8-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT: [[TMP13:%.*]] = xor i1 [[TMP12]], true
+; CHECK-VF8-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF8: vec.epilog.middle.block:
+; CHECK-VF8-NEXT: br label [[EXIT]]
+; CHECK-VF8: exit:
+; CHECK-VF8-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv
+ store i64 1, ptr %arrayidx, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, %n
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+define void @test_pr57912_pointer_induction(ptr %start) #0 {
+;
+; CHECK-VF8-LABEL: @test_pr57912_pointer_induction(
+; CHECK-VF8-NEXT: iter.check:
+; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8: vector.main.loop.iter.check:
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10000, [[TMP1]]
+; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP4]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]]
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP3]]
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8: vec.epilog.iter.check:
+; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8: vec.epilog.ph:
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 10000)
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8: vec.epilog.vector.body:
+; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> zeroinitializer, ptr align 1 [[NEXT_GEP2]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], 8
+; CHECK-VF8-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX_NEXT3]], i64 10000)
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = xor i1 [[TMP7]], true
+; CHECK-VF8-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF8: vec.epilog.middle.block:
+; CHECK-VF8-NEXT: br label [[EXIT]]
+; CHECK-VF8: exit:
+; CHECK-VF8-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ]
+ store i8 0, ptr %ptr.iv, align 1
+ %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cmp = icmp eq i64 %iv.next, 10000
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 {
+;
+; CHECK-VF8-LABEL: @trip_count_vscale(
+; CHECK-VF8-NEXT: entry:
+; CHECK-VF8-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1033
+; CHECK-VF8-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP2]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[TMP1]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP1]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP7]], ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP8]], ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-VF8: scalar.ph:
+; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-VF8: for.body:
+;
+entry:
+ %v = tail call i64 @llvm.vscale.i64()
+ %n = mul nuw nsw i64 %v, 1033
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv
+ %l.a = load float, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv
+ %l.b = load float, ptr %arrayidx3, align 4
+ %mul4 = fmul float %l.a, %l.b
+ store float %mul4, ptr %arrayidx3, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 {
+;
+; CHECK-VF8-LABEL: @trip_count_vscale_no_epilogue_iterations(
+; CHECK-VF8-NEXT: entry:
+; CHECK-VF8-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1024
+; CHECK-VF8-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP2]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[TMP1]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP1]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP7]], ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP8]], ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-VF8: scalar.ph:
+; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-VF8: for.body:
+;
+entry:
+ %v = tail call i64 @llvm.vscale.i64()
+ %n = mul nuw nsw i64 %v, 1024
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv
+ %l.a = load float, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv
+ %l.b = load float, ptr %arrayidx3, align 4
+ %mul4 = fmul float %l.a, %l.b
+ store float %mul4, ptr %arrayidx3, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
More information about the llvm-commits
mailing list