[llvm] 81fdc73 - [LV] Return both fixed and scalable Max VF from computeMaxVF.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Tue May 18 00:04:42 PDT 2021
Author: Sander de Smalen
Date: 2021-05-18T08:03:48+01:00
New Revision: 81fdc73e5dc74bfe7cda1e16f01f23836406d709
URL: https://github.com/llvm/llvm-project/commit/81fdc73e5dc74bfe7cda1e16f01f23836406d709
DIFF: https://github.com/llvm/llvm-project/commit/81fdc73e5dc74bfe7cda1e16f01f23836406d709.diff
LOG: [LV] Return both fixed and scalable Max VF from computeMaxVF.
This patch introduces a new class, MaxVFCandidates, that holds the
maximum vectorization factors that have been computed for both scalable
and fixed-width vectors.
This patch is intended to be NFC for fixed-width vectors, although
considering a scalable max VF (which is disabled by default) pessimises
tail-loop elimination, since it can no longer determine if any chosen VF
(less than fixed/scalable MaxVFs) is guaranteed to handle all vector
iterations if the trip-count is known. This issue will be addressed in
a future patch.
Reviewed By: fhahn, david-arm
Differential Revision: https://reviews.llvm.org/D98721
Added:
llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index d76b557dda13a..6e8953f7790e4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -200,6 +200,37 @@ struct VectorizationFactor {
}
};
+/// A class that represents two vectorization factors (initialized with 0 by
+/// default). One for fixed-width vectorization and one for scalable
+/// vectorization. This can be used by the vectorizer to choose from a range of
+/// fixed and/or scalable VFs in order to find the most cost-effective VF to
+/// vectorize with.
+struct FixedScalableVFPair {
+ ElementCount FixedVF;
+ ElementCount ScalableVF;
+
+ FixedScalableVFPair()
+ : FixedVF(ElementCount::getFixed(0)),
+ ScalableVF(ElementCount::getScalable(0)) {}
+ FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
+ *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
+ }
+ FixedScalableVFPair(const ElementCount &FixedVF,
+ const ElementCount &ScalableVF)
+ : FixedVF(FixedVF), ScalableVF(ScalableVF) {
+ assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
+ "Invalid scalable properties");
+ }
+
+ static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
+
+ /// \return true if either fixed- or scalable VF is non-zero.
+ explicit operator bool() const { return FixedVF || ScalableVF; }
+
+ /// \return true if either fixed- or scalable VF is a valid vector VF.
+ bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
+};
+
/// Planner drives the vectorization process after having passed
/// Legality checks.
class LoopVectorizationPlanner {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 90343f3a57fa0..cfa70aed15c91 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1229,9 +1229,10 @@ class LoopVectorizationCostModel {
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI) {}
- /// \return An upper bound for the vectorization factor, or None if
- /// vectorization and interleaving should be avoided up front.
- Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
+ /// \return An upper bound for the vectorization factors (both fixed and
+ /// scalable). If the factors are 0, vectorization and interleaving should be
+ /// avoided up front.
+ FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
@@ -1625,11 +1626,13 @@ class LoopVectorizationCostModel {
private:
unsigned NumPredStores = 0;
- /// \return An upper bound for the vectorization factor, a power-of-2 larger
- /// than zero. One is returned if vectorization should best be avoided due
- /// to cost.
- ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF);
+ /// \return An upper bound for the vectorization factors for both
+ /// fixed and scalable vectorization, where the minimum-known number of
+ /// elements is a power-of-2 larger than zero. If scalable vectorization is
+ /// disabled or unsupported, then the scalable part will be equal to
+ /// ElementCount::getScalable(0).
+ FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF);
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
@@ -5676,7 +5679,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
}
-ElementCount
+FixedScalableVFPair
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
@@ -5742,22 +5745,24 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
<< " / " << WidestType << " bits.\n");
- ElementCount MaxFixedVF = ElementCount::getFixed(1);
+ FixedScalableVFPair Result(ElementCount::getFixed(1),
+ ElementCount::getScalable(0));
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
WidestType, MaxSafeFixedVF))
- MaxFixedVF = MaxVF;
+ Result.FixedVF = MaxVF;
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
WidestType, MaxSafeScalableVF))
- // FIXME: Return scalable VF as well (to be added in future patch).
- if (MaxVF.isScalable())
+ if (MaxVF.isScalable()) {
+ Result.ScalableVF = MaxVF;
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
<< "\n");
+ }
- return MaxFixedVF;
+ return Result;
}
-Optional<ElementCount>
+FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may by useful to do since it's still likely to be dynamically
@@ -5766,7 +5771,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Not inserting runtime ptr check for divergent target",
"runtime pointer checks needed. Not enabled for divergent target",
"CantVersionLoopWithDivergentTarget", ORE, TheLoop);
- return None;
+ return FixedScalableVFPair::getNone();
}
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
@@ -5775,7 +5780,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
reportVectorizationFailure("Single iteration (non) loop",
"loop trip count is one, irrelevant for vectorization",
"SingleIterationLoop", ORE, TheLoop);
- return None;
+ return FixedScalableVFPair::getNone();
}
switch (ScalarEpilogueStatus) {
@@ -5802,7 +5807,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// Bail if runtime checks are required, which are not good when optimising
// for size.
if (runtimeChecksRequired())
- return None;
+ return FixedScalableVFPair::getNone();
break;
}
@@ -5820,7 +5825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
return computeFeasibleMaxVF(TC, UserVF);
}
- return None;
+ return FixedScalableVFPair::getNone();
}
// Now try the tail folding
@@ -5835,26 +5840,29 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
- assert(!MaxVF.isScalable() &&
- "Scalable vectors do not yet support tail folding");
- assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
- "MaxVF must be a power of 2");
- unsigned MaxVFtimesIC =
- UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
- // Avoid tail folding if the trip count is known to be a multiple of any VF we
- // chose.
- ScalarEvolution *SE = PSE.getSE();
- const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
- const SCEV *ExitCount = SE->getAddExpr(
- BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
- const SCEV *Rem = SE->getURemExpr(
- SE->applyLoopGuards(ExitCount, TheLoop),
- SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
- if (Rem->isZero()) {
- // Accept MaxVF if we do not have a tail.
- LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
- return MaxVF;
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
+ // Avoid tail folding if the trip count is known to be a multiple of any VF
+ // we chose.
+ // FIXME: The condition below pessimises the case for fixed-width vectors,
+ // when scalable VFs are also candidates for vectorization.
+ if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
+ ElementCount MaxFixedVF = MaxFactors.FixedVF;
+ assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
+ "MaxFixedVF must be a power of 2");
+ unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
+ : MaxFixedVF.getFixedValue();
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+ const SCEV *Rem = SE->getURemExpr(
+ SE->applyLoopGuards(ExitCount, TheLoop),
+ SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+ if (Rem->isZero()) {
+ // Accept MaxFixedVF if we do not have a tail.
+ LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ return MaxFactors;
+ }
}
// If we don't know the precise trip count, or if the trip count that we
@@ -5863,7 +5871,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
if (Legal->prepareToFoldTailByMasking()) {
FoldTailByMasking = true;
- return MaxVF;
+ return MaxFactors;
}
// If there was a tail-folding hint/switch, but we can't fold the tail by
@@ -5872,12 +5880,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return MaxVF;
+ return MaxFactors;
}
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
- return None;
+ return FixedScalableVFPair::getNone();
}
if (TC == 0) {
@@ -5885,7 +5893,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Unable to calculate the loop count due to complex control flow",
"unable to calculate the loop count due to complex control flow",
"UnknownLoopCountComplexCFG", ORE, TheLoop);
- return None;
+ return FixedScalableVFPair::getNone();
}
reportVectorizationFailure(
@@ -5894,7 +5902,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Enable vectorization of this loop with '#pragma clang loop "
"vectorize(enable)' when compiling with -Os/-Oz",
"NoTailLoopWithOptForSize", ORE, TheLoop);
- return None;
+ return FixedScalableVFPair::getNone();
}
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
@@ -7928,8 +7936,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
- Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
- if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
+ FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
+ if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return None;
// Invalidate interleave groups if all blocks of loop will be predicated.
@@ -7946,29 +7954,24 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}
- ElementCount MaxVF = MaybeMaxVF.getValue();
- assert(MaxVF.isNonZero() && "MaxVF is zero.");
-
- bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
- if (!UserVF.isZero() &&
- (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
- // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
- // VFs here, this should be reverted to only use legal UserVFs once the
- // loop below supports scalable VFs.
- ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+ ElementCount MaxUserVF =
+ UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
+ bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
+ if (!UserVF.isZero() && UserVFIsLegal) {
LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
- << " VF " << VF << ".\n");
- assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+ << " VF " << UserVF << ".\n");
+ assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- CM.selectUserVectorizationFactor(VF);
+ CM.selectUserVectorizationFactor(UserVF);
CM.collectInLoopReductions();
- buildVPlansWithVPRecipes(VF, VF);
+ buildVPlansWithVPRecipes({UserVF}, {UserVF});
LLVM_DEBUG(printPlans(dbgs()));
- return {{VF, 0}};
+ return {{UserVF, 0}};
}
+ ElementCount MaxVF = MaxFactors.FixedVF;
assert(!MaxVF.isScalable() &&
"Scalable vectors not yet supported beyond this point");
@@ -7987,7 +7990,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
- if (MaxVF.isScalar())
+ if (!MaxFactors.hasVector())
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
new file mode 100644
index 0000000000000..3eb0eef96f9f9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -0,0 +1,33 @@
+; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
+
+; This test currently fails when the LV calculates a maximums safe
+; distance for scalable vectors, because the code to eliminate the tail is
+; pessimistic when scalable vectors are considered. This will be addressed
+; in a future patch, at which point we should be able to un-XFAIL the
+; test. The expected output is to vectorize this loop without predication
+; (and thus have unpredicated vector store).
+; XFAIL: *
+
+; CHECK: store <4 x i32>
+
+target triple = "aarch64"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+
+define void @f1(i32* %A) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv
+ store i32 1, i32* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, 1024
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
More information about the llvm-commits
mailing list