[llvm] d41cb6b - [LV] Build and cost VPlans for scalable VFs.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 2 06:51:58 PDT 2021
Author: Sander de Smalen
Date: 2021-06-02T14:47:47+01:00
New Revision: d41cb6bb2607fa5c7a9df2b3dab361353657d225
URL: https://github.com/llvm/llvm-project/commit/d41cb6bb2607fa5c7a9df2b3dab361353657d225
DIFF: https://github.com/llvm/llvm-project/commit/d41cb6bb2607fa5c7a9df2b3dab361353657d225.diff
LOG: [LV] Build and cost VPlans for scalable VFs.
This patch uses the calculated maximum scalable VFs to build VPlans,
cost them and select a suitable scalable VF.
Reviewed By: paulwalker-arm
Differential Revision: https://reviews.llvm.org/D98722
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index deb694d13ac1e..9da5ce5ca157d 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -144,6 +144,13 @@ class LoopVectorizeHints {
return (ForceKind)Force.Value;
}
+ /// \return true if the cost-model for scalable vectorization should
+ /// favor vectorization with scalable vectors over fixed-width vectors when
+ /// the cost-model is inconclusive.
+ bool isScalableVectorizationPreferred() const {
+ return Scalable.Value == SK_PreferScalable;
+ }
+
/// \return true if scalable vectorization has been explicitly enabled.
bool isScalableVectorizationExplicitlyEnabled() const {
return Scalable.Value == SK_PreferFixedWidth ||
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 21d3385e3862a..0378c2737bcb0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -70,6 +70,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -1204,6 +1205,16 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedUsePredicate
};
+/// ElementCountComparator creates a total ordering for ElementCount
+/// for the purposes of using it in a set structure.
+struct ElementCountComparator {
+ bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
+ return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
+ std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
+ }
+};
+using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
+
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
@@ -1236,10 +1247,12 @@ class LoopVectorizationCostModel {
bool runtimeChecksRequired();
/// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
+ /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
+ VectorizationFactor
+ selectVectorizationFactor(const ElementCountSet &CandidateVFs);
+
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF,
const LoopVectorizationPlanner &LVP);
@@ -6013,6 +6026,14 @@ bool LoopVectorizationCostModel::isMoreProfitable(
return RTCostA < RTCostB;
}
+ // When set to preferred, for now assume vscale may be larger than 1, so
+ // that scalable vectorization is slightly favorable over fixed-width
+ // vectorization.
+ if (Hints->isScalableVectorizationPreferred())
+ if (A.Width.isScalable() && !B.Width.isScalable())
+ return (CostA * B.Width.getKnownMinValue()) <=
+ (CostB * A.Width.getKnownMinValue());
+
// To avoid the need for FP division:
// (CostA / A.Width) < (CostB / B.Width)
// <=> (CostA * B.Width) < (CostB * A.Width)
@@ -6020,30 +6041,30 @@ bool LoopVectorizationCostModel::isMoreProfitable(
(CostB * A.Width.getKnownMinValue());
}
-VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
- // FIXME: This can be fixed for scalable vectors later, because at this stage
- // the LoopVectorizer will only consider vectorizing a loop with scalable
- // vectors when the loop has a hint to enable vectorization for a given VF.
- assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
-
+VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
+ const ElementCountSet &VFCandidates) {
InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
+ assert(VFCandidates.count(ElementCount::getFixed(1)) &&
+ "Expected Scalar VF to be a candidate");
const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
VectorizationFactor ChosenFactor = ScalarCost;
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
- if (ForceVectorization && MaxVF.isVector()) {
+ if (ForceVectorization && VFCandidates.size() > 1) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// evaluation.
ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
}
- for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
- i *= 2) {
+ for (const auto &i : VFCandidates) {
+ // The cost for scalar VF=1 is already calculated, so ignore it.
+ if (i.isScalar())
+ continue;
+
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
@@ -6053,7 +6074,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
VectorizationFactor Candidate(i, C.first);
LLVM_DEBUG(
dbgs() << "LV: Vector loop of width " << i << " costs: "
- << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue())
+ << (*Candidate.Cost.getValue() /
+ Candidate.Width.getKnownMinValue())
+ << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
<< ".\n");
if (!C.second && !ForceVectorization) {
@@ -7967,17 +7990,21 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
CM.collectInLoopReductions();
- buildVPlansWithVPRecipes({UserVF}, {UserVF});
+ buildVPlansWithVPRecipes(UserVF, UserVF);
LLVM_DEBUG(printPlans(dbgs()));
return {{UserVF, 0}};
}
- ElementCount MaxVF = MaxFactors.FixedVF;
- assert(!MaxVF.isScalable() &&
- "Scalable vectors not yet supported beyond this point");
+ // Populate the set of Vectorization Factor Candidates.
+ ElementCountSet VFCandidates;
+ for (auto VF = ElementCount::getFixed(1);
+ ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
+ VFCandidates.insert(VF);
+ for (auto VF = ElementCount::getScalable(1);
+ ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
+ VFCandidates.insert(VF);
- for (ElementCount VF = ElementCount::getFixed(1);
- ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
+ for (const auto VF : VFCandidates) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);
@@ -7988,14 +8015,15 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
}
CM.collectInLoopReductions();
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
+ buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
- buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
if (!MaxFactors.hasVector())
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- auto SelectedVF = CM.selectVectorizationFactor(MaxVF);
+ auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
// Check if it is profitable to vectorize with runtime checks.
unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 7d6469fe2e23b..7e53d025e3adb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -41,7 +41,7 @@ if.then: ; preds = %for.body
for.inc: ; preds = %for.body, %if.then
%inc = add nuw nsw i64 %i.07, 1
%exitcond.not = icmp eq i64 %inc, %n
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !5
}
@@ -90,3 +90,5 @@ attributes #0 = { "target-features"="+neon,+sve" }
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
index ed711a4f5f581..1d9d9d8545408 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -10,9 +10,13 @@
define void @test0(i32* %a, i8* %b, i32* %c) {
; CHECK: LV: Checking a loop in "test0"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 16
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: vscale x 16
entry:
br label %loop
@@ -39,9 +43,13 @@ exit:
define void @test1(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test1"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
entry:
br label %loop
@@ -69,9 +77,13 @@ exit:
define void @test2(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test2"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
entry:
br label %loop
@@ -99,9 +111,13 @@ exit:
define void @test3(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test3"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16
entry:
br label %loop
@@ -129,9 +145,13 @@ exit:
define void @test4(i32* %a, i32* %b) {
; CHECK: LV: Checking a loop in "test4"
; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 4
entry:
br label %loop
More information about the llvm-commits
mailing list