[llvm] [NFC][LoopVectorize] Introduce new getEstimatedRuntimeVF function (PR #116247)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 14 07:53:16 PST 2024
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/116247
There are lots of places in the vectoriser where we estimate the
runtime vectorisation factor based on the getVScaleForTuning TTI hook.
I've added a new getEstimatedRuntimeVF helper function and taught
several of those places to use it instead of open-coding the estimate.
From 15fdaaae88284d57ce5b7695b5b7263bef967770 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 14 Nov 2024 15:51:23 +0000
Subject: [PATCH] [NFC][LoopVectorize] Introduce new getEstimatedRuntimeVF function
There are lots of places where we try to estimate the runtime
vectorisation factor based on the getVScaleForTuning TTI hook.
I've added a new getEstimatedRuntimeVF function and taught
several places in the vectoriser to use this new function.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 56 +++++++------------
1 file changed, 21 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1ebc62f9843905..32adf9032e9c25 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4300,6 +4300,16 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
return TTI.getVScaleForTuning();
}
+static unsigned getEstimatedRuntimeVF(const Loop *L, // Estimated runtime element count for VF, as a plain unsigned.
+ const TargetTransformInfo &TTI,
+ ElementCount VF) {
+ unsigned EstimatedVF = VF.getKnownMinValue(); // Start from VF's known minimum value.
+ if (VF.isScalable()) // The scaling below is applied to scalable VFs only.
+ if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI)) // An empty optional leaves the minimum unscaled.
+ EstimatedVF *= *VScale; // Multiply the minimum by the tuning vscale.
+ return EstimatedVF;
+}
+
bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
InstructionCost CostA = A.Cost;
@@ -4596,17 +4606,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
- unsigned AssumedMinimumVscale =
- getVScaleForTuning(OrigLoop, TTI).value_or(1);
- unsigned Width =
- Candidate.Width.isScalable()
- ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
- : Candidate.Width.getFixedValue();
+ unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
- << AssumedMinimumVscale << ")");
+ << getVScaleForTuning(OrigLoop, TTI).value_or(1)
+ << ")");
LLVM_DEBUG(dbgs() << ".\n");
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4687,12 +4693,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
- unsigned Multiplier = 1;
- if (VF.isScalable())
- Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
- if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
- return true;
- return false;
+ return getEstimatedRuntimeVF(TheLoop, TTI, VF) >= EpilogueVectorizationMinVF;
}
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4744,12 +4745,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
- ElementCount EstimatedRuntimeVF = MainLoopVF;
- if (MainLoopVF.isScalable()) {
- EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
- if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
- EstimatedRuntimeVF *= *VScale;
- }
+ ElementCount EstimatedRuntimeVF =
+ ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
@@ -4976,11 +4973,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- unsigned EstimatedVF = VF.getKnownMinValue();
- if (VF.isScalable()) {
- if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
- EstimatedVF *= *VScale;
- }
+ unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
@@ -9776,8 +9769,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
}
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
- VectorizationFactor &VF,
- std::optional<unsigned> VScale, Loop *L,
+ VectorizationFactor &VF, Loop *L,
+ const TargetTransformInfo &TTI,
PredicatedScalarEvolution &PSE,
ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
@@ -9829,13 +9822,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
- unsigned IntVF = VF.Width.getKnownMinValue();
- if (VF.Width.isScalable()) {
- unsigned AssumedMinimumVscale = 1;
- if (VScale)
- AssumedMinimumVscale = *VScale;
- IntVF *= AssumedMinimumVscale;
- }
+ unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
uint64_t RtC = *CheckCost.getValue();
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10084,8 +10071,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
- !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
- PSE, SEL)) {
+ !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
More information about the llvm-commits
mailing list