[llvm-branch-commits] [llvm] 9414ed7 - Revert "[VPlan] Unify inner and outer loop paths (NFCI). (#192868)"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri May 8 12:52:57 PDT 2026
Author: Florian Hahn
Date: 2026-05-08T21:52:53+02:00
New Revision: 9414ed711700de33ac728566c247186e4ff1790f
URL: https://github.com/llvm/llvm-project/commit/9414ed711700de33ac728566c247186e4ff1790f
DIFF: https://github.com/llvm/llvm-project/commit/9414ed711700de33ac728566c247186e4ff1790f.diff
LOG: Revert "[VPlan] Unify inner and outer loop paths (NFCI). (#192868)"
This reverts commit b84f58ee844ca929db2fff2e41e2195e255548b8.
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.cpp
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index 0d45c159d315c..18906aa7eeae3 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -169,12 +169,9 @@ struct LoopVectorizePass : public OptionalPassInfoMixin<LoopVectorizePass> {
/// purposes along with the corresponding optimization remark \p RemarkName.
/// If \p I is passed, it is an instruction that prevents vectorization.
/// Otherwise, the loop \p TheLoop is used for the location of the remark.
-LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg,
- const StringRef OREMsg,
- const StringRef ORETag,
- OptimizationRemarkEmitter *ORE,
- const Loop *TheLoop,
- Instruction *I = nullptr);
+LLVM_ABI void reportVectorizationFailure(
+ const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr);
/// Same as above, but the debug message and optimization remark are identical
inline void reportVectorizationFailure(const StringRef DebugMsg,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
index f29834d2f804e..91476cf232fe0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
@@ -27,8 +27,6 @@ using namespace llvm;
#define DEBUG_TYPE "loop-vectorize"
-extern cl::opt<bool> VPlanBuildOuterloopStressTest;
-
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -620,49 +618,3 @@ void VFSelectionContext::collectInLoopReductions() {
<< " reduction for phi: " << *Phi << "\n");
}
}
-
-// TODO: we could return a pair of values that specify the max VF and
-// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
-// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
-// doesn't have a cost model that can choose which plan to execute if
-// more than one is generated.
-FixedScalableVFPair
-VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) {
- if (UserVF.isScalable() && !supportsScalableVectors()) {
- reportVectorizationFailure(
- "Scalable vectorization requested but not supported by the target",
- "the scalable user-specified vectorization width for outer-loop "
- "vectorization cannot be used because the target does not support "
- "scalable vectors.",
- "ScalableVFUnfeasible", ORE, TheLoop);
- return FixedScalableVFPair::getNone();
- }
-
- ElementCount VF = UserVF;
- if (VF.isZero()) {
- auto [_, WidestType] = getSmallestAndWidestTypes();
-
- auto RegKind = TTI.enableScalableVectorization()
- ? TargetTransformInfo::RGK_ScalableVector
- : TargetTransformInfo::RGK_FixedWidthVector;
-
- TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
- unsigned N = RegSize.getKnownMinValue() / WidestType;
- VF = ElementCount::get(N, RegSize.isScalable());
- LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
-
- // Make sure we have a VF > 1 for stress testing.
- if (VPlanBuildOuterloopStressTest && VF.isScalar()) {
- LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
- << "overriding computed VF.\n");
- VF = ElementCount::getFixed(4);
- }
- }
- assert(isPowerOf2_32(VF.getKnownMinValue()) &&
- "VF needs to be a power of two");
- if (VF.isScalar())
- return FixedScalableVFPair::getNone();
- LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
- << "VF " << VF << " to build VPlans.\n");
- return FixedScalableVFPair(VF);
-}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 00b689326d770..a6789974e0bd6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -703,10 +703,6 @@ class VFSelectionContext {
/// for size, returning true here aborts vectorization.
bool runtimeChecksRequired();
- /// Returns a scalable VF to use for outer-loop vectorization if the target
- /// supports it and a fixed VF otherwise.
- FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF);
-
/// Compute smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
@@ -793,6 +789,10 @@ class LoopVectorizationPlanner {
/// interleaving should be avoided up-front, no plans are generated.
void plan(ElementCount UserVF, unsigned UserIC);
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
+ VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
+
/// Return the VPlan for \p VF. At the moment, there is always a single VPlan
/// for each VF.
VPlan &getPlanFor(ElementCount VF) const;
@@ -881,22 +881,34 @@ class LoopVectorizationPlanner {
unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF,
bool DisableRuntimeUnroll);
-private:
- /// Build a VPlan using VPRecipes according to the information gathered by
- /// Legal and VPlan-based analysis. For outer loops, performs basic recipe
- /// conversion only. For inner loops, \p Range's largest included VF is
- /// restricted to the maximum VF the returned VPlan is valid for. If no VPlan
- /// can be built for the input range, set the largest included VF to the
- /// maximum VF for which no plan could be built. Each VPlan is built starting
- /// from a copy of \p InitialPlan, which is a plain CFG VPlan wrapping the
- /// original scalar loop.
- VPlanPtr tryToBuildVPlan(VPlanPtr InitialPlan, VFRange &Range);
-
+protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF);
+private:
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End. If no VPlan can be built for
+ /// the input range, set the largest included VF to the maximum VF for which
+ /// no plan could be built.
+ VPlanPtr tryToBuildVPlan(VFRange &Range);
+
+ /// Build a VPlan using VPRecipes according to the information gather by
+ /// Legal. This method is only used for the legacy inner loop vectorizer.
+ /// \p Range's largest included VF is restricted to the maximum VF the
+ /// returned VPlan is valid for. If no VPlan can be built for the input range,
+ /// set the largest included VF to the maximum VF for which no plan could be
+ /// built. Each VPlan is built starting from a copy of \p InitialPlan, which
+ /// is a plain CFG VPlan wrapping the original scalar loop.
+ VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+ void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+
/// Add ComputeReductionResult recipes to the middle block to compute the
/// final reduction results. Add Select recipes to the latch block when
/// folding tail, to feed ComputeReductionResult with the last or penultimate
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1ace2275e2b6d..ae1d6d83cccd4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -350,8 +350,8 @@ cl::opt<bool> llvm::VPlanPrintVectorRegionScope(
// VPlan-native vectorization path. It must be used in conjuction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
-cl::opt<bool> VPlanBuildOuterloopStressTest(
- "vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden,
+static cl::opt<bool> VPlanBuildStressTest(
+ "vplan-build-stress-test", cl::init(false), cl::Hidden,
cl::desc(
"Build VPlan for every supported loop nest in the function and bail "
"out right after the build (stress test the VPlan H-CFG construction "
@@ -745,8 +745,8 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE,
- const Loop *TheLoop, Instruction *I) {
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ Instruction *I) {
LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE);
ORE->emit(
@@ -1877,7 +1877,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
// now, only collect outer loops that have explicit vectorization hints. If we
// are stress testing the VPlan H-CFG construction, we collect the outermost
// loop of every loop nest.
- if (L.isInnermost() || VPlanBuildOuterloopStressTest ||
+ if (L.isInnermost() || VPlanBuildStressTest ||
(EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
LoopBlocksRPO RPOT(&L);
RPOT.perform(LI);
@@ -2868,12 +2868,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
- // For outer loops, use simple type-based heuristic VF. No cost model or
- // memory dependence analysis is available.
- if (!TheLoop->isInnermost()) {
- return Config.computeVPlanOuterloopVF(UserVF);
- }
-
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may be useful to do since it's still likely to be dynamically
// uniform if the target can skip.
@@ -5672,7 +5666,83 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
}
+// This function will select a scalable VF if the target supports scalable
+// vectors and a fixed one otherwise.
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
+static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
+ VFSelectionContext &Config) {
+ unsigned WidestType = Config.getSmallestAndWidestTypes().second;
+
+ TargetTransformInfo::RegisterKind RegKind =
+ TTI.enableScalableVectorization()
+ ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector;
+
+ TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
+ unsigned N = RegSize.getKnownMinValue() / WidestType;
+ return ElementCount::get(N, RegSize.isScalable());
+}
+
+VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
+ ElementCount VF = UserVF;
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ if (!OrigLoop->isInnermost()) {
+ // If the user doesn't provide a vectorization factor, determine a
+ // reasonable one.
+ if (UserVF.isZero()) {
+ VF = determineVPlanVF(TTI, Config);
+ LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+ // Make sure we have a VF > 1 for stress testing.
+ if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
+ LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
+ << "overriding computed VF.\n");
+ VF = ElementCount::getFixed(4);
+ }
+ } else if (UserVF.isScalable() && !Config.supportsScalableVectors()) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
+ << "not supported by the target.\n");
+ reportVectorizationFailure(
+ "Scalable vectorization requested but not supported by the target",
+ "the scalable user-specified vectorization width for outer-loop "
+ "vectorization cannot be used because the target does not support "
+ "scalable vectors.",
+ "ScalableVFUnfeasible", ORE, OrigLoop);
+ return VectorizationFactor::Disabled();
+ }
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+ "VF needs to be a power of two");
+ LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
+ << "VF " << VF << " to build VPlans.\n");
+ buildVPlans(VF, VF);
+
+ if (VPlans.empty())
+ return VectorizationFactor::Disabled();
+
+ // For VPlan build stress testing, we bail out after VPlan construction.
+ if (VPlanBuildStressTest)
+ return VectorizationFactor::Disabled();
+
+ return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return VectorizationFactor::Disabled();
+}
+
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+ assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
Config.collectElementTypesForWidening(&CM.ValuesToIgnore);
@@ -5680,16 +5750,6 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return;
- if (!OrigLoop->isInnermost()) {
- // For outer loops, computeMaxVF returns a single non-scalar VF; build a
- // plan for only that VF.
- ElementCount VF =
- MaxFactors.FixedVF ? MaxFactors.FixedVF : MaxFactors.ScalableVF;
- buildVPlans(VF, VF);
- LLVM_DEBUG(printPlans(dbgs()));
- return;
- }
-
// Compute the minimal bitwidths required for integer operations in the loop
// for later use by the cost model.
Config.computeMinimalBitwidths();
@@ -5730,9 +5790,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
if (EpilogueUserVF.isVector() &&
ElementCount::isKnownLT(EpilogueUserVF, UserVF)) {
CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF);
- buildVPlans(EpilogueUserVF, EpilogueUserVF);
+ buildVPlansWithVPRecipes(EpilogueUserVF, EpilogueUserVF);
}
- buildVPlans(UserVF, UserVF);
+ buildVPlansWithVPRecipes(UserVF, UserVF);
if (!VPlans.empty() && VPlans.back()->getSingleVF() == UserVF) {
// For scalar VF, skip VPlan cost check as VPlan cost is designed for
// vector VFs only.
@@ -5764,8 +5824,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectNonVectorizedAndSetWideningDecisions(VF);
}
- buildVPlans(ElementCount::getFixed(1), MaxFactors.FixedVF);
- buildVPlans(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+ buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
+ buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
LLVM_DEBUG(printPlans(dbgs()));
}
@@ -5983,25 +6043,22 @@ LoopVectorizationPlanner::computeBestVF() {
return {VectorizationFactor::Disabled(), nullptr};
// If there is a single VPlan with a single VF, return it directly.
VPlan &FirstPlan = *VPlans[0];
-
ElementCount UserVF = Hints.getWidth();
- if (VPlans.size() == 1) {
- // For outer loops, the plan has a single vector VF determined by the
- // heuristic.
- assert((FirstPlan.hasScalarVFOnly() || hasPlanWithVF(UserVF) ||
- FirstPlan.isOuterLoop()) &&
- "must have a single scalar VF, UserVF or an outer loop");
- return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
- }
-
- if (hasPlanWithVF(UserVF) && EpilogueVectorizationForceVF > 1) {
- assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
- assert(VPlans[0]->getSingleVF() ==
- ElementCount::getFixed(EpilogueVectorizationForceVF) &&
- "expected first plan to be for the forced epilogue VF");
- assert(VPlans[1]->getSingleVF() == UserVF &&
- "expected second plan to be for the forced UserVF");
- return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
+ if (hasPlanWithVF(UserVF)) {
+ if (VPlans.size() == 1) {
+ assert(FirstPlan.getSingleVF() == UserVF &&
+ "UserVF must match single VF");
+ return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
+ }
+ if (EpilogueVectorizationForceVF > 1) {
+ assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
+ assert(VPlans[0]->getSingleVF() ==
+ ElementCount::getFixed(EpilogueVectorizationForceVF) &&
+ "expected first plan to be for the forced epilogue VF");
+ assert(VPlans[1]->getSingleVF() == UserVF &&
+ "expected second plan to be for the forced UserVF");
+ return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
+ }
}
LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
@@ -6747,38 +6804,30 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
// optimizations.
static void printOptimizedVPlan(VPlan &) {}
-void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
- ElementCount MaxVF) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
+ ElementCount MaxVF) {
if (ElementCount::isKnownGT(MinVF, MaxVF))
return;
- bool IsInnerLoop = OrigLoop->isInnermost();
-
- // Set up loop versioning for inner loops with memory runtime checks.
- // Outer loops don't have LoopAccessInfo since canVectorizeMemory() is not
- // called for them.
- std::optional<LoopVersioning> LVer;
- if (IsInnerLoop) {
- const LoopAccessInfo *LAI = Legal->getLAI();
- LVer.emplace(*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop,
- LI, DT, PSE.getSE());
- if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
- !LAI->getRuntimePointerChecking()->getDiffChecks()) {
- // Only use noalias metadata when using memory checks guaranteeing no
- // overlap across all iterations.
- LVer->prepareNoAliasMetadata();
- }
+ assert(OrigLoop->isInnermost() && "Inner loop expected.");
+
+ const LoopAccessInfo *LAI = Legal->getLAI();
+ LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
+ OrigLoop, LI, DT, PSE.getSE());
+ if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
+ !LAI->getRuntimePointerChecking()->getDiffChecks()) {
+ // Only use noalias metadata when using memory checks guaranteeing no
+ // overlap across all iterations.
+ LVer.prepareNoAliasMetadata();
}
// Create initial base VPlan0, to serve as common starting point for all
// candidates built later for specific VF ranges.
auto VPlan0 = VPlanTransforms::buildVPlan0(
OrigLoop, *LI, Legal->getWidestInductionType(),
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE,
- LVer ? &*LVer : nullptr);
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
- // Create recipes for header phis. For outer loops, reductions, recurrences
- // and in-loop reductions are empty since legality doesn't detect them.
+ // Create recipes for header phis.
if (!RUN_VPLAN_PASS(VPlanTransforms::createHeaderPhiRecipes, *VPlan0, PSE,
*OrigLoop, Legal->getInductionVars(),
Legal->getReductionVars(),
@@ -6813,8 +6862,8 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
- auto Plan =
- tryToBuildVPlan(std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange);
+ auto Plan = tryToBuildVPlanWithVPRecipes(
+ std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange);
VF = SubRange.End;
if (!Plan)
@@ -6842,21 +6891,9 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
}
}
-VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan,
- VFRange &Range) {
-
- // For outer loops, the plan only needs basic recipe conversion and induction
- // live-out optimization; the full inner-loop recipe building below does not
- // apply (no widening decisions, interleave groups, reductions, etc.).
- if (Plan->isOuterLoop()) {
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI))
- return nullptr;
- VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE,
- /*FoldTail=*/false);
- return Plan;
- }
+VPlanPtr
+LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VPlanPtr Plan,
+ VFRange &Range) {
using namespace llvm::VPlanPatternMatch;
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -7079,6 +7116,47 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan,
return Plan;
}
+VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
+ assert(!OrigLoop->isInnermost());
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ auto Plan = VPlanTransforms::buildVPlan0(
+ OrigLoop, *LI, Legal->getWidestInductionType(),
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
+
+ if (!VPlanTransforms::createHeaderPhiRecipes(
+ *Plan, PSE, *OrigLoop, Legal->getInductionVars(),
+ MapVector<PHINode *, RecurrenceDescriptor>(),
+ SmallPtrSet<const PHINode *, 1>(), SmallPtrSet<PHINode *, 1>(),
+ /*AllowReordering=*/false))
+ return nullptr;
+ [[maybe_unused]] bool CanHandleExits = VPlanTransforms::handleEarlyExits(
+ *Plan, UncountableExitStyle::NoUncountableExit, OrigLoop, PSE, *DT,
+ Legal->getAssumptionCache());
+ assert(CanHandleExits &&
+ "early-exits are not supported in VPlan-native path");
+ VPlanTransforms::addMiddleCheck(*Plan, /*TailFolded*/ false);
+
+ VPlanTransforms::createLoopRegions(*Plan);
+
+ for (ElementCount VF : Range)
+ Plan->addVF(VF);
+
+ if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI))
+ return nullptr;
+
+ // Optimize induction live-out users to use precomputed end values.
+ VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE,
+ /*FoldTail=*/false);
+
+ assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
+ return Plan;
+}
+
void LoopVectorizationPlanner::addReductionResultComputation(
VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
using namespace VPlanPatternMatch;
@@ -7279,7 +7357,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
// VPlan-native path does not do any analysis for runtime checks
// currently.
- assert((!EnableVPlanNativePath || !Plan.isOuterLoop()) &&
+ assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
"Runtime checks are not supported for outer loops yet");
if (Config.OptForSize) {
@@ -7360,6 +7438,75 @@ getEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
return CM_EpilogueAllowed;
}
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows to apply
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE,
+ std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
+ LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {
+
+ if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
+ LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
+ return false;
+ }
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+
+ EpilogueLowering SEL =
+ getEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI);
+
+ VFSelectionContext Config(*TTI, LVL, L, *F, PSE, DB, ORE, &Hints, OptForSize);
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, AC, ORE,
+ GetBFI, F, &Hints, IAI, Config);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, Config, IAI, PSE,
+ Hints, ORE);
+
+ // Get user vectorization factor.
+ ElementCount UserVF = Hints.getWidth();
+
+ Config.collectElementTypesForWidening();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
+
+ // If we are stress testing VPlan builds, do not attempt to generate vector
+ // code. Masked vector code generation support will follow soon.
+ // Also, do not attempt to vectorize if no vector code will be produced.
+ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
+ return false;
+
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
+
+ {
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
+ Checks, BestPlan);
+ LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName()
+ << "\"\n");
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
+ VF.MinProfitableTripCount);
+ bool HasBranchWeights =
+ hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
+ LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
+
+ reportVectorization(ORE, L, VF, 1);
+
+ LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT);
+ }
+
+ assert(!verifyFunction(*F, &dbgs()));
+ return true;
+}
+
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
@@ -8029,14 +8176,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- bool IsInnerLoop = L->isInnermost();
-
- // Outer loops require a computable trip count.
- if (!IsInnerLoop && isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
- LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
- return false;
- }
-
if (LVL.hasUncountableEarlyExit()) {
if (!EnableEarlyExitVectorization) {
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
@@ -8046,13 +8185,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
}
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
+ if (!L->isInnermost())
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, GetBFI, OptForSize, Hints,
+ Requirements);
+
+ assert(L->isInnermost() && "Inner loop expected.");
+
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
- bool UseInterleaved =
- IsInnerLoop && TTI->enableInterleavedAccessVectorization();
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
// If an override option has been passed in for interleaved accesses, use it.
if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = IsInnerLoop && EnableInterleavedMemAccesses;
+ UseInterleaved = EnableInterleavedMemAccesses;
// Analyze interleaved memory accesses.
if (UseInterleaved)
@@ -8155,11 +8305,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
- // Outer loops don't have LoopAccessInfo, so skip the safety check and reset
- // UserIC (interleaving is not supported for outer loops).
- if (!IsInnerLoop)
- UserIC = 0;
- else if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
+ if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
UserIC = 1;
// Plan how to best vectorize.
@@ -8167,16 +8313,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
auto [VF, BestPlanPtr] = LVP.computeBestVF();
unsigned IC = 1;
- // For VPlan build stress testing of outer loops, bail after plan
- // construction.
- if (!IsInnerLoop && VPlanBuildOuterloopStressTest)
- return false;
-
- if (IsInnerLoop && ORE->allowExtraAnalysis(LV_NAME))
+ if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind);
- if (IsInnerLoop && LVP.hasPlanWithVF(VF.Width)) {
+ if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(*BestPlanPtr, VF.Width, VF.Cost);
@@ -8419,9 +8560,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VF.MinProfitableTripCount);
LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
- if (!IsInnerLoop)
- LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName()
- << "\"\n");
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
++LoopsVectorized;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 3c8f3362ae93a..77cc6484e9c6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1078,14 +1078,6 @@ const VPRegionBlock *VPlan::getVectorLoopRegion() const {
return nullptr;
}
-bool VPlan::isOuterLoop() const {
- const VPRegionBlock *LoopRegion = getVectorLoopRegion();
- assert(LoopRegion && "expected a vector loop region");
- return any_of(VPBlockUtils::blocksOnly<const VPRegionBlock>(
- vp_depth_first_shallow(LoopRegion->getEntry())),
- [](const VPRegionBlock *R) { return !R->isReplicator(); });
-}
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPlan::printLiveIns(raw_ostream &O) const {
VPSlotTracker SlotTracker(this);
@@ -1675,6 +1667,27 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
return PredicateAtRangeStart;
}
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// tryToBuildVPlan().
+void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
+ ElementCount MaxVF) {
+ auto MaxVFTimes2 = MaxVF * 2;
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+ VFRange SubRange = {VF, MaxVFTimes2};
+ if (auto Plan = tryToBuildVPlan(SubRange)) {
+ VPlanTransforms::optimize(*Plan);
+ // Update the name of the latch of the top-level vector loop region after
+ // optimizations, which include block folding.
+ Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch");
+ VPlans.push_back(std::move(Plan));
+ }
+ VF = SubRange.End;
+ }
+}
+
VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
assert(count_if(VPlans,
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 51193964bdd83..6a1ea6b3439bf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4616,10 +4616,6 @@ class VPlan {
LLVM_ABI_FOR_TEST VPRegionBlock *getVectorLoopRegion();
LLVM_ABI_FOR_TEST const VPRegionBlock *getVectorLoopRegion() const;
- /// Returns true if this VPlan is for an outer loop, i.e., its vector
- /// loop region contains a nested loop region.
- LLVM_ABI_FOR_TEST bool isOuterLoop() const;
-
/// Returns the 'middle' block of the plan, that is the block that selects
/// whether to execute the scalar tail loop or the exit block from the loop
/// latch. If there is an early exit from the vector loop, the middle block
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 2717b80e2eeaa..9710767f905fe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -260,9 +260,6 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
}
void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
- // Nested loop regions (outer-loop vectorization) are not supported yet.
- if (Plan.isOuterLoop())
- return;
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
index 7f442f7d72e78..e03110fc3807a 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
; This test checks that, when stress testing VPlan, if the computed VF
; is 1, we override it to VF = 4.
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
index f60a620deecf9..f6b215f43d68e 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s
; Verify that the stress testing flag for the VPlan H-CFG builder works as
; expected with and without enabling the VPlan H-CFG Verifier.
diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
index a610f0669f483..71bcd90304e43 100644
--- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -73,7 +73,7 @@ for.end15:
; CHECK-LABEL: case2
; CHECK: LV: Loop hints: force=enabled width=0 interleave=0
; CHECK: LV: We can vectorize this outer loop!
-; CHECK: LV: VPlan computed VF 1.
+; CHECK: LV: Using VF 1 to build VPlans.
define void @case2(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) {
entry:
More information about the llvm-branch-commits
mailing list