[llvm] 25fbe80 - [LV] Move runtime pointer size check to LVP::plan().
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 29 06:20:13 PDT 2021
Author: Florian Hahn
Date: 2021-03-29T14:12:29+01:00
New Revision: 25fbe803d4dbcf8ff3a3a9ca161f5b9a68353ed0
URL: https://github.com/llvm/llvm-project/commit/25fbe803d4dbcf8ff3a3a9ca161f5b9a68353ed0
DIFF: https://github.com/llvm/llvm-project/commit/25fbe803d4dbcf8ff3a3a9ca161f5b9a68353ed0.diff
LOG: [LV] Move runtime pointer size check to LVP::plan().
This removes the need for the remaining doesNotMeet check and instead
directly checks if there are too many runtime checks for vectorization
in the planner.
A subsequent patch will adjust the logic used to decide whether to
vectorize with runtime checks, so that it considers their cost more accurately.
Reviewed By: lebedev.ri
Differential Revision: https://reviews.llvm.org/D98634
Added:
llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/LTO/X86/diagnostic-handler-remarks.ll
Removed:
llvm/test/Transforms/LoopVectorize/runtime-limit.ll
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index f9a8be317bb6..bfaad81771ec 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -177,8 +177,6 @@ class LoopVectorizeHints {
/// followed by a non-expert user.
class LoopVectorizationRequirements {
public:
- LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
-
/// Track the 1st floating-point instruction that can not be reassociated.
void addExactFPMathInst(Instruction *I) {
if (I && !ExactFPMathInst)
@@ -187,19 +185,19 @@ class LoopVectorizationRequirements {
void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
- bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints);
Instruction *getExactFPInst() { return ExactFPMathInst; }
bool canVectorizeFPMath(const LoopVectorizeHints &Hints) const {
return !ExactFPMathInst || Hints.allowReordering();
}
+ unsigned getNumRuntimePointerChecks() const {
+ return NumRuntimePointerChecks;
+ }
+
private:
unsigned NumRuntimePointerChecks = 0;
Instruction *ExactFPMathInst = nullptr;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter &ORE;
};
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 939fbe3e3601..016f61af791d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -37,11 +37,8 @@ static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
-
+// TODO: Move size-based thresholds out of legality checking, make cost based
+// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
"vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
cl::desc("The maximum number of SCEV checks allowed."));
@@ -246,32 +243,6 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
}
}
-bool LoopVectorizationRequirements::doesNotMeet(
- Function *F, Loop *L, const LoopVectorizeHints &Hints) {
- const char *PassName = Hints.vectorizeAnalysisPassName();
- bool Failed = false;
-
- // Test if runtime memcheck thresholds are exceeded.
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
- L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Failed = true;
- }
-
- return Failed;
-}
-
// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 70e1226e0ebf..d5306e556e14 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -34,6 +34,9 @@ namespace llvm {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class PredicatedScalarEvolution;
+class LoopVectorizationRequirements;
+class LoopVectorizeHints;
+class OptimizationRemarkEmitter;
class VPRecipeBuilder;
/// VPlan-based builder utility analogous to IRBuilder.
@@ -220,6 +223,12 @@ class LoopVectorizationPlanner {
PredicatedScalarEvolution &PSE;
+ const LoopVectorizeHints &Hints;
+
+ LoopVectorizationRequirements &Requirements;
+
+ OptimizationRemarkEmitter *ORE;
+
SmallVector<VPlanPtr, 4> VPlans;
/// A builder used to construct the current plan.
@@ -237,9 +246,12 @@ class LoopVectorizationPlanner {
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
InterleavedAccessInfo &IAI,
- PredicatedScalarEvolution &PSE)
+ PredicatedScalarEvolution &PSE,
+ const LoopVectorizeHints &Hints,
+ LoopVectorizationRequirements &Requirements,
+ OptimizationRemarkEmitter *ORE)
: OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
- PSE(PSE) {}
+ PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {}
/// Plan how to best vectorize, return the best VF and its cost, or None if
/// vectorization and interleaving should be avoided up front.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 077b7867ebc9..039aa89b98fc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -197,6 +197,11 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -7774,7 +7779,30 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- return CM.selectVectorizationFactor(MaxVF);
+ auto SelectedVF = CM.selectVectorizationFactor(MaxVF);
+
+ // Check if it is profitable to vectorize with runtime checks.
+ unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+ if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CantReorderMemOps",
+ OrigLoop->getStartLoc(),
+ OrigLoop->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Hints.emitRemarkWithHints();
+ return VectorizationFactor::Disabled();
+ }
+ }
+ return SelectedVF;
}
void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
@@ -9391,7 +9419,8 @@ static bool processLoopInVPlanNativePath(
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
+ ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
+ LoopVectorizationRequirements &Requirements) {
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
@@ -9409,7 +9438,8 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
+ Requirements, ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -9537,7 +9567,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
PredicatedScalarEvolution PSE(*SE, *L);
// Check if it is legal to vectorize the loop.
- LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationRequirements Requirements;
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC, BFI, PSI);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
@@ -9558,7 +9588,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// pipeline.
if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
- ORE, BFI, PSI, Hints);
+ ORE, BFI, PSI, Hints, Requirements);
assert(L->isInnermost() && "Inner loop expected.");
@@ -9637,7 +9667,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
+ Requirements, ORE);
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
@@ -9658,13 +9689,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Identify the diagnostic messages that should be produced.
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
- if (Requirements.doesNotMeet(F, L, Hints)) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
if (VF.Width.isScalar()) {
LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg = std::make_pair(
diff --git a/llvm/test/LTO/X86/diagnostic-handler-remarks.ll b/llvm/test/LTO/X86/diagnostic-handler-remarks.ll
index f38293db93e1..85e4a624792b 100644
--- a/llvm/test/LTO/X86/diagnostic-handler-remarks.ll
+++ b/llvm/test/LTO/X86/diagnostic-handler-remarks.ll
@@ -6,14 +6,14 @@
; Confirm that there are -pass-remarks.
; RUN: llvm-lto -use-new-pm=false \
; RUN: -pass-remarks=inline \
-; RUN: -exported-symbol _func2 -pass-remarks-analysis=loop-vectorize \
+; RUN: -exported-symbol _func2 -pass-remarks-missed=loop-vectorize \
; RUN: -exported-symbol _main -o %t.o %t.bc 2>&1 | \
; RUN: FileCheck %s -allow-empty -check-prefix=REMARKS
; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM
; RUN: llvm-lto -use-new-pm=false \
; RUN: -pass-remarks=inline -use-diagnostic-handler \
-; RUN: -exported-symbol _func2 -pass-remarks-analysis=loop-vectorize \
+; RUN: -exported-symbol _func2 -pass-remarks-missed=loop-vectorize \
; RUN: -exported-symbol _main -o %t.o %t.bc 2>&1 | \
; RUN: FileCheck %s -allow-empty -check-prefix=REMARKS_DH
; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
similarity index 86%
rename from llvm/test/Transforms/LoopVectorize/runtime-limit.ll
rename to llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
index a7f692cef170..7e93ef4792c9 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-limit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
@@ -1,17 +1,19 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
+; RUN: opt < %s -loop-vectorize -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
; First loop produced diagnostic pass remark.
-;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
; Second loop produces diagnostic analysis remark.
;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
; First loop produced diagnostic pass remark.
-;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
; Second loop produces diagnostic pass remark.
-;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
; We are vectorizing with 6 runtime checks.
;CHECK-LABEL: func1x6(
@@ -56,7 +58,7 @@ for.end: ; preds = %for.body
;CHECK: ret
; We vectorize with 12 checks if a vectorization hint is provided.
;OVERRIDE-LABEL: func2x6(
-;OVERRIDE: <4 x i32>
+;OVERRIDE-NOT: <4 x i32>
;OVERRIDE: ret
define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
entry:
More information about the llvm-commits mailing list