[llvm] 30c382a - [PGO][PGSO] Add profile guided size optimization to loop vectorization legality.
Hiroshi Yamauchi via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 15 11:50:23 PDT 2020
Author: Hiroshi Yamauchi
Date: 2020-07-15T11:49:36-07:00
New Revision: 30c382a7c6607a7d898730f8d288768110cdf1d2
URL: https://github.com/llvm/llvm-project/commit/30c382a7c6607a7d898730f8d288768110cdf1d2
DIFF: https://github.com/llvm/llvm-project/commit/30c382a7c6607a7d898730f8d288768110cdf1d2.diff
LOG: [PGO][PGSO] Add profile guided size optimization to loop vectorization legality.
Differential Revision: https://reviews.llvm.org/D83329
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/optsize.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index c6c3450f7760..7235aa586112 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -202,9 +202,10 @@ class LoopVectorizationLegality {
Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
LoopInfo *LI, OptimizationRemarkEmitter *ORE,
LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
- AssumptionCache *AC)
+ AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
: TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
- GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+ GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC),
+ BFI(BFI), PSI(PSI) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
@@ -478,6 +479,10 @@ class LoopVectorizationLegality {
/// Assume instructions in predicated blocks must be dropped if the CFG gets
/// flattened.
SmallPtrSet<Instruction *, 8> ConditionalAssumes;
+
+ /// BFI and PSI are used to check for profile guided size optimizations.
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 23613775d896..120b544808be 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
using namespace llvm;
@@ -412,7 +413,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
const ValueToValueMap &Strides =
getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
- bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize();
+ Function *F = TheLoop->getHeader()->getParent();
+ bool OptForSize = F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ bool CanAddPredicate = !OptForSize;
int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
if (Stride == 1 || Stride == -1)
return Stride;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35af8e425778..5e3c5a69cd90 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -395,11 +395,13 @@ class InnerLoopVectorizer {
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, unsigned VecWidth,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
- LoopVectorizationCostModel *CM)
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()),
- VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+ VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
+ BFI(BFI), PSI(PSI) {}
virtual ~InnerLoopVectorizer() = default;
/// Create a new empty loop. Unlink the old loop and connect the new one.
@@ -779,6 +781,10 @@ class InnerLoopVectorizer {
// Vector of original scalar PHIs whose corresponding widened PHIs need to be
// fixed up at the end of vector code generation.
SmallVector<PHINode *, 8> OrigPHIsToFix;
+
+ /// BFI and PSI are used to check for profile guided size optimizations.
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -789,9 +795,10 @@ class InnerLoopUnroller : public InnerLoopVectorizer {
const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
LoopVectorizationLegality *LVL,
- LoopVectorizationCostModel *CM)
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
- UnrollFactor, LVL, CM) {}
+ UnrollFactor, LVL, CM, BFI, PSI) {}
private:
Value *getBroadcastInstrs(Value *V) override;
@@ -2754,7 +2761,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
if (C->isZero())
return;
- assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
+ assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass)) &&
"Cannot SCEV check stride or overflow when optimizing for size");
SCEVCheckBlock->setName("vector.scevcheck");
@@ -2800,7 +2809,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
"claimed checks are required");
- if (MemCheckBlock->getParent()->hasOptSize()) {
+ if (MemCheckBlock->getParent()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass)) {
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
"Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.");
@@ -7691,7 +7702,7 @@ static bool processLoopInVPlanNativePath(
LVP.setBestPlan(VF.Width, 1);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
- &CM);
+ &CM, BFI, PSI);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(LB, DT);
@@ -7755,7 +7766,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
- &Requirements, &Hints, DB, AC);
+ &Requirements, &Hints, DB, AC, BFI, PSI);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Hints.emitRemarkWithHints();
@@ -7955,8 +7966,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
- InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
- &CM);
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
+ BFI, PSI);
LVP.executePlan(Unroller, DT);
ORE->emit([&]() {
@@ -7968,7 +7979,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
- &LVL, &CM);
+ &LVL, &CM, BFI, PSI);
LVP.executePlan(LB, DT);
++LoopsVectorized;
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index 0e88f362746f..b4233e6751cb 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -121,6 +121,38 @@ for.body29:
br i1 %cmp26, label %for.body29, label %for.cond.cleanup28
}
+define void @pr43371_pgso() !prof !14 {
+;
+; CHECK-LABEL: @pr43371_pgso
+; CHECK-NOT: vector.scevcheck
+;
+; We do not want to generate SCEV predicates when optimising for size, because
+; that will lead to extra code generation such as the SCEV overflow runtime
+; checks. Not generating SCEV predicates can still result in vectorisation as
+; the non-consecutive loads/stores can be scalarized:
+;
+; CHECK: vector.body:
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: br i1 {{.*}}, label %vector.body
+;
+entry:
+ br label %for.body29
+
+for.cond.cleanup28:
+ unreachable
+
+for.body29:
+ %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29]
+ %add33 = add i16 undef, %i24.0170
+ %idxprom34 = zext i16 %add33 to i32
+ %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34
+ store i16 0, i16 * %arrayidx35, align 1
+ %inc37 = add i16 %i24.0170, 1
+ %cmp26 = icmp ult i16 %inc37, 756
+ br i1 %cmp26, label %for.body29, label %for.cond.cleanup28
+}
+
; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out.
;
define i32 @pr45526() optsize {
@@ -154,6 +186,37 @@ exit:
ret i32 %for
}
+define i32 @pr45526_pgso() !prof !14 {
+;
+; CHECK-LABEL: @pr45526_pgso
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %loop
+; CHECK-EMPTY:
+; CHECK-NEXT: loop:
+; CHECK-NEXT: %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT: %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT: %pivPlus1 = add nuw nsw i32 %piv, 1
+; CHECK-NEXT: %cond = icmp ult i32 %piv, 510
+; CHECK-NEXT: br i1 %cond, label %loop, label %exit
+; CHECK-EMPTY:
+; CHECK-NEXT: exit:
+; CHECK-NEXT: %for.lcssa = phi i32 [ %for, %loop ]
+; CHECK-NEXT: ret i32 %for.lcssa
+;
+entry:
+ br label %loop
+
+loop:
+ %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ]
+ %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ]
+ %pivPlus1 = add nuw nsw i32 %piv, 1
+ %cond = icmp ult i32 %piv, 510
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret i32 %for
+}
+
; PR46228: Vectorize w/o versioning for unit stride under optsize and enabled
; vectorization.
@@ -190,7 +253,7 @@ define void @stride1(i16* noalias %B, i32 %BStride) optsize {
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !21
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
More information about the llvm-commits mailing list