[llvm] 2c391a5 - [LICM] Make Loop ICM profile aware again
Wenlei He via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 15 17:25:50 PDT 2020
Author: Wenlei He
Date: 2020-09-15T17:21:58-07:00
New Revision: 2c391a5a14aeb34e970aba85c5aa540656fe47ca
URL: https://github.com/llvm/llvm-project/commit/2c391a5a14aeb34e970aba85c5aa540656fe47ca
DIFF: https://github.com/llvm/llvm-project/commit/2c391a5a14aeb34e970aba85c5aa540656fe47ca.diff
LOG: [LICM] Make Loop ICM profile aware again
D65060 was reverted because it introduced non-determinism by using BFI counts from already freed blocks. The parent of this revision fixes that by using a VH callback on blocks to prevent this from happening and makes sure BFI data is passed correctly in LoopStandardAnalysisResults.
This re-introduces the previous optimization of using BFI data to prevent LICM from hoisting/sinking if the instruction will end up moving to a colder block.
Internally at Facebook this change results in a ~7% win in a CPU related metric in one of our big services by preventing hoisting cold code into a hot pre-header like the added test case demonstrates.
Testing:
ninja check
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D87551
Added:
llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof
llvm/test/Transforms/LICM/no-hoist-prof.ll
Modified:
llvm/include/llvm/Transforms/Utils/LoopUtils.h
llvm/lib/Passes/PassBuilder.cpp
llvm/lib/Transforms/Scalar/LICM.cpp
llvm/test/Transforms/LICM/sink.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 70c8c84c857b..cf0982d270b8 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -26,6 +26,7 @@ class AAResults;
class AliasSet;
class AliasSetTracker;
class BasicBlock;
+class BlockFrequencyInfo;
class IRBuilderBase;
class Loop;
class LoopInfo;
@@ -123,12 +124,13 @@ struct SinkAndHoistLICMFlags {
/// reverse depth first order w.r.t the DominatorTree. This allows us to visit
/// uses before definitions, allowing us to sink a loop body in one pass without
/// iteration. Takes DomTreeNode, AAResults, LoopInfo, DominatorTree,
-/// TargetLibraryInfo, Loop, AliasSet information for all
+/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all
/// instructions of the loop and loop safety information as
/// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
- TargetLibraryInfo *, TargetTransformInfo *, Loop *,
- AliasSetTracker *, MemorySSAUpdater *, ICFLoopSafetyInfo *,
+ BlockFrequencyInfo *, TargetLibraryInfo *,
+ TargetTransformInfo *, Loop *, AliasSetTracker *,
+ MemorySSAUpdater *, ICFLoopSafetyInfo *,
SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *);
/// Walk the specified region of the CFG (defined by all blocks
@@ -136,13 +138,14 @@ bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
/// first order w.r.t the DominatorTree. This allows us to visit definitions
/// before uses, allowing us to hoist a loop body in one pass without iteration.
/// Takes DomTreeNode, AAResults, LoopInfo, DominatorTree,
-/// TargetLibraryInfo, Loop, AliasSet information for all instructions of the
-/// loop and loop safety information as arguments. Diagnostics is emitted via \p
-/// ORE. It returns changed status.
+/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all
+/// instructions of the loop and loop safety information as arguments.
+/// Diagnostics is emitted via \p ORE. It returns changed status.
bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
- TargetLibraryInfo *, Loop *, AliasSetTracker *,
- MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *,
- SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *);
+ BlockFrequencyInfo *, TargetLibraryInfo *, Loop *,
+ AliasSetTracker *, MemorySSAUpdater *, ScalarEvolution *,
+ ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
+ OptimizationRemarkEmitter *);
/// This function deletes dead loops. The caller of this function needs to
/// guarantee that the loop is infact dead.
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index ddbc7a2fb4d5..1f43b5e6538e 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -2429,9 +2429,11 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
return Err;
// Add the nested pass manager with the appropriate adaptor.
bool UseMemorySSA = (Name == "loop-mssa");
- FPM.addPass(createFunctionToLoopPassAdaptor(
- std::move(LPM), UseMemorySSA, /*UseBlockFrequencyInfo=*/false,
- DebugLogging));
+ bool UseBFI =
+ std::any_of(InnerPipeline.begin(), InnerPipeline.end(),
+ [](auto Pipeline) { return Pipeline.Name == "licm"; });
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA,
+ UseBFI, DebugLogging));
return Error::success();
}
if (auto Count = parseRepeatPassName(Name)) {
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 841badba0834..a8fe8280a9ce 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -35,6 +35,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -99,6 +100,11 @@ static cl::opt<bool> ControlFlowHoisting(
"licm-control-flow-hoisting", cl::Hidden, cl::init(false),
cl::desc("Enable control flow (and PHI) hoisting in LICM"));
+static cl::opt<unsigned> HoistSinkColdnessThreshold(
+ "licm-coldness-threshold", cl::Hidden, cl::init(4),
+ cl::desc("Relative coldness Threshold of hoisting/sinking destination "
+ "block for LICM to be considered beneficial"));
+
static cl::opt<uint32_t> MaxNumUsesTraversed(
"licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
cl::desc("Max num uses visited for identifying load "
@@ -144,8 +150,9 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
OptimizationRemarkEmitter *ORE);
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
- const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE);
+ BlockFrequencyInfo *BFI, const Loop *CurLoop,
+ ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
+ OptimizationRemarkEmitter *ORE);
static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
const Loop *CurLoop,
@@ -356,12 +363,13 @@ bool LoopInvariantCodeMotion::runOnLoop(
LicmMssaOptCap, LicmMssaNoAccForPromotionCap,
/*IsSink=*/true};
if (L->hasDedicatedExits())
- Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
- CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE);
+ Changed |=
+ sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L,
+ CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE);
Flags.IsSink = false;
if (Preheader)
Changed |=
- hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+ hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L,
CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE);
// Now that all loop invariants have been removed from the loop, promote any
@@ -458,10 +466,10 @@ bool LoopInvariantCodeMotion::runOnLoop(
/// definitions, allowing us to sink a loop body in one pass without iteration.
///
bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
- DominatorTree *DT, TargetLibraryInfo *TLI,
- TargetTransformInfo *TTI, Loop *CurLoop,
- AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ICFLoopSafetyInfo *SafetyInfo,
+ DominatorTree *DT, BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo,
SinkAndHoistLICMFlags &Flags,
OptimizationRemarkEmitter *ORE) {
@@ -510,7 +518,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
ORE)) {
- if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) {
+ if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) {
if (!FreeInLoop) {
++II;
salvageDebugInfo(I);
@@ -755,13 +763,43 @@ class ControlFlowHoister {
};
} // namespace
+// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only
+// only worthwhile if the destination block is actually colder than current
+// block.
+static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock,
+ OptimizationRemarkEmitter *ORE,
+ BlockFrequencyInfo *BFI) {
+ // Check block frequency only when runtime profile is available
+ // to avoid pathological cases. With static profile, lean towards
+ // hosting because it helps canonicalize the loop for vectorizer.
+ if (!DstBlock->getParent()->hasProfileData())
+ return true;
+
+ if (!HoistSinkColdnessThreshold || !BFI)
+ return true;
+
+ BasicBlock *SrcBlock = I.getParent();
+ if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold >
+ BFI->getBlockFreq(SrcBlock).getFrequency()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I)
+ << "failed to sink or hoist instruction because containing block "
+ "has lower frequency than destination block";
+ });
+ return false;
+ }
+
+ return true;
+}
+
/// Walk the specified region of the CFG (defined by all blocks dominated by
/// the specified block, and that are in the current loop) in depth first
/// order w.r.t the DominatorTree. This allows us to visit definitions before
/// uses, allowing us to hoist a loop body in one pass without iteration.
///
bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
- DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ DominatorTree *DT, BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI, Loop *CurLoop,
AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
SinkAndHoistLICMFlags &Flags,
@@ -812,13 +850,15 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
// Try hoisting the instruction out to the preheader. We can only do
// this if all of the operands of the instruction are loop invariant and
- // if it is safe to hoist the instruction.
+ // if it is safe to hoist the instruction. We also check block frequency
+ // to make sure instruction only gets hoisted into colder blocks.
// TODO: It may be safe to hoist if we are hoisting to a conditional block
// and we have accurately duplicated the control flow from the loop header
// to that block.
if (CurLoop->hasLoopInvariantOperands(&I) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
ORE) &&
+ worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) &&
isSafeToExecuteUnconditionally(
I, DT, CurLoop, SafetyInfo, ORE,
CurLoop->getLoopPreheader()->getTerminator())) {
@@ -1554,8 +1594,9 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
/// position, and may either delete it or move it to outside of the loop.
///
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
- const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) {
+ BlockFrequencyInfo *BFI, const Loop *CurLoop,
+ ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
+ OptimizationRemarkEmitter *ORE) {
LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
@@ -1631,7 +1672,10 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
// If this instruction is only used outside of the loop, then all users are
// PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
// the instruction.
+ // First check if I is worth sinking for all uses. Sink only when it is worth
+ // across all uses.
SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+ SmallVector<PHINode *, 8> ExitPNs;
for (auto *UI : Users) {
auto *User = cast<Instruction>(UI);
@@ -1641,6 +1685,15 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
PHINode *PN = cast<PHINode>(User);
assert(ExitBlockSet.count(PN->getParent()) &&
"The LCSSA PHI is not in an exit block!");
+ if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) {
+ return Changed;
+ }
+
+ ExitPNs.push_back(PN);
+ }
+
+ for (auto *PN : ExitPNs) {
+
// The PHI must be trivially replaceable.
Instruction *New = sinkThroughTriviallyReplaceablePHI(
PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU);
diff --git a/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof b/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof
new file mode 100644
index 000000000000..c1b2ee0873c0
--- /dev/null
+++ b/llvm/test/Transforms/LICM/Inputs/no-hoist-prof.prof
@@ -0,0 +1,7 @@
+_Z3fooii:200:1
+ 0: 1
+ 1: 1 _Z3bari:1
+ 2: 200
+ 3: 200
+ 4: 0
+ 5: 1
diff --git a/llvm/test/Transforms/LICM/no-hoist-prof.ll b/llvm/test/Transforms/LICM/no-hoist-prof.ll
new file mode 100644
index 000000000000..1b18aa3c288e
--- /dev/null
+++ b/llvm/test/Transforms/LICM/no-hoist-prof.ll
@@ -0,0 +1,88 @@
+; RUN: opt -enable-new-pm=1 -sample-profile -licm -S -sample-profile-file='%S/Inputs/no-hoist-prof.prof' < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM
+; RUN: opt -passes=licm -S < %s | FileCheck %s --check-prefix=CHECK-LICM
+
+; Original source code:
+;
+; int bar(int);
+; int foo(int iter, int explode) {
+; int base = bar(explode);
+; for (int i = 0; i != iter; ++i)
+; if (i == explode)
+; iter = (base * base) + bar(iter);
+; return iter;
+; }
+
+; We need debug information in this .ll in order to leverage the pgo file, so:
+; .ll generated by running `clang++ -O3 -g -S -emit-llvm`, then:
+; - move hoisted mul back into cold section
+; - give labels names
+; - reindex variables
+; - remove metadata calls, attributes, module header
+; - remove unnecessary metadata
+
+; CHECK-LICM: .l.check.preheader:{{.*}}
+; CHECK-LICM-NEXT: {{.*}} = mul {{.*}}
+; CHECK-LICM-NEXT: br{{.*}}
+
+; CHECK-BFI-LICM: .l.cold:{{.*}}
+; CHECK-BFI-LICM-NEXT: {{.*}} = mul {{.*}}
+
+define dso_local i32 @_Z3fooii(i32, i32) local_unnamed_addr #0 !dbg !7 {
+ %3 = tail call i32 @_Z3bari(i32 %1), !dbg !19
+ %4 = icmp eq i32 %0, 0, !dbg !22
+ br i1 %4, label %.l.ret, label %.l.check.preheader, !dbg !24
+
+.l.check.preheader:
+ br label %.l.check, !dbg !24
+
+.l.ret:
+ %5 = phi i32 [ 0, %2 ], [ %12, %.l.iterate ]
+ ret i32 %5, !dbg !25
+
+.l.check:
+ %6 = phi i32 [ 0, %.l.check.preheader ], [ %13, %.l.iterate ]
+ %7 = phi i32 [ %0, %.l.check.preheader ], [ %12, %.l.iterate ]
+ %8 = icmp eq i32 %6, %1, !dbg !26
+ br i1 %8, label %.l.cold, label %.l.iterate, !dbg !28
+
+.l.cold:
+ %9 = mul nsw i32 %3, %3
+ %10 = tail call i32 @_Z3bari(i32 %7), !dbg !29
+ %11 = add nsw i32 %10, %9, !dbg !30
+ br label %.l.iterate, !dbg !31
+
+.l.iterate:
+ %12 = phi i32 [ %11, %.l.cold ], [ %7, %.l.check ]
+ %13 = add nuw nsw i32 %6, 1, !dbg !32
+ %14 = icmp eq i32 %13, %12, !dbg !22
+ br i1 %14, label %.l.ret, label %.l.check, !dbg !24, !llvm.loop !33
+}
+
+attributes #0 = { "use-sample-profile" }
+
+declare dso_local i32 @_Z3bari(i32) local_unnamed_addr #1
+
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.20181009 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, nameTableKind: None)
+!1 = !DIFile(filename: "foo.cpp", directory: "/tmp/gather_pgo")
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !10, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!16 = distinct !DILexicalBlock(scope: !7, file: !1, line: 4, column: 3)
+!19 = !DILocation(line: 3, column: 14, scope: !7)
+!22 = !DILocation(line: 4, column: 21, scope: !23)
+!23 = distinct !DILexicalBlock(scope: !16, file: !1, line: 4, column: 3)
+!24 = !DILocation(line: 4, column: 3, scope: !16)
+!25 = !DILocation(line: 7, column: 3, scope: !7)
+!26 = !DILocation(line: 5, column: 11, scope: !27)
+!27 = distinct !DILexicalBlock(scope: !23, file: !1, line: 5, column: 9)
+!28 = !DILocation(line: 5, column: 9, scope: !23)
+!29 = !DILocation(line: 6, column: 30, scope: !27)
+!30 = !DILocation(line: 6, column: 28, scope: !27)
+!31 = !DILocation(line: 6, column: 7, scope: !27)
+!32 = !DILocation(line: 4, column: 30, scope: !23)
+!33 = distinct !{!33, !24, !34}
+!34 = !DILocation(line: 6, column: 38, scope: !16)
diff --git a/llvm/test/Transforms/LICM/sink.ll b/llvm/test/Transforms/LICM/sink.ll
index 17170f5af196..8a5da47847c8 100644
--- a/llvm/test/Transforms/LICM/sink.ll
+++ b/llvm/test/Transforms/LICM/sink.ll
@@ -1,8 +1,10 @@
-; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM
+; RUN: opt -S -licm -licm-coldness-threshold=0 < %s | FileCheck %s --check-prefix=CHECK-LICM
+; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM
; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK
; RUN: opt -S < %s -passes='require<opt-remark-emit>,loop(licm),loop-sink' \
; RUN: | FileCheck %s --check-prefix=CHECK-SINK
-; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM
+; RUN: opt -S -licm -licm-coldness-threshold=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM
+; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM
; Original source code:
; int g;
@@ -29,6 +31,10 @@ define i32 @foo(i32, i32) #0 !prof !2 {
; CHECK-LICM: load i32, i32* @g
; CHECK-LICM: br label %.lr.ph
+; CHECK-BFI-LICM: .lr.ph.preheader:
+; CHECK-BFI-LICM-NOT: load i32, i32* @g
+; CHECK-BFI-LICM: br label %.lr.ph
+
.lr.ph:
%.03 = phi i32 [ %8, %.combine ], [ 0, %.lr.ph.preheader ]
%.012 = phi i32 [ %.1, %.combine ], [ %1, %.lr.ph.preheader ]
More information about the llvm-commits
mailing list