[llvm] 7b61ae6 - [AutoFDO] Inline replay for cold/small callees from sample profile loader

Wenlei He via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 6 11:45:06 PST 2019


Author: Wenlei He
Date: 2019-12-06T11:44:45-08:00
New Revision: 7b61ae68ecd7a127e69c9e0d2563bddb7eccad7a

URL: https://github.com/llvm/llvm-project/commit/7b61ae68ecd7a127e69c9e0d2563bddb7eccad7a
DIFF: https://github.com/llvm/llvm-project/commit/7b61ae68ecd7a127e69c9e0d2563bddb7eccad7a.diff

LOG: [AutoFDO] Inline replay for cold/small callees from sample profile loader

Summary:
The sample profile loader of AutoFDO tries to replay previous inlining using the context-sensitive profile. The replay only repeats inlining if the call site block is hot. As a result, it punts on inlining small functions, some of which can be beneficial for size and will still be inlined by the CGSCC inliner later. This oscillation between the sample profile loader's inlining and regular CGSCC inlining causes unnecessary loss of context-sensitive profile. It doesn't have much impact on the inline decision itself, but it negatively affects post-inline profile quality, because the CGSCC inliner has to scale counts, which is not as accurate as using the original context-sensitive profile, and a bad post-inline profile can misguide code layout.

This change adds the regular inline cost calculation to the sample profile loader, so that small functions can be inlined upfront under the switch -sample-profile-inline-size. In addition, -sample-profile-cold-inline-threshold is added so that this separate size threshold can be tuned; its default is currently chosen to be the same as the regular inliner's cold call-site threshold.

Reviewers: wmi, davidxl

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70750

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/inline-cold.prof
    llvm/test/Transforms/SampleProfile/inline-cold.ll

Modified: 
    llvm/lib/Transforms/IPO/SampleProfile.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index c11eeda2aa75..40bcf43cab24 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -150,6 +150,15 @@ static cl::opt<bool> ProfileTopDownLoad(
     cl::desc("Do profile annotation and inlining for functions in top-down "
              "order of call graph during sample profile loading."));
 
+static cl::opt<bool> ProfileSizeInline(
+    "sample-profile-inline-size", cl::Hidden, cl::init(false),
+    cl::desc("Inline cold call sites in profile loader if it's beneficial "
+             "for code size."));
+
+static cl::opt<int> SampleColdCallSiteThreshold(
+    "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
+    cl::desc("Threshold for inlining cold callsites"));
+
 namespace {
 
 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
@@ -319,6 +328,8 @@ class SampleProfileLoader {
   bool inlineCallInstruction(Instruction *I);
   bool inlineHotFunctions(Function &F,
                           DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+  // Inline cold/small functions in addition to hot ones
+  bool shouldInlineColdCallee(Instruction &CallInst);
   void printEdgeWeight(raw_ostream &OS, Edge E);
   void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
   void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
@@ -899,6 +910,21 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
   return false;
 }
 
+bool SampleProfileLoader::shouldInlineColdCallee(Instruction &CallInst) {
+  if (!ProfileSizeInline)
+    return false;
+
+  Function *Callee = CallSite(&CallInst).getCalledFunction();
+  if (Callee == nullptr)
+    return false;
+
+  InlineCost Cost =
+      getInlineCost(cast<CallBase>(CallInst), getInlineParams(),
+                    GetTTI(*Callee), GetAC, None, nullptr, nullptr);
+
+  return Cost.getCost() <= SampleColdCallSiteThreshold;
+}
+
 /// Iteratively inline hot callsites of a function.
 ///
 /// Iteratively traverse all callsites of the function \p F, and find if
@@ -931,20 +957,26 @@ bool SampleProfileLoader::inlineHotFunctions(
     SmallVector<Instruction *, 10> CIS;
     for (auto &BB : F) {
       bool Hot = false;
-      SmallVector<Instruction *, 10> Candidates;
+      SmallVector<Instruction *, 10> AllCandidates;
+      SmallVector<Instruction *, 10> ColdCandidates;
       for (auto &I : BB.getInstList()) {
         const FunctionSamples *FS = nullptr;
         if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
             !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
-          Candidates.push_back(&I);
+          AllCandidates.push_back(&I);
           if (FS->getEntrySamples() > 0)
             localNotInlinedCallSites.try_emplace(&I, FS);
           if (callsiteIsHot(FS, PSI))
             Hot = true;
+          else if (shouldInlineColdCallee(I))
+            ColdCandidates.push_back(&I);
         }
       }
       if (Hot) {
-        CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end());
+        CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
+      }
+      else {
+        CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
       }
     }
     for (auto I : CIS) {

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-cold.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-cold.prof
new file mode 100644
index 000000000000..6d097b08bc68
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-cold.prof
@@ -0,0 +1,7 @@
+main:225715:0
+ 2.1: 5553
+ 3: 5391
+ 3.1: _Z3sumii:0
+  0: 0
+  1: 0
+  2: 0
\ No newline at end of file

diff  --git a/llvm/test/Transforms/SampleProfile/inline-cold.ll b/llvm/test/Transforms/SampleProfile/inline-cold.ll
new file mode 100644
index 000000000000..abec880d22ab
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/inline-cold.ll
@@ -0,0 +1,102 @@
+; Let sample profile loader replay inlining of small/cold functions
+
+; Make sure we don't inline the cold call sites by default
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -S | FileCheck -check-prefix=NOTINLINE %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -S | FileCheck -check-prefix=NOTINLINE %s
+
+; Make sure we inline cold call sites for size if requested
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -sample-profile-inline-size -S | FileCheck -check-prefix=INLINE %s
+
+; Make sure we re-inline everything if requested 
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=9999999 -S | FileCheck -check-prefix=INLINE %s
+
+; Make sure the separate size threshold for sample profile loader inlining works
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-cold.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=-500 -S | FileCheck -check-prefix=NOTINLINE %s
+
+@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
+
+define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !6 {
+entry:
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  %tmp = load i32, i32* %x.addr, align 4, !dbg !8
+  %tmp1 = load i32, i32* %y.addr, align 4, !dbg !8
+  %add = add nsw i32 %tmp, %tmp1, !dbg !8
+  ret i32 %add, !dbg !8
+}
+
+define i32 @main() !dbg !9 {
+entry:
+  %retval = alloca i32, align 4
+  %s = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4, !dbg !10
+  br label %while.cond, !dbg !11
+
+while.cond:                                       ; preds = %if.end, %entry
+  %tmp = load i32, i32* %i, align 4, !dbg !12
+  %inc = add nsw i32 %tmp, 1, !dbg !12
+  store i32 %inc, i32* %i, align 4, !dbg !12
+  %cmp = icmp slt i32 %tmp, 400000000, !dbg !12
+  br i1 %cmp, label %while.body, label %while.end, !dbg !12
+
+while.body:                                       ; preds = %while.cond
+  %tmp1 = load i32, i32* %i, align 4, !dbg !14
+  %cmp1 = icmp ne i32 %tmp1, 100, !dbg !14
+  br i1 %cmp1, label %if.then, label %if.else, !dbg !14
+
+if.then:                                          ; preds = %while.body
+  %tmp2 = load i32, i32* %i, align 4, !dbg !16
+  %tmp3 = load i32, i32* %s, align 4, !dbg !16
+  %call = call i32 @_Z3sumii(i32 %tmp2, i32 %tmp3), !dbg !16
+; INLINE-NOT: call i32 @_Z3sumii
+; NOTINLINE: call i32 @_Z3sumii
+  store i32 %call, i32* %s, align 4, !dbg !16
+  br label %if.end, !dbg !16
+
+if.else:                                          ; preds = %while.body
+  store i32 30, i32* %s, align 4, !dbg !18
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %while.cond, !dbg !20
+
+while.end:                                        ; preds = %while.cond
+  %tmp4 = load i32, i32* %s, align 4, !dbg !22
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %tmp4), !dbg !22
+  ret i32 0, !dbg !23
+}
+
+declare i32 @printf(i8*, ...)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "calls.cc", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 3.5 "}
+!6 = distinct !DISubprogram(name: "sum", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 4, scope: !6)
+!9 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !7, scopeLine: 7, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!10 = !DILocation(line: 8, scope: !9)
+!11 = !DILocation(line: 9, scope: !9)
+!12 = !DILocation(line: 9, scope: !13)
+!13 = !DILexicalBlockFile(scope: !9, file: !1, discriminator: 2)
+!14 = !DILocation(line: 10, scope: !15)
+!15 = distinct !DILexicalBlock(scope: !9, file: !1, line: 10)
+!16 = !DILocation(line: 10, scope: !17)
+!17 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
+!18 = !DILocation(line: 10, scope: !19)
+!19 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 4)
+!20 = !DILocation(line: 10, scope: !21)
+!21 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 6)
+!22 = !DILocation(line: 11, scope: !9)
+!23 = !DILocation(line: 12, scope: !9)


        


More information about the llvm-commits mailing list