[llvm] d275a06 - [AutoFDO] Statistic for context sensitive profile guided inlining

Wenlei He via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 11 21:44:36 PST 2019


Author: Wenlei He
Date: 2019-12-11T21:37:21-08:00
New Revision: d275a064871763ab3a7712c74712d2fd1d0bef5d

URL: https://github.com/llvm/llvm-project/commit/d275a064871763ab3a7712c74712d2fd1d0bef5d
DIFF: https://github.com/llvm/llvm-project/commit/d275a064871763ab3a7712c74712d2fd1d0bef5d.diff

LOG: [AutoFDO] Statistic for context sensitive profile guided inlining

Summary: AutoFDO compilation has two places that do inlining - the sample profile loader that does inlining with context sensitive profile, and the regular inliner as CGSCC pass. Ideally we want most inlining to come from sample profile loader as that is driven by context sensitive profile and also retains context sensitivity after inlining. However the reality is most of the inlining actually happens during regular inliner. To track the number of inline instances from sample profile loader and help move more inlining to sample profile loader, I'm adding statistics and optimization remarks for sample profile loader's inlining.

Reviewers: wmi, davidxl

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70584

Added: 
    llvm/test/Transforms/SampleProfile/inline-stats.ll

Modified: 
    llvm/lib/Transforms/IPO/SampleProfile.cpp
    llvm/test/Transforms/SampleProfile/inline-coverage.ll
    llvm/test/Transforms/SampleProfile/remarks.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 40bcf43cab24..2b169638d409 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -30,6 +30,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -94,6 +95,12 @@ using namespace llvm;
 using namespace sampleprof;
 using ProfileCount = Function::ProfileCount;
 #define DEBUG_TYPE "sample-profile"
+#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
+
+STATISTIC(NumCSInlined,
+          "Number of functions inlined with context sensitive profile");
+STATISTIC(NumCSNotInlined,
+          "Number of functions not inlined with context sensitive profile");
 
 // Command line option to specify the file to read samples from. This is
 // mainly used for debugging.
@@ -330,6 +337,8 @@ class SampleProfileLoader {
                           DenseSet<GlobalValue::GUID> &InlinedGUIDs);
   // Inline cold/small functions in addition to hot ones
   bool shouldInlineColdCallee(Instruction &CallInst);
+  void emitOptimizationRemarksForInlineCandidates(
+    const SmallVector<Instruction *, 10> &Candidates, const Function &F, bool Hot);
   void printEdgeWeight(raw_ostream &OS, Edge E);
   void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
   void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
@@ -895,15 +904,15 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
       getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC,
                     None, nullptr, nullptr);
   if (Cost.isNever()) {
-    ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB)
+    ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
               << "incompatible inlining");
     return false;
   }
   InlineFunctionInfo IFI(nullptr, &GetAC);
   if (InlineFunction(CS, IFI)) {
     // The call to InlineFunction erases I, so we can't pass it here.
-    ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB)
-              << "inlined hot callee '" << ore::NV("Callee", CalledFunction)
+    ORE->emit(OptimizationRemark(CSINLINE_DEBUG, "InlineSuccess", DLoc, BB)
+              << "inlined callee '" << ore::NV("Callee", CalledFunction)
               << "' into '" << ore::NV("Caller", BB->getParent()) << "'");
     return true;
   }
@@ -925,6 +934,22 @@ bool SampleProfileLoader::shouldInlineColdCallee(Instruction &CallInst) {
   return Cost.getCost() <= SampleColdCallSiteThreshold;
 }
 
+void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
+    const SmallVector<Instruction *, 10> &Candidates, const Function &F,
+    bool Hot) {
+  for (auto I : Candidates) {
+    Function *CalledFunction = CallSite(I).getCalledFunction();
+    if (CalledFunction) {
+      ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt", 
+                                           I->getDebugLoc(), I->getParent())
+                << "previous inlining reattempted for "
+                << (Hot ? "hotness: '" : "size: '")
+                << ore::NV("Callee", CalledFunction) << "' into '"
+                << ore::NV("Caller", &F) << "'");
+    }
+  }
+}
+
 /// Iteratively inline hot callsites of a function.
 ///
 /// Iteratively traverse all callsites of the function \p F, and find if
@@ -974,9 +999,11 @@ bool SampleProfileLoader::inlineHotFunctions(
       }
       if (Hot) {
         CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
+        emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
       }
       else {
         CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
+        emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
       }
     }
     for (auto I : CIS) {
@@ -1022,6 +1049,7 @@ bool SampleProfileLoader::inlineHotFunctions(
                 inlineCallInstruction(DI)) {
               localNotInlinedCallSites.erase(I);
               LocalChanged = true;
+              ++NumCSInlined;
             }
           } else {
             LLVM_DEBUG(dbgs()
@@ -1034,6 +1062,7 @@ bool SampleProfileLoader::inlineHotFunctions(
         if (inlineCallInstruction(I)) {
           localNotInlinedCallSites.erase(I);
           LocalChanged = true;
+          ++NumCSInlined;
         }
       } else if (IsThinLTOPreLink) {
         findCalleeFunctionSamples(*I)->findInlinedFunctions(
@@ -1053,6 +1082,14 @@ bool SampleProfileLoader::inlineHotFunctions(
     Function *Callee = CallSite(I).getCalledFunction();
     if (!Callee || Callee->isDeclaration())
       continue;
+
+    ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
+                                         I->getDebugLoc(), I->getParent())
+              << "previous inlining not repeated: '"
+              << ore::NV("Callee", Callee) << "' into '"
+              << ore::NV("Caller", &F) << "'");
+
+    ++NumCSNotInlined;
     const FunctionSamples *FS = Pair.getSecond();
     if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
       continue;

diff  --git a/llvm/test/Transforms/SampleProfile/inline-coverage.ll b/llvm/test/Transforms/SampleProfile/inline-coverage.ll
index 7e189545190e..134bbe8b80d0 100644
--- a/llvm/test/Transforms/SampleProfile/inline-coverage.ll
+++ b/llvm/test/Transforms/SampleProfile/inline-coverage.ll
@@ -16,7 +16,7 @@
 ;    11      return sum > 0 ? 0 : 1;
 ;    12    }
 ;
-; CHECK: remark: coverage.cc:10:12: inlined hot callee '_Z3fool' into 'main'
+; CHECK: remark: coverage.cc:10:12: previous inlining reattempted for hotness: '_Z3fool' into 'main'
 ; CHECK: remark: coverage.cc:9:21: Applied 23478 samples from profile (offset: 2.1)
 ; CHECK: remark: coverage.cc:10:16: Applied 23478 samples from profile (offset: 3)
 ; CHECK: remark: coverage.cc:4:10: Applied 31878 samples from profile (offset: 1)

diff  --git a/llvm/test/Transforms/SampleProfile/inline-stats.ll b/llvm/test/Transforms/SampleProfile/inline-stats.ll
new file mode 100644
index 000000000000..f928aed6a97d
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/inline-stats.ll
@@ -0,0 +1,104 @@
+; REQUIRES: asserts
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -stats -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof -stats -S 2>&1 | FileCheck %s
+
+; Original C++ test case
+;
+; #include <stdio.h>
+;
+; int sum(int x, int y) {
+;   return x + y;
+; }
+;
+; int main() {
+;   int s, i = 0;
+;   while (i++ < 20000 * 20000)
+;     if (i != 100) s = sum(i, s); else s = 30;
+;   printf("sum is %d\n", s);
+;   return 0;
+; }
+;
+ at .str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
+define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !6 {
+entry:
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  %tmp = load i32, i32* %x.addr, align 4, !dbg !8
+  %tmp1 = load i32, i32* %y.addr, align 4, !dbg !8
+  %add = add nsw i32 %tmp, %tmp1, !dbg !8
+  ret i32 %add, !dbg !8
+}
+define i32 @main() !dbg !9 {
+entry:
+  %retval = alloca i32, align 4
+  %s = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4, !dbg !10
+  br label %while.cond, !dbg !11
+
+while.cond:                                       ; preds = %if.end, %entry
+  %tmp = load i32, i32* %i, align 4, !dbg !12
+  %inc = add nsw i32 %tmp, 1, !dbg !12
+  store i32 %inc, i32* %i, align 4, !dbg !12
+  %cmp = icmp slt i32 %tmp, 400000000, !dbg !12
+  br i1 %cmp, label %while.body, label %while.end, !dbg !12
+
+while.body:                                       ; preds = %while.cond
+  %tmp1 = load i32, i32* %i, align 4, !dbg !14
+  %cmp1 = icmp ne i32 %tmp1, 100, !dbg !14
+  br i1 %cmp1, label %if.then, label %if.else, !dbg !14
+
+if.then:                                          ; preds = %while.body
+  %tmp2 = load i32, i32* %i, align 4, !dbg !16
+  %tmp3 = load i32, i32* %s, align 4, !dbg !16
+  %call = call i32 @_Z3sumii(i32 %tmp2, i32 %tmp3), !dbg !16
+  store i32 %call, i32* %s, align 4, !dbg !16
+  br label %if.end, !dbg !16
+
+if.else:                                          ; preds = %while.body
+  store i32 30, i32* %s, align 4, !dbg !18
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %while.cond, !dbg !20
+
+while.end:                                        ; preds = %while.cond
+  %tmp4 = load i32, i32* %s, align 4, !dbg !22
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %tmp4), !dbg !22
+  ret i32 0, !dbg !23
+}
+declare i32 @printf(i8*, ...)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "calls.cc", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 3.5 "}
+!6 = distinct !DISubprogram(name: "sum", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 4, scope: !6)
+!9 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !7, scopeLine: 7, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!10 = !DILocation(line: 8, scope: !9)
+!11 = !DILocation(line: 9, scope: !9)
+!12 = !DILocation(line: 9, scope: !13)
+!13 = !DILexicalBlockFile(scope: !9, file: !1, discriminator: 2)
+!14 = !DILocation(line: 10, scope: !15)
+!15 = distinct !DILexicalBlock(scope: !9, file: !1, line: 10)
+!16 = !DILocation(line: 10, scope: !17)
+!17 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
+!18 = !DILocation(line: 10, scope: !19)
+!19 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 4)
+!20 = !DILocation(line: 10, scope: !21)
+!21 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 6)
+!22 = !DILocation(line: 11, scope: !9)
+!23 = !DILocation(line: 12, scope: !9)
+
+; CHECK: 1 sample-profile - Number of functions inlined with context sensitive profile
\ No newline at end of file

diff  --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll
index 3ecaa534296e..89a4bbe46d00 100644
--- a/llvm/test/Transforms/SampleProfile/remarks.ll
+++ b/llvm/test/Transforms/SampleProfile/remarks.ll
@@ -21,7 +21,7 @@
 
 ; We are expecting foo() to be inlined in main() (almost all the cycles are
 ; spent inside foo).
-; CHECK: remark: remarks.cc:13:21: inlined hot callee '_Z3foov' into 'main'
+; CHECK: remark: remarks.cc:13:21: inlined callee '_Z3foov' into 'main'
 
 ; The back edge for the loop is the hottest edge in the loop subgraph.
 ; CHECK: remark: remarks.cc:6:9: most popular destination for conditional branches at remarks.cc:5:3
@@ -31,12 +31,12 @@
 
 ; Checking to see if YAML file is generated and contains remarks
 ;YAML:       --- !Passed
-;YAML-NEXT:  Pass:            sample-profile
-;YAML-NEXT:  Name:            HotInline
+;YAML-NEXT:  Pass:            sample-profile-inline
+;YAML-NEXT:  Name:            InlineSuccess
 ;YAML-NEXT:  DebugLoc:        { File: remarks.cc, Line: 13, Column: 21 }
 ;YAML-NEXT:  Function:        main
 ;YAML-NEXT:  Args:
-;YAML-NEXT:    - String:          'inlined hot callee '''
+;YAML-NEXT:    - String:          'inlined callee '''
 ;YAML-NEXT:    - Callee:          _Z3foov
 ;YAML-NEXT:      DebugLoc:        { File: remarks.cc, Line: 3, Column: 0 }
 ;YAML-NEXT:    - String:          ''' into '''


        


More information about the llvm-commits mailing list