[llvm] e503fd8 - [AutoFDO] Properly merge context-sensitive profile of inlinee back to outlined function

Wenlei He via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 5 15:59:37 PST 2019


Author: Wenlei He
Date: 2019-12-05T15:57:55-08:00
New Revision: e503fd85d3ac9d3e1493a7a63bc43c6939e132cc

URL: https://github.com/llvm/llvm-project/commit/e503fd85d3ac9d3e1493a7a63bc43c6939e132cc
DIFF: https://github.com/llvm/llvm-project/commit/e503fd85d3ac9d3e1493a7a63bc43c6939e132cc.diff

LOG: [AutoFDO] Properly merge context-sensitive profile of inlinee back to outlined function

Summary:
When the sample profile loader decides not to inline a previously inlined call site, we adjust the profile of the outlined function simply by scaling up its profile counts by the call-site count. This means the context-sensitive profile of that inlined instance is thrown away. This commit tries to keep the context-sensitive profile in such cases:

 - Instead of scaling the outlined function's profile, we now properly merge the FunctionSamples of the inlined instance into the outlined function, including all recursively inlined profiles.
 - Instead of adjusting the profile for a negative inline decision at the end of the sample profile loader pass, we do the profile merge right after processing each function. This change, paired with top-down ordering of annotation/inline-replay (a separate diff), makes sure we recursively merge profiles back before they are used for annotation and inline replay.

A new switch -sample-profile-merge-inlinee is added to enable the new profile merge for tuning. It should be the default behavior eventually.

Reviewers: wmi, davidxl

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70653

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof
    llvm/test/Transforms/SampleProfile/inline-mergeprof.ll

Modified: 
    llvm/include/llvm/ProfileData/SampleProf.h
    llvm/include/llvm/ProfileData/SampleProfReader.h
    llvm/lib/Transforms/IPO/SampleProfile.cpp
    llvm/test/CodeGen/X86/insert-prefetch-inline.afdo
    llvm/test/Transforms/SampleProfile/Inputs/einline.prof
    llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index 55418d9d0f9c..f8be89c569b7 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -387,7 +387,10 @@ class FunctionSamples {
     if (FS != iter->second.end())
       return &FS->second;
     // If we cannot find exact match of the callee name, return the FS with
-    // the max total count.
+    // the max total count. Only do this when CalleeName is not provided, 
+    // i.e., only for indirect calls.
+    if (!CalleeName.empty()) 
+      return nullptr;
     uint64_t MaxTotalSamples = 0;
     const FunctionSamples *R = nullptr;
     for (const auto &NameFS : iter->second)

diff  --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index 5a5d4cfde224..72b178edc260 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -358,6 +358,15 @@ class SampleProfileReader {
     return getSamplesFor(CanonName);
   }
 
+  /// Return the samples collected for function \p F, create empty
+  /// FunctionSamples if it doesn't exist.
+  FunctionSamples *getOrCreateSamplesFor(const Function &F) {
+    std::string FGUID;
+    StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
+    CanonName = getRepInFormat(CanonName, getFormat(), FGUID);
+    return &Profiles[CanonName];
+  }
+
   /// Return the samples collected for function \p F.
   virtual FunctionSamples *getSamplesFor(StringRef Fname) {
     if (Remapper) {

diff  --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 21461a609c94..0a3e6ada58b3 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -137,6 +137,11 @@ static cl::opt<bool> ProfileAccurateForSymsInList(
     cl::desc("For symbols in profile symbol list, regard their profiles to "
              "be accurate. It may be overriden by profile-sample-accurate. "));
 
+static cl::opt<bool> ProfileMergeInlinee(
+    "sample-profile-merge-inlinee", cl::Hidden, cl::init(false),
+    cl::desc("Merge past inlinee's profile to outline version if sample "
+             "profile loader decided not to inline a call site."));
+
 namespace {
 
 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
@@ -1008,9 +1013,26 @@ bool SampleProfileLoader::inlineHotFunctions(
     if (!Callee || Callee->isDeclaration())
       continue;
     const FunctionSamples *FS = Pair.getSecond();
-    auto pair =
-        notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
-    pair.first->second.entryCount += FS->getEntrySamples();
+    if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
+      continue;
+    }
+
+    if (ProfileMergeInlinee) {
+      // Use entry samples as head samples during the merge, as inlinees
+      // don't have head samples.
+      assert(FS->getHeadSamples() == 0 && "Expect 0 head sample for inlinee");
+      const_cast<FunctionSamples *>(FS)->addHeadSamples(FS->getEntrySamples());
+
+      // Note that we have to do the merge right after processing function.
+      // This allows OutlineFS's profile to be used for annotation during
+      // top-down processing of functions' annotation.
+      FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
+      OutlineFS->merge(*FS);
+    } else {
+      auto pair =
+          notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
+      pair.first->second.entryCount += FS->getEntrySamples();
+    }
   }
   return Changed;
 }

diff  --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo
index 83b30f6e210e..935b707ff107 100644
--- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo
+++ b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo
@@ -1,4 +1,4 @@
 caller:0:0
- 2:sum:0
+ 2: sum:0
   3: 0 __prefetch_nta_0:23456
   3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64
\ No newline at end of file

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/einline.prof b/llvm/test/Transforms/SampleProfile/Inputs/einline.prof
index 624990b47ef3..1b0f3922658c 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/einline.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/einline.prof
@@ -1,7 +1,7 @@
 _Z3foov:200:100
- 1: _Z3barv:0
+ 1: _ZL3barv:0
  2: no_inline:100
- 3: _Z3barv:100
+ 3: _ZL3barv:100
 recursive:200:100
  1: recursive:100
  2: recursive:100

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof
index b33f2bf998be..ecd931274729 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof
@@ -4,7 +4,7 @@ sample_loader_inlinee:3000:0
  1: direct_leaf_func:35000
   11: 3000
 test_cgscc_inline:63067:0
- 1: sample_loader_inlinee:1
+ 1: cgscc_inlinee:1
 cgscc_inlinee:3000:0
  1: direct_leaf_func:35000
   11: 3000
\ No newline at end of file

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof
new file mode 100644
index 000000000000..96ac2decb7ce
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof
@@ -0,0 +1,13 @@
+main:225715:0
+ 2.1: 5553
+ 3: 5391
+ 3.1: _Z3sumii:46
+  1: 23
+  2: _Z3subii:2
+   1: 2
+  3: 21
+
+_Z3sumii:11:22
+ 1: 11
+ 2: 10 _Z3subii:10
+ 3: 1
\ No newline at end of file

diff  --git a/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll
new file mode 100644
index 000000000000..8b5989f7a623
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll
@@ -0,0 +1,97 @@
+; Test we lose details of not inlined profile without '-sample-profile-merge-inlinee'
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -S | FileCheck -check-prefix=SCALE %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -S | FileCheck -check-prefix=SCALE %s
+
+; Test we properly merge not inlined profile properly with '-sample-profile-merge-inlinee'
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee -S | FileCheck -check-prefix=MERGE %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee -S | FileCheck -check-prefix=MERGE  %s
+
+@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
+
+define i32 @main() !dbg !6 {
+entry:
+  %retval = alloca i32, align 4
+  %s = alloca i32, align 4
+  %i = alloca i32, align 4
+  %tmp = load i32, i32* %i, align 4, !dbg !8
+  %tmp1 = load i32, i32* %s, align 4, !dbg !8
+  %call = call i32 @_Z3sumii(i32 %tmp, i32 %tmp1), !dbg !8
+; SCALE: call i32 @_Z3sumii
+; MERGE: call i32 @_Z3sumii
+  store i32 %call, i32* %s, align 4, !dbg !8
+  ret i32 0, !dbg !11
+}
+
+define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !12 {
+entry:
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  %tmp = load i32, i32* %x.addr, align 4, !dbg !13
+  %tmp1 = load i32, i32* %y.addr, align 4, !dbg !13
+  %add = add nsw i32 %tmp, %tmp1, !dbg !13
+  %tmp2 = load i32, i32* %x.addr, align 4, !dbg !13
+  %tmp3 = load i32, i32* %y.addr, align 4, !dbg !13
+  %cmp1 = icmp ne i32 %tmp3, 100, !dbg !13
+  br i1 %cmp1, label %if.then, label %if.else, !dbg !13
+
+if.then:                                          ; preds = %entry
+  %call = call i32 @_Z3subii(i32 %tmp2, i32 %tmp3), !dbg !14
+  ret i32 %add, !dbg !14
+
+if.else:                                          ; preds = %entry
+  ret i32 %add, !dbg !15
+}
+
+define i32 @_Z3subii(i32 %x, i32 %y) !dbg !16 {
+entry:
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  %tmp = load i32, i32* %x.addr, align 4, !dbg !17
+  %tmp1 = load i32, i32* %y.addr, align 4, !dbg !17
+  %add = sub nsw i32 %tmp, %tmp1, !dbg !17
+  ret i32 %add, !dbg !18
+}
+
+declare i32 @printf(i8*, ...)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "calls.cc", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 3.5 "}
+!6 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !7, scopeLine: 7, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 10, scope: !9)
+!9 = !DILexicalBlockFile(scope: !10, file: !1, discriminator: 2)
+!10 = distinct !DILexicalBlock(scope: !6, file: !1, line: 10)
+!11 = !DILocation(line: 12, scope: !6)
+!12 = distinct !DISubprogram(name: "sum", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!13 = !DILocation(line: 4, scope: !12)
+!14 = !DILocation(line: 5, scope: !12)
+!15 = !DILocation(line: 6, scope: !12)
+!16 = distinct !DISubprogram(name: "sub", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!17 = !DILocation(line: 20, scope: !16)
+!18 = !DILocation(line: 21, scope: !16)
+
+; SCALE: name: "sum"
+; SCALE-NEXT: {!"function_entry_count", i64 46}
+; SCALE: !{!"branch_weights", i32 11, i32 2}
+; SCALE: !{!"branch_weights", i64 20}
+; SCALE: name: "sub"
+; SCALE-NEXT: {!"function_entry_count", i64 -1}
+
+; MERGE: name: "sum"
+; MERGE-NEXT: {!"function_entry_count", i64 46}
+; MERGE: !{!"branch_weights", i32 11, i32 23}
+; MERGE: !{!"branch_weights", i32 10}
+; MERGE: name: "sub"
+; MERGE-NEXT: {!"function_entry_count", i64 3}
\ No newline at end of file


        


More information about the llvm-commits mailing list