[llvm] a45d72e - [CSSPGO] Add switch for sample loader to honor global pre-inliner decision from llvm-profgen

Wenlei He via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 25 17:25:14 PDT 2021


Author: Wenlei He
Date: 2021-08-25T17:20:15-07:00
New Revision: a45d72e0247d080eb9437736bb6cadfc27e4c065

URL: https://github.com/llvm/llvm-project/commit/a45d72e0247d080eb9437736bb6cadfc27e4c065
DIFF: https://github.com/llvm/llvm-project/commit/a45d72e0247d080eb9437736bb6cadfc27e4c065.diff

LOG: [CSSPGO] Add switch for sample loader to honor global pre-inliner decision from llvm-profgen

The change adds a switch to allow sample loader to use global pre-inliner's decision instead. The pre-inliner in llvm-profgen makes inline decision globally based on whole program profile and function byte size as cost proxy.

Since pre-inliner also adjusts/merges context profile based on its inline decision, honoring its inline decision in sample loader would lead to better post-inline profile quality especially for thinlto where cross module profile merging isn't possible without pre-inliner.

Minor fix in profile reader is also included. When pre-inliner is use, we now also turn off the default merging and trimming logic unless it's explicitly asked.

Differential Revision: https://reviews.llvm.org/D108677

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/csspgo-use-preinliner.prof
    llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll

Modified: 
    llvm/lib/ProfileData/SampleProfReader.cpp
    llvm/lib/Transforms/IPO/SampleProfile.cpp
    llvm/tools/llvm-profgen/ProfileGenerator.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index b40361b3d971..59a04e20965e 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -793,7 +793,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
   }
   assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
          "Cannot have both context-sensitive and regular profile");
-  assert(ProfileIsCS == (CSProfileCount > 0) &&
+  assert((!CSProfileCount || ProfileIsCS) &&
          "Section flag should be consistent with actual profile");
   return sampleprof_error::success;
 }

diff  --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index e3e06a21ad56..868a73534781 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -214,6 +214,11 @@ static cl::opt<bool> CallsitePrioritizedInline(
     cl::desc("Use call site prioritized inlining for sample profile loader."
              "Currently only CSSPGO is supported."));
 
+static cl::opt<bool> UsePreInlinerDecision(
+    "sample-profile-use-preinliner", cl::Hidden, cl::ZeroOrMore,
+    cl::init(false),
+    cl::desc("Use the preinliner decisions stored in profile context."));
+
 static cl::opt<std::string> ProfileInlineReplayFile(
     "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
     cl::desc(
@@ -1285,6 +1290,21 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
   if (Cost.isNever() || Cost.isAlways())
     return Cost;
 
+  // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
+  // decisions based on hotness as well as accurate function byte sizes for
+  // given context using function/inlinee sizes from previous build. It
+  // stores the decision in profile, and also adjust/merge context profile
+  // aiming at better context-sensitive post-inline profile quality, assuming
+  // all inline decision estimates are going to be honored by compiler. Here
+  // we replay that inline decision under `sample-profile-use-preinliner`.
+  if (UsePreInlinerDecision) {
+    if (Candidate.CalleeSamples->getContext().hasAttribute(
+            ContextShouldBeInlined))
+      return InlineCost::getAlways("preinliner");
+    else
+      return InlineCost::getNever("preinliner");
+  }
+
   // For old FDO inliner, we inline the call site as long as cost is not
   // "Never". The cost-benefit check is done earlier.
   if (!CallsitePrioritizedInline) {

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/csspgo-use-preinliner.prof b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-use-preinliner.prof
new file mode 100644
index 000000000000..534f2443d06e
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-use-preinliner.prof
@@ -0,0 +1,39 @@
+[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
+ 0: 6
+ 1: 6
+ 3: 287884
+ 4: 287864 _Z3fibi:315608
+ 15: 23
+ !Attributes: 3
+[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
+ 0: 15
+ 1: 15
+ 3: 74946
+ 4: 74941 _Z3fibi:82359
+ 10: 23324
+ 11: 23327 _Z3fibi:25228
+ 15: 11
+ !Attributes: 3
+[main]:154:0
+ 2: 12
+ 3: 18 _Z5funcAi:11
+ 3.1: 18 _Z5funcBi:19
+[external:12 @ main]:154:12
+ 2: 12
+ 3: 10 _Z5funcAi:7
+ 3.1: 10 _Z5funcBi:11
+[main:3.1 @ _Z5funcBi]:120:19
+ 0: 19
+ 1: 19 _Z8funcLeafi:20
+ 3: 12
+ !Attributes: 3
+[externalA:17 @ _Z5funcBi]:120:3
+ 0: 3
+ 1: 3
+[external:10 @ _Z5funcBi]:120:10
+ 0: 10
+ 1: 10
+[main:3 @ _Z5funcAi]:99:11
+ 0: 10
+ 1: 10 _Z8funcLeafi:11
+ 3: 24

diff  --git a/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll b/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
new file mode 100644
index 000000000000..180fd8a02c9b
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-use-preinliner.ll
@@ -0,0 +1,158 @@
+; Test for CSSPGO's sample loader inlining to make sure it can use llvm-profgen preinliner's decision
+
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/csspgo-use-preinliner.prof -pass-remarks=inline -sample-profile-prioritized-inline -profile-sample-accurate -sample-profile-use-preinliner=0 -S 2>&1 | FileCheck %s --check-prefix=DEFAULT
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/csspgo-use-preinliner.prof -pass-remarks=inline -sample-profile-prioritized-inline -profile-sample-accurate -sample-profile-use-preinliner=1 -S 2>&1 | FileCheck %s --check-prefix=PREINLINE
+
+
+; DEFAULT: '_Z5funcAi' inlined into 'main'
+; DEFAULT-NOT: inlined into
+
+; PREINLINE-NOT: inlined into
+; PREINLINE: '_Z8funcLeafi' inlined into '_Z5funcBi'
+; PREINLINE: '_Z8funcLeafi' inlined into '_Z5funcAi'
+; PREINLINE-NOT: inlined into
+
+ at factor = dso_local global i32 3, align 4, !dbg !0
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+  %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+  ret i32 %call, !dbg !46
+}
+
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+  ret i32 %call, !dbg !53
+}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)

diff  --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 29c4536ecbc8..79246a3ab3af 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -407,11 +407,15 @@ void CSProfileGenerator::postProcessProfiles() {
         .run();
   }
 
-  // Trim and merge cold context profile using cold threshold above;
-  SampleContextTrimmer(ProfileMap)
-      .trimAndMergeColdContextProfiles(
-          ColdCountThreshold, CSProfTrimColdContext, CSProfMergeColdContext,
-          CSProfMaxColdContextDepth);
+  // Trim and merge cold context profile using cold threshold above. By default,
+  // we skip such merging and trimming when preinliner is on.
+  if (!EnableCSPreInliner || CSProfTrimColdContext.getNumOccurrences() ||
+      CSProfMergeColdContext.getNumOccurrences()) {
+    SampleContextTrimmer(ProfileMap)
+        .trimAndMergeColdContextProfiles(
+            ColdCountThreshold, CSProfTrimColdContext, CSProfMergeColdContext,
+            CSProfMaxColdContextDepth);
+  }
 }
 
 void CSProfileGenerator::computeSummaryAndThreshold() {


        


More information about the llvm-commits mailing list