[llvm] 07846e3 - [CSSPGO][PriorityInliner] Do not use block weight to drive callsite inlining.

Hongtao Yu via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 1 18:43:27 PST 2022


Author: Hongtao Yu
Date: 2022-03-01T18:43:19-08:00
New Revision: 07846e3387a6aa323d28c79ff055f1ad5622386c

URL: https://github.com/llvm/llvm-project/commit/07846e3387a6aa323d28c79ff055f1ad5622386c
DIFF: https://github.com/llvm/llvm-project/commit/07846e3387a6aa323d28c79ff055f1ad5622386c.diff

LOG: [CSSPGO][PriorityInliner] Do not use block weight to drive callsite inlining.

The priority-based inliner currently uses block count combined with callee entry count to drive callsite inlining. This doesn't work well with LTO, where postlink inlining is driven by prelink-annotated block count, which could be based on the merge of all context profiles. I'm fixing it by using only the callee profile entry count, which should be context-sensitive.

I'm seeing a 0.2% perf improvement for one of our internal large benchmarks with probe-based non-CS profile.

Reviewed By: wenlei

Differential Revision: https://reviews.llvm.org/D120784

Added: 
    

Modified: 
    llvm/lib/Transforms/IPO/SampleProfile.cpp
    llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof
    llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof
    llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll
    llvm/test/Transforms/SampleProfile/csspgo-inline.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 71938968cab22..f34928c8a954f 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1305,14 +1305,8 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
   if (Optional<PseudoProbe> Probe = extractProbe(*CB))
     Factor = Probe->Factor;
 
-  uint64_t CallsiteCount = 0;
-  ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent());
-  if (Weight)
-    CallsiteCount = Weight.get();
-  if (CalleeSamples)
-    CallsiteCount = std::max(
-        CallsiteCount, uint64_t(CalleeSamples->getEntrySamples() * Factor));
-
+  uint64_t CallsiteCount =
+      CalleeSamples ? CalleeSamples->getEntrySamples() * Factor : 0;
   *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
   return true;
 }

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof
index 166d83089075a..d3c0a24fae5b8 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof
@@ -1,9 +1,9 @@
-[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
+[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:287864
  0: 6
  1: 6
  3: 287884
  15: 23
-[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
+[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:287864
  0: 15
  1: 15
  3: 74946
@@ -19,7 +19,7 @@
  3.1: 10 _Z5funcBi:11
 [main:3.1 @ _Z5funcBi]:120:19
  0: 19
- 1: 19 _Z8funcLeafi:20
+ 1: 287864 _Z8funcLeafi:287864
  3: 12
 [externalA:17 @ _Z5funcBi]:120:3
  0: 3
@@ -29,7 +29,7 @@
  1: 10
 [main:3 @ _Z5funcAi]:99:11
  0: 10
- 1: 10 _Z8funcLeafi:11
+ 1: 287864 _Z8funcLeafi:287864
  2: 287864 _Z3fibi:315608
  3: 24
 [main:3 @ _Z5funcAi:2 @ _Z3fibi]:287864:315608
@@ -39,5 +39,4 @@
 [main:3 @ _Z5funcAi:1 @ _Z8funcLeafi:1 @ _Z5funcBi]:1467299:6
  0: 6
  1: 6
- 3: 287884
- 15: 23
\ No newline at end of file
+ 3: 6

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof
index f941b5053ee68..e4adabf3feee7 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof
@@ -1,4 +1,4 @@
-[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
+[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:287864
  0: 6
  1: 6
  3: 287884
@@ -29,10 +29,10 @@
  1: 10
 [main:3 @ _Z5funcAi]:99:11
  0: 10
- 1: 10 _Z8funcLeafi:11
+ 1: 287864 _Z8funcLeafi:287864
  2: 287864 _Z3fibi:315608
  3: 24
 [main:3 @ _Z5funcAi:2 @ _Z3fibi]:287864:315608
  0: 362839
  1: 6
- 3: 287884
\ No newline at end of file
+ 3: 287884

diff  --git a/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll
index 01a39e2123a7d..9310b09a15336 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll
@@ -57,9 +57,9 @@ attributes #0 = {"use-sample-profile"}
 !11 = distinct !DISubprogram(name: "zoo", linkageName: "_Z3zoov", scope: !1, file: !1, line: 24, unit: !0)
 
 
-; ICP-ALL: remark: test.cc:5:0: '_Z3bazv' inlined into 'test'
-; ICP-ALL-NEXT: remark: test.cc:4:0: '_Z3foov' inlined into 'test'
+; ICP-ALL:      remark: test.cc:4:0: '_Z3foov' inlined into 'test'
 ; ICP-ALL-NEXT: remark: test.cc:4:0: '_Z3barv' inlined into 'test'
+; ICP-ALL-NEXT: remark: test.cc:5:0: '_Z3bazv' inlined into 'test'
 ; ICP-ALL-NOT: remark
 
 ; ICP-HOT: remark: test.cc:4:0: '_Z3foov' inlined into 'test'

diff  --git a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
index 3f2829297fb2c..1894ec6e3ce99 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
@@ -18,14 +18,14 @@
 ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t.prof -sample-profile-prioritized-inline -sample-profile-inline-size -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW
 ;
 ; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning hot cutoff can get us the same inlining
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999990 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
 ;
 ; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning cold sample profile inline threshold can get us the same inlining
 ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
 ;
 ; With new FDO early inliner and tuned cutoff, we can control inlining through size growth tuning knob.
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=0 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --allow-empty --check-prefix=INLINE-NEW-LIMIT1
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=10 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW-LIMIT2
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999990 -sample-profile-inline-limit-min=0 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --allow-empty --check-prefix=INLINE-NEW-LIMIT1
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999990 -sample-profile-inline-limit-min=10 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW-LIMIT2
 
 
 ; INLINE-BASE: remark: merged.cpp:14:10: '_Z5funcAi' inlined into 'main' to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10
@@ -38,6 +38,7 @@
 ; INLINE-NEW-LIMIT1-NOT: remark
 
 ; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: '_Z8funcLeafi' inlined into '_Z5funcBi' to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11
+; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: '_Z8funcLeafi' inlined into '_Z5funcAi' to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11;
 ; INLINE-NEW-LIMIT2-NOT: remark
 
 @factor = dso_local global i32 3, align 4, !dbg !0


        


More information about the llvm-commits mailing list