[llvm] 6d5132b - [CSSPGO] Fix incorrect probe distribution factor computation in top-down inliner

Wenlei He via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 7 08:49:19 PDT 2021


Author: wlei
Date: 2021-04-07T08:48:59-07:00
New Revision: 6d5132b426fbe23c5bc3e591ca2ca09a315d459b

URL: https://github.com/llvm/llvm-project/commit/6d5132b426fbe23c5bc3e591ca2ca09a315d459b
DIFF: https://github.com/llvm/llvm-project/commit/6d5132b426fbe23c5bc3e591ca2ca09a315d459b.diff

LOG: [CSSPGO] Fix incorrect probe distribution factor computation in top-down inliner

We saw a regression related to a low probe factor (0.01), which prevents some callsites from being promoted in ICPPass and later causes missed inlining in the CGSCC inliner. The root cause is a redundant (second) multiplication by the probe factor; this change fixes it.

`Sum` is already multiplied by the factor right after findCallSamples, but later, when it is used as the parameter to setProbeDistributionFactor, it is multiplied by the factor once again.

This change recovers ~2% performance on the mcf benchmark. In mcf, the corresponding factor was previously 1; a recent feature introduced factors < 1, which triggered this bug.

Reviewed By: hoy, wenlei

Differential Revision: https://reviews.llvm.org/D99787

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-icp-factor.prof
    llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll

Modified: 
    llvm/lib/Transforms/IPO/SampleProfile.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 627b9e3d98906..8894cdc38a3fd 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -872,17 +872,15 @@ bool SampleProfileLoader::tryPromoteAndInlineCandidate(
       // profile will be used to prorate callsites from the callee if
       // inlined. Once not inlined, the direct callsite distribution should
       // be prorated so that the it will reflect the real callsite counts.
-      setProbeDistributionFactor(CI, Candidate.CallsiteDistribution * Sum /
-                                         SumOrigin);
+      setProbeDistributionFactor(CI, static_cast<float>(Sum) / SumOrigin);
       Candidate.CallInstr = DI;
       if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
         bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
         if (!Inlined) {
           // Prorate the direct callsite distribution so that it reflects real
           // callsite counts.
-          setProbeDistributionFactor(*DI, Candidate.CallsiteDistribution *
-                                              Candidate.CallsiteCount /
-                                              SumOrigin);
+          setProbeDistributionFactor(
+              *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
         }
         return Inlined;
       }

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-icp-factor.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-icp-factor.prof
new file mode 100644
index 0000000000000..2274abf83b787
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-icp-factor.prof
@@ -0,0 +1,18 @@
+[main]:1059900:0
+ 2: 224540
+ 4: 224540
+ 7: 193140
+ 8: 224540
+ 10: 193140 foo:193140
+ !CFGChecksum: 563125815542069
+[main:10 @ foo]:386280:193140
+ 1: 193140
+ 2: 193140 baz:171881 bar:11259
+ !CFGChecksum: 281479271677951
+[main:10 @ foo:2 @ baz]:171881:171881
+ 1: 101881
+ 2: 12344
+ !CFGChecksum: 4294967295
+[main:10 @ foo:2 @ bar]:11259:11259
+ 1: 11259
+ !CFGChecksum: 4294967295

diff  --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
new file mode 100644
index 0000000000000..435b19f23c0c0
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-icp-factor.ll
@@ -0,0 +1,252 @@
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-icp-factor.prof -S -sample-profile-prioritized-inline=1 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at .str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @bar(i32 %0) #0 !dbg !13 {
+  %2 = alloca i32, align 4
+  store i32 %0, i32* %2, align 4, !tbaa !19
+  call void @llvm.dbg.declare(metadata i32* %2, metadata !18, metadata !DIExpression()), !dbg !23
+  call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 1, i32 0, i64 -1), !dbg !24
+  %3 = load i32, i32* %2, align 4, !dbg !24, !tbaa !19
+  %4 = add nsw i32 %3, 1, !dbg !24
+  store i32 %4, i32* %2, align 4, !dbg !24, !tbaa !19
+  %5 = load i32, i32* %2, align 4, !dbg !25, !tbaa !19
+  %6 = add nsw i32 %5, 1, !dbg !26
+  ret i32 %6, !dbg !27
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @baz(i32 %0) #0 !dbg !28 {
+  %2 = alloca i32, align 4
+  store i32 %0, i32* %2, align 4, !tbaa !19
+  call void @llvm.dbg.declare(metadata i32* %2, metadata !30, metadata !DIExpression()), !dbg !31
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 1, i32 0, i64 -1), !dbg !32
+  %3 = load i32, i32* %2, align 4, !dbg !32, !tbaa !19
+  %4 = add nsw i32 %3, 10, !dbg !33
+  ret i32 %4, !dbg !34
+}
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @foo(i32 %0, i32 (...)* %1) #0 !dbg !35 {
+  %3 = alloca i32, align 4
+  %4 = alloca i32 (...)*, align 8
+  store i32 %0, i32* %3, align 4, !tbaa !19
+  call void @llvm.dbg.declare(metadata i32* %3, metadata !42, metadata !DIExpression()), !dbg !44
+  store i32 (...)* %1, i32 (...)** %4, align 8, !tbaa !45
+  call void @llvm.dbg.declare(metadata i32 (...)** %4, metadata !43, metadata !DIExpression()), !dbg !47
+  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1), !dbg !48
+  %5 = load i32 (...)*, i32 (...)** %4, align 8, !dbg !48, !tbaa !45
+  %6 = load i32, i32* %3, align 4, !dbg !49, !tbaa !19
+  %7 = bitcast i32 (...)* %5 to i32 (i32, ...)*, !dbg !48
+  %8 = call i32 (i32, ...) %7(i32 %6), !dbg !50
+  ret i32 %8, !dbg !52
+}
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @main() #0 !dbg !53 {
+  %1 = alloca i32, align 4
+  %2 = alloca i32 (i32)*, align 8
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 0, i32* %1, align 4
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 1, i32 0, i64 -1), !dbg !62
+  %5 = bitcast i32 (i32)** %2 to i8*, !dbg !62
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %5) #5, !dbg !62
+  call void @llvm.dbg.declare(metadata i32 (i32)** %2, metadata !57, metadata !DIExpression()), !dbg !63
+  %6 = bitcast i32* %3 to i8*, !dbg !64
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #5, !dbg !64
+  call void @llvm.dbg.declare(metadata i32* %3, metadata !59, metadata !DIExpression()), !dbg !65
+  store i32 0, i32* %3, align 4, !dbg !65, !tbaa !19
+  %7 = bitcast i32* %4 to i8*, !dbg !66
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #5, !dbg !66
+  call void @llvm.dbg.declare(metadata i32* %4, metadata !60, metadata !DIExpression()), !dbg !67
+  store i32 0, i32* %4, align 4, !dbg !67, !tbaa !19
+  br label %8, !dbg !66
+
+8:                                                ; preds = %26, %0
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 2, i32 0, i64 -1), !dbg !68
+  %9 = load i32, i32* %4, align 4, !dbg !68, !tbaa !19
+  %10 = icmp slt i32 %9, 1000000000, !dbg !70
+  br i1 %10, label %13, label %11, !dbg !71
+
+11:                                               ; preds = %8
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 3, i32 0, i64 -1), !dbg !72
+  %12 = bitcast i32* %4 to i8*, !dbg !72
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #5, !dbg !72
+  br label %29
+
+13:                                               ; preds = %8
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 4, i32 0, i64 -1), !dbg !73
+  %14 = load i32, i32* %4, align 4, !dbg !73, !tbaa !19
+  %15 = srem i32 %14, 100, !dbg !76
+  %16 = icmp eq i32 %15, 0, !dbg !77
+  br i1 %16, label %17, label %18, !dbg !78
+
+17:                                               ; preds = %13
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 5, i32 0, i64 -1), !dbg !79
+  store i32 (i32)* @bar, i32 (i32)** %2, align 8, !dbg !79, !tbaa !45
+  br label %19, !dbg !80
+
+18:                                               ; preds = %13
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 6, i32 0, i64 -1), !dbg !81
+  store i32 (i32)* @baz, i32 (i32)** %2, align 8, !dbg !81, !tbaa !45
+  br label %19
+
+19:                                               ; preds = %18, %17
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 7, i32 0, i64 -1), !dbg !82
+  %20 = load i32, i32* %4, align 4, !dbg !82, !tbaa !19
+  %21 = load i32 (i32)*, i32 (i32)** %2, align 8, !dbg !83, !tbaa !45
+  %22 = bitcast i32 (i32)* %21 to i32 (...)*, !dbg !83
+  %23 = call i32 @foo(i32 %20, i32 (...)* %22), !dbg !84
+  %24 = load i32, i32* %3, align 4, !dbg !86, !tbaa !19
+  %25 = add nsw i32 %24, %23, !dbg !86
+  store i32 %25, i32* %3, align 4, !dbg !86, !tbaa !19
+  br label %26, !dbg !87
+
+26:                                               ; preds = %19
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 8, i32 0, i64 -1), !dbg !88
+  %27 = load i32, i32* %4, align 4, !dbg !88, !tbaa !19
+  %28 = add nsw i32 %27, 1, !dbg !88
+  store i32 %28, i32* %4, align 4, !dbg !88, !tbaa !19
+  br label %8, !dbg !72, !llvm.loop !89
+
+29:                                               ; preds = %11
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 9, i32 0, i64 -1), !dbg !92
+  %30 = load i32, i32* %3, align 4, !dbg !92, !tbaa !19
+  %31 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %30), !dbg !93
+  %32 = bitcast i32* %3 to i8*, !dbg !95
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #5, !dbg !95
+  %33 = bitcast i32 (i32)** %2 to i8*, !dbg !95
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %33) #5, !dbg !95
+  ret i32 0, !dbg !96
+}
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+declare dso_local i32 @printf(i8*, ...) #3
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="true" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+!llvm.pseudo_probe_desc = !{!9, !10, !11, !12}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.06)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 1, !"ThinLTO", i32 0}
+!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!8 = !{!"clang version 13.0.0 "}
+!9 = !{i64 -2012135647395072713, i64 4294967295, !"bar", null}
+!10 = !{i64 7546896869197086323, i64 4294967295, !"baz", null}
+!11 = !{i64 6699318081062747564, i64 281479271677951, !"foo", null}
+!12 = !{i64 -2624081020897602054, i64 563125815542069, !"main", null}
+!13 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !14, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!14 = !DISubroutineType(types: !15)
+!15 = !{!16, !16}
+!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!17 = !{!18}
+!18 = !DILocalVariable(name: "x", arg: 1, scope: !13, file: !1, line: 2, type: !16)
+!19 = !{!20, !20, i64 0}
+!20 = !{!"int", !21, i64 0}
+!21 = !{!"omnipotent char", !22, i64 0}
+!22 = !{!"Simple C/C++ TBAA"}
+!23 = !DILocation(line: 2, column: 13, scope: !13)
+!24 = !DILocation(line: 4, column: 7, scope: !13)
+!25 = !DILocation(line: 5, column: 12, scope: !13)
+!26 = !DILocation(line: 5, column: 14, scope: !13)
+!27 = !DILocation(line: 5, column: 5, scope: !13)
+!28 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 9, type: !14, scopeLine: 9, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !29)
+!29 = !{!30}
+!30 = !DILocalVariable(name: "x", arg: 1, scope: !28, file: !1, line: 9, type: !16)
+!31 = !DILocation(line: 9, column: 13, scope: !28)
+!32 = !DILocation(line: 10, column: 10, scope: !28)
+!33 = !DILocation(line: 10, column: 12, scope: !28)
+!34 = !DILocation(line: 10, column: 3, scope: !28)
+!35 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 13, type: !36, scopeLine: 13, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!36 = !DISubroutineType(types: !37)
+!37 = !{!16, !16, !38}
+!38 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !39, size: 64)
+!39 = !DISubroutineType(types: !40)
+!40 = !{!16, null}
+!41 = !{!42, !43}
+!42 = !DILocalVariable(name: "x", arg: 1, scope: !35, file: !1, line: 13, type: !16)
+!43 = !DILocalVariable(name: "f", arg: 2, scope: !35, file: !1, line: 13, type: !38)
+!44 = !DILocation(line: 13, column: 13, scope: !35)
+!45 = !{!46, !46, i64 0}
+!46 = !{!"any pointer", !21, i64 0}
+!47 = !DILocation(line: 13, column: 22, scope: !35)
+!48 = !DILocation(line: 14, column: 10, scope: !35)
+!49 = !DILocation(line: 14, column: 12, scope: !35)
+!50 = !DILocation(line: 14, column: 10, scope: !51)
+!51 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 108527639)
+!52 = !DILocation(line: 14, column: 3, scope: !35)
+!53 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !54, scopeLine: 18, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !56)
+!54 = !DISubroutineType(types: !55)
+!55 = !{!16}
+!56 = !{!57, !59, !60}
+!57 = !DILocalVariable(name: "x", scope: !53, file: !1, line: 19, type: !58)
+!58 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+!59 = !DILocalVariable(name: "sum", scope: !53, file: !1, line: 25, type: !16)
+!60 = !DILocalVariable(name: "i", scope: !61, file: !1, line: 26, type: !16)
+!61 = distinct !DILexicalBlock(scope: !53, file: !1, line: 26, column: 5)
+!62 = !DILocation(line: 19, column: 3, scope: !53)
+!63 = !DILocation(line: 19, column: 9, scope: !53)
+!64 = !DILocation(line: 25, column: 5, scope: !53)
+!65 = !DILocation(line: 25, column: 9, scope: !53)
+!66 = !DILocation(line: 26, column: 10, scope: !61)
+!67 = !DILocation(line: 26, column: 14, scope: !61)
+!68 = !DILocation(line: 26, column: 21, scope: !69)
+!69 = distinct !DILexicalBlock(scope: !61, file: !1, line: 26, column: 5)
+!70 = !DILocation(line: 26, column: 23, scope: !69)
+!71 = !DILocation(line: 26, column: 5, scope: !61)
+!72 = !DILocation(line: 26, column: 5, scope: !69)
+!73 = !DILocation(line: 27, column: 10, scope: !74)
+!74 = distinct !DILexicalBlock(scope: !75, file: !1, line: 27, column: 10)
+!75 = distinct !DILexicalBlock(scope: !69, file: !1, line: 26, column: 45)
+!76 = !DILocation(line: 27, column: 12, scope: !74)
+!77 = !DILocation(line: 27, column: 19, scope: !74)
+!78 = !DILocation(line: 27, column: 10, scope: !75)
+!79 = !DILocation(line: 28, column: 11, scope: !74)
+!80 = !DILocation(line: 28, column: 9, scope: !74)
+!81 = !DILocation(line: 30, column: 11, scope: !74)
+!82 = !DILocation(line: 32, column: 17, scope: !75)
+!83 = !DILocation(line: 32, column: 20, scope: !75)
+!84 = !DILocation(line: 32, column: 13, scope: !85)
+!85 = !DILexicalBlockFile(scope: !75, file: !1, discriminator: 116916311)
+!86 = !DILocation(line: 32, column: 11, scope: !75)
+!87 = !DILocation(line: 33, column: 5, scope: !75)
+!88 = !DILocation(line: 26, column: 41, scope: !69)
+!89 = distinct !{!89, !71, !90, !91}
+!90 = !DILocation(line: 33, column: 5, scope: !61)
+!91 = !{!"llvm.loop.mustprogress"}
+!92 = !DILocation(line: 34, column: 21, scope: !53)
+!93 = !DILocation(line: 34, column: 5, scope: !94)
+!94 = !DILexicalBlockFile(scope: !53, file: !1, discriminator: 104333335)
+!95 = !DILocation(line: 36, column: 1, scope: !53)
+!96 = !DILocation(line: 35, column: 5, scope: !53)
+
+; CHECK: %[[#]] = call i32 (i32, ...) %30(i32 %[[#]]) #[[#]], !dbg ![[#DBGID:]], !prof ![[#]]
+
+;; A discriminator of 69206039, which is 0x4200017 in hexadecimal, stands for an indirect call probe
+;; with an index of 2 and probe factor of 0.04.
+; CHECK: ![[#DBGID]] = !DILocation(line: [[#]], column: [[#]], scope: ![[#SCOPE:]], inlinedAt: ![[#]])
+; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 69206039)


        


More information about the llvm-commits mailing list