[llvm] Bfi precision (PR #66285)

Wed Sep 13 13:26:29 PDT 2023

https://github.com/MatzeB created https://github.com/llvm/llvm-project/pull/66285:

I noticed that we often have poor precision in the `BlockFrequencyInfo` results because of how `convertFloatingToInteger` in the algorithm chooses factors.

This is a request for comments for choosing more aggressive factors! Look at changes in `test/Analysis/BlockFrequencyInfo/precision.ll` to get an impression for the poor precision and improvements.

Has this been tried before? Is there history I am not aware of?

Right now this change does not work yet as it triggers overflows and some instabilities in various places that I am working through. I thought it may be interesting to get some early feedback though in case this change is miguided...

>From 0a2f5723c96ea520ca062fa698a1ad8bc98e4927 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Sep 2023 15:02:56 -0700
Subject: [PATCH 1/2] Add test for BlockFrequencyInfo precision for small
 min/max spreads

---
 .../Analysis/BlockFrequencyInfo/precision.ll  | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 llvm/test/Analysis/BlockFrequencyInfo/precision.ll

diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
new file mode 100644
index 000000000000000..4001fe991d6d9e8
--- /dev/null
+++ b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -disable-output -passes="print<block-freq>" 2>&1 | FileCheck %s
+; Sanity check precision for small-ish min/max spread.
+
+ at g = global i32 0
+
+; CHECK-LABEL: block-frequency-info: func0
+; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 388
+; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 88
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 288
+; CHECK: - join: float = 1.0, {{.*}}, count = 1000
+
+define void @func0(i32 %a0, i32 %a1) !prof !0 {
+entry:
+  %cmp0 = icmp ne i32 %a0, 0
+  br i1 %cmp0, label %cmp0_true, label %cmp0_false, !prof !1
+
+cmp0_true:
+  store volatile i32 1, ptr @g
+  %cmp1 = icmp ne i32 %a1, 0
+  br i1 %cmp1, label %cmp1_true, label %cmp1_false, !prof !2
+
+cmp0_false:
+  store volatile i32 2, ptr @g
+  br label %join
+
+cmp1_true:
+  store volatile i32 3, ptr @g
+  br label %join
+
+cmp1_false:
+  store volatile i32 4, ptr @g
+  br label %join
+
+join:
+  store volatile i32 5, ptr @g
+  ret void
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 400, i32 600}
+!2 = !{!"branch_weights", i32 1, i32 3}

>From 9a8d1ef94f9be8638eba213f14f2b51dd8f1c14c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Sep 2023 14:45:09 -0700
Subject: [PATCH 2/2] BlockFrequencyInfoImpl: Increase precision for small
 min/max spreads

BlockFrequencyInfo stores its result as integer values expressing
frequencies within a function relative to the frequency of the entry
block. This also means that the precision for frequencies < 1.0
depends on the choice for the entry block.

Before this change we would often end up with unnecessarily small
choices resuling in unnecessarily poor precision. This simplifies the
algorithm to use the full 64bits available as much as possible.
---
 llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp  | 29 ++++++-------------
 .../loops_with_profile_info.ll                | 19 ++++++------
 .../Analysis/BlockFrequencyInfo/precision.ll  |  6 ++--
 .../CodeExtractor/MultipleExitBranchProb.ll   |  6 ++--
 .../AArch64/consthoist-unreachable.ll         |  5 ++--
 .../X86/no_fpmath_with_hotness.ll             |  2 +-
 .../LoopVectorize/diag-with-hotness-info-2.ll |  2 +-
 .../LoopVectorize/diag-with-hotness-info.ll   |  2 +-
 .../SampleProfile/pseudo-probe-update-2.ll    | 10 +++----
 9 files changed, 35 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 82b1e3b9eede709..93661d2edeb3083 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -481,26 +481,15 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
 
 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                      const Scaled64 &Min, const Scaled64 &Max) {
-  // Scale the Factor to a size that creates integers.  Ideally, integers would
-  // be scaled so that Max == UINT64_MAX so that they can be best
-  // differentiated.  However, in the presence of large frequency values, small
-  // frequencies are scaled down to 1, making it impossible to differentiate
-  // small, unequal numbers. When the spread between Min and Max frequencies
-  // fits well within MaxBits, we make the scale be at least 8.
-  const unsigned MaxBits = 64;
-  const unsigned SpreadBits = (Max / Min).lg();
-  Scaled64 ScalingFactor;
-  if (SpreadBits <= MaxBits - 3) {
-    // If the values are small enough, make the scaling factor at least 8 to
-    // allow distinguishing small values.
-    ScalingFactor = Min.inverse();
-    ScalingFactor <<= 3;
-  } else {
-    // If the values need more than MaxBits to be represented, saturate small
-    // frequency values down to 1 by using a scaling factor that benefits large
-    // frequency values.
-    ScalingFactor = Scaled64(1, MaxBits) / Max;
-  }
+  // Scale the Factor to a size that creates integers.  If possible scale
+  // integers so that Max == UINT64_MAX so that they can be best differentiated.
+  // Is is possible that the range between min and max cannot be accurately
+  // represented in a 64bit integer without either loosing precision for small
+  // values (so small unequal numbers all map to 1) or saturaturing big numbers
+  // loosing precision for big numbers (so unequal big numbers may map to
+  // UINT64_MAX). We choose to loose precision for small numbers.
+  const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT;
+  Scaled64 ScalingFactor = Scaled64(1, MaxBits) / Max;
 
   // Translate the floats to integers.
   LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
index 41226a1cdfbaf32..fadb47bd256b772 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -59,7 +59,7 @@ declare i32 @printf(i8*, ...)
 
 ; CHECK: Printing analysis {{.*}} for function 'main':
 ; CHECK-NEXT: block-frequency-info: main
-define i32 @main() {
+define i32 @main() !prof !6 {
 entry:
   %retval = alloca i32, align 4
   %i = alloca i32, align 4
@@ -93,7 +93,7 @@ for.cond4:                                        ; preds = %for.inc, %for.body3
   %cmp5 = icmp slt i32 %2, 100
   br i1 %cmp5, label %for.body6, label %for.end, !prof !3
 
-; CHECK: - for.body6: float = 500000.5, int = 4000004
+; CHECK: - for.body6: {{.*}}, count = 1000000
 for.body6:                                        ; preds = %for.cond4
   call void @bar()
   br label %for.inc
@@ -143,7 +143,7 @@ for.cond16:                                       ; preds = %for.inc19, %for.bod
   %cmp17 = icmp slt i32 %8, 10000
   br i1 %cmp17, label %for.body18, label %for.end21, !prof !4
 
-; CHECK: - for.body18: float = 499999.9, int = 3999998
+; CHECK: - for.body18: {{.*}}, count = 1000000
 for.body18:                                       ; preds = %for.cond16
   call void @bar()
   br label %for.inc19
@@ -175,7 +175,7 @@ for.cond26:                                       ; preds = %for.inc29, %for.end
   %cmp27 = icmp slt i32 %12, 1000000
   br i1 %cmp27, label %for.body28, label %for.end31, !prof !5
 
-; CHECK: - for.body28: float = 499995.2, int = 3999961
+; CHECK: - for.body28: {{.*}}, count = 1000224
 for.body28:                                       ; preds = %for.cond26
   call void @bar()
   br label %for.inc29
@@ -197,8 +197,9 @@ for.end31:                                        ; preds = %for.cond26
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
-!1 = !{!"branch_weights", i32 101, i32 2}
-!2 = !{!"branch_weights", i32 10001, i32 101}
-!3 = !{!"branch_weights", i32 1000001, i32 10001}
-!4 = !{!"branch_weights", i32 1000001, i32 101}
-!5 = !{!"branch_weights", i32 1000001, i32 2}
+!1 = !{!"branch_weights", i32 100, i32 1}
+!2 = !{!"branch_weights", i32 10000, i32 100}
+!3 = !{!"branch_weights", i32 1000000, i32 10000}
+!4 = !{!"branch_weights", i32 1000000, i32 100}
+!5 = !{!"branch_weights", i32 1000000, i32 1}
+!6 = !{!"function_entry_count", i32 1}
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
index 4001fe991d6d9e8..7408d002d065d5b 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -5,10 +5,10 @@
 
 ; CHECK-LABEL: block-frequency-info: func0
 ; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
-; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 388
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 400
 ; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
-; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 88
-; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 288
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 100
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 300
 ; CHECK: - join: float = 1.0, {{.*}}, count = 1000
 
 define void @func0(i32 %a0, i32 %a1) !prof !0 {
diff --git a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
index 63568456d0e58c8..f1eb19dbfdbea42 100644
--- a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
+++ b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
@@ -17,8 +17,8 @@ return:             ; preds = %entry
 
 define internal i32 @dummyCaller(i1 %cond) !prof !1 {
 entry:
-%val = call i32 @inlinedFunc(i1 %cond)
-ret i32 %val
+  %val = call i32 @inlinedFunc(i1 %cond)
+  ret i32 %val
 
 ; CHECK-LABEL: @dummyCaller
 ; CHECK: call
@@ -31,4 +31,4 @@ ret i32 %val
 !2 = !{!"branch_weights", i32 5, i32 5}
 !3 = !{!"branch_weights", i32 4, i32 1}
 
-; CHECK: [[COUNT1]] = !{!"branch_weights", i32 31, i32 8}
+; CHECK: [[COUNT1]] = !{!"branch_weights", i32 858993459, i32 214748365}
diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll
index 69e84e942de65bf..64d7c229ff4e52c 100644
--- a/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll
+++ b/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll
@@ -10,14 +10,13 @@
 define void @c() {
 ; CHECK-LABEL: @c(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONST:%.*]] = bitcast i32 1232131 to i32
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i16 0, 0
 ; CHECK-NEXT:    br i1 undef, label [[LBL1_US:%.*]], label [[ENTRY_ENTRY_SPLIT_CRIT_EDGE:%.*]]
 ; CHECK:       entry.entry.split_crit_edge:
-; CHECK-NEXT:    [[CONST:%.*]] = bitcast i32 1232131 to i32
 ; CHECK-NEXT:    br label [[LBL1:%.*]]
 ; CHECK:       lbl1.us:
-; CHECK-NEXT:    [[CONST1:%.*]] = bitcast i32 1232131 to i32
-; CHECK-NEXT:    store i32 [[CONST1]], ptr @c.a, align 1
+; CHECK-NEXT:    store i32 [[CONST]], ptr @c.a, align 1
 ; CHECK-NEXT:    br label [[FOR_COND4:%.*]]
 ; CHECK:       lbl1:
 ; CHECK-NEXT:    store i32 [[CONST]], ptr @c.a, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
index b1fc96ea77ed034..4f413a50837dd69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -108,5 +108,5 @@ attributes #0 = { nounwind }
                              isOptimized: true, flags: "-O2",
                              splitDebugFilename: "abc.debug", emissionKind: 2)
 !29 = !{!"function_entry_count", i64 3}
-!30 = !{!"branch_weights", i32 99, i32 1}
+!30 = !{!"branch_weights", i32 10000, i32 1}
 !31 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index ed107b10dcd9874..4da1d099645bee2 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -198,5 +198,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 30d11a12c79c4bc..4b7b714a2562800 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -209,5 +209,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
index 19e83649723d642..105494942d383d5 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
@@ -14,8 +14,8 @@ T1:                                               ; preds = %0
   %v1 = call i32 @f1(), !prof !12
   %cond3 = icmp eq i32 %v1, 412
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1)
-;; The distribution factor -8513881372706734080 stands for 53.85%, whic is from 7/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+;; The distribution factor -9223372036854775808 stands for 53.85%, whic is from 7/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !13
 ;; Probe 7 has two copies, since they don't share the same inline context, they are not
 ;; considered sharing samples, thus their distribution factors are not fixed up.
@@ -29,8 +29,8 @@ T1:                                               ; preds = %0
 Merge:                                            ; preds = %0
   %v2 = call i32 @f2(), !prof !12
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 3, i32 0, i64 -1)
-;; The distribution factor 8513881922462547968 stands for 46.25%, which is from 6/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64 8513881922462547968)
+;; The distribution factor  -9223372036854775808 stands for 46.25%, which is from 6/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64  -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 8513881922462547968), !dbg !13
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 7, i32 0, i64 -1)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !18
@@ -77,4 +77,4 @@ attributes #0 = { inaccessiblememonly nounwind willreturn }
 !16 = distinct !DILocation(line: 10, column: 11, scope: !17)
 !17 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646551)
 !18 = !DILocation(line: 53, column: 3, scope: !15, inlinedAt: !19)
-!19 = !DILocation(line: 12, column: 3, scope: !4)
\ No newline at end of file
+!19 = !DILocation(line: 12, column: 3, scope: !4)