[llvm] Bfi precision (PR #66285)

Wed Sep 13 13:27:39 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-llvm-analysis
            
<details>
<summary>Changes</summary>
I noticed that we often have poor precision in the `BlockFrequencyInfo` results because of how `convertFloatingToInteger` in the algorithm chooses factors.

This is a request for comments for choosing more aggressive factors! Look at changes in `test/Analysis/BlockFrequencyInfo/precision.ll` to get an impression for the poor precision and improvements.

Has this been tried before? Is there history I am not aware of?

Right now this change does not work yet as it triggers overflows and some instabilities in various places that I am working through. I thought it may be interesting to get some early feedback though in case this change is miguided...
--
Full diff: https://github.com/llvm/llvm-project/pull/66285.diff

9 Files Affected:

- (modified) llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp (+9-20) 
- (modified) llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll (+10-9) 
- (added) llvm/test/Analysis/BlockFrequencyInfo/precision.ll (+43) 
- (modified) llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll (+3-3) 
- (modified) llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll (+2-3) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll (+1-1) 
- (modified) llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll (+5-5) 


<pre>

diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 82b1e3b9eede709..93661d2edeb3083 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -481,26 +481,15 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
 
 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                      const Scaled64 &Min, const Scaled64 &Max) {
-  // Scale the Factor to a size that creates integers.  Ideally, integers would
-  // be scaled so that Max == UINT64_MAX so that they can be best
-  // differentiated.  However, in the presence of large frequency values, small
-  // frequencies are scaled down to 1, making it impossible to differentiate
-  // small, unequal numbers. When the spread between Min and Max frequencies
-  // fits well within MaxBits, we make the scale be at least 8.
-  const unsigned MaxBits = 64;
-  const unsigned SpreadBits = (Max / Min).lg();
-  Scaled64 ScalingFactor;
-  if (SpreadBits <= MaxBits - 3) {
-    // If the values are small enough, make the scaling factor at least 8 to
-    // allow distinguishing small values.
-    ScalingFactor = Min.inverse();
-    ScalingFactor <<= 3;
-  } else {
-    // If the values need more than MaxBits to be represented, saturate small
-    // frequency values down to 1 by using a scaling factor that benefits large
-    // frequency values.
-    ScalingFactor = Scaled64(1, MaxBits) / Max;
-  }
+  // Scale the Factor to a size that creates integers.  If possible scale
+  // integers so that Max == UINT64_MAX so that they can be best differentiated.
+  // Is is possible that the range between min and max cannot be accurately
+  // represented in a 64bit integer without either loosing precision for small
+  // values (so small unequal numbers all map to 1) or saturaturing big numbers
+  // loosing precision for big numbers (so unequal big numbers may map to
+  // UINT64_MAX). We choose to loose precision for small numbers.
+  const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT;
+  Scaled64 ScalingFactor = Scaled64(1, MaxBits) / Max;
 
   // Translate the floats to integers.
   LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
index 41226a1cdfbaf32..fadb47bd256b772 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -59,7 +59,7 @@ declare i32 @printf(i8*, ...)
 
 ; CHECK: Printing analysis {{.*}} for function &#x27;main&#x27;:
 ; CHECK-NEXT: block-frequency-info: main
-define i32 @main() {
+define i32 @main() !prof !6 {
 entry:
   %retval = alloca i32, align 4
   %i = alloca i32, align 4
@@ -93,7 +93,7 @@ for.cond4:                                        ; preds = %for.inc, %for.body3
   %cmp5 = icmp slt i32 %2, 100
   br i1 %cmp5, label %for.body6, label %for.end, !prof !3
 
-; CHECK: - for.body6: float = 500000.5, int = 4000004
+; CHECK: - for.body6: {{.*}}, count = 1000000
 for.body6:                                        ; preds = %for.cond4
   call void @bar()
   br label %for.inc
@@ -143,7 +143,7 @@ for.cond16:                                       ; preds = %for.inc19, %for.bod
   %cmp17 = icmp slt i32 %8, 10000
   br i1 %cmp17, label %for.body18, label %for.end21, !prof !4
 
-; CHECK: - for.body18: float = 499999.9, int = 3999998
+; CHECK: - for.body18: {{.*}}, count = 1000000
 for.body18:                                       ; preds = %for.cond16
   call void @bar()
   br label %for.inc19
@@ -175,7 +175,7 @@ for.cond26:                                       ; preds = %for.inc29, %for.end
   %cmp27 = icmp slt i32 %12, 1000000
   br i1 %cmp27, label %for.body28, label %for.end31, !prof !5
 
-; CHECK: - for.body28: float = 499995.2, int = 3999961
+; CHECK: - for.body28: {{.*}}, count = 1000224
 for.body28:                                       ; preds = %for.cond26
   call void @bar()
   br label %for.inc29
@@ -197,8 +197,9 @@ for.end31:                                        ; preds = %for.cond26
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
-!1 = !{!"branch_weights", i32 101, i32 2}
-!2 = !{!"branch_weights", i32 10001, i32 101}
-!3 = !{!"branch_weights", i32 1000001, i32 10001}
-!4 = !{!"branch_weights", i32 1000001, i32 101}
-!5 = !{!"branch_weights", i32 1000001, i32 2}
+!1 = !{!"branch_weights", i32 100, i32 1}
+!2 = !{!"branch_weights", i32 10000, i32 100}
+!3 = !{!"branch_weights", i32 1000000, i32 10000}
+!4 = !{!"branch_weights", i32 1000000, i32 100}
+!5 = !{!"branch_weights", i32 1000000, i32 1}
+!6 = !{!"function_entry_count", i32 1}
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
new file mode 100644
index 000000000000000..7408d002d065d5b
--- /dev/null
+++ b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -disable-output -passes="print<block-freq>" 2>&1 | FileCheck %s
+; Sanity check precision for small-ish min/max spread.
+
+ at g = global i32 0
+
+; CHECK-LABEL: block-frequency-info: func0
+; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 400
+; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 100
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 300
+; CHECK: - join: float = 1.0, {{.*}}, count = 1000
+
+define void @func0(i32 %a0, i32 %a1) !prof !0 {
+entry:
+  %cmp0 = icmp ne i32 %a0, 0
+  br i1 %cmp0, label %cmp0_true, label %cmp0_false, !prof !1
+
+cmp0_true:
+  store volatile i32 1, ptr @g
+  %cmp1 = icmp ne i32 %a1, 0
+  br i1 %cmp1, label %cmp1_true, label %cmp1_false, !prof !2
+
+cmp0_false:
+  store volatile i32 2, ptr @g
+  br label %join
+
+cmp1_true:
+  store volatile i32 3, ptr @g
+  br label %join
+
+cmp1_false:
+  store volatile i32 4, ptr @g
+  br label %join
+
+join:
+  store volatile i32 5, ptr @g
+  ret void
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 400, i32 600}
+!2 = !{!"branch_weights", i32 1, i32 3}
diff --git a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
index 63568456d0e58c8..f1eb19dbfdbea42 100644
--- a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
+++ b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
@@ -17,8 +17,8 @@ return:             ; preds = %entry
 
 define internal i32 @dummyCaller(i1 %cond) !prof !1 {
 entry:
-%val = call i32 @inlinedFunc(i1 %cond)
-ret i32 %val
+  %val = call i32 @inlinedFunc(i1 %cond)
+  ret i32 %val
 
 ; CHECK-LABEL: @dummyCaller
 ; CHECK: call
@@ -31,4 +31,4 @@ ret i32 %val
 !2 = !{!"branch_weights", i32 5, i32 5}
 !3 = !{!"branch_weights", i32 4, i32 1}
 
-; CHECK: [[COUNT1]] = !{!"branch_weights", i32 31, i32 8}
+; CHECK: [[COUNT1]] = !{!"branch_weights", i32 858993459, i32 214748365}
diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll
index 69e84e942de65bf..64d7c229ff4e52c 100644
--- a/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll
+++ b/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll
@@ -10,14 +10,13 @@
 define void @c() {
 ; CHECK-LABEL: @c(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONST:%.*]] = bitcast i32 1232131 to i32
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i16 0, 0
 ; CHECK-NEXT:    br i1 undef, label [[LBL1_US:%.*]], label [[ENTRY_ENTRY_SPLIT_CRIT_EDGE:%.*]]
 ; CHECK:       entry.entry.split_crit_edge:
-; CHECK-NEXT:    [[CONST:%.*]] = bitcast i32 1232131 to i32
 ; CHECK-NEXT:    br label [[LBL1:%.*]]
 ; CHECK:       lbl1.us:
-; CHECK-NEXT:    [[CONST1:%.*]] = bitcast i32 1232131 to i32
-; CHECK-NEXT:    store i32 [[CONST1]], ptr @c.a, align 1
+; CHECK-NEXT:    store i32 [[CONST]], ptr @c.a, align 1
 ; CHECK-NEXT:    br label [[FOR_COND4:%.*]]
 ; CHECK:       lbl1:
 ; CHECK-NEXT:    store i32 [[CONST]], ptr @c.a, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
index b1fc96ea77ed034..4f413a50837dd69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -108,5 +108,5 @@ attributes #0 = { nounwind }
                              isOptimized: true, flags: "-O2",
                              splitDebugFilename: "abc.debug", emissionKind: 2)
 !29 = !{!"function_entry_count", i64 3}
-!30 = !{!"branch_weights", i32 99, i32 1}
+!30 = !{!"branch_weights", i32 10000, i32 1}
 !31 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index ed107b10dcd9874..4da1d099645bee2 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -198,5 +198,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 30d11a12c79c4bc..4b7b714a2562800 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -209,5 +209,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
index 19e83649723d642..105494942d383d5 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
@@ -14,8 +14,8 @@ T1:                                               ; preds = %0
   %v1 = call i32 @f1(), !prof !12
   %cond3 = icmp eq i32 %v1, 412
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1)
-;; The distribution factor -8513881372706734080 stands for 53.85%, whic is from 7/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+;; The distribution factor -9223372036854775808 stands for 53.85%, whic is from 7/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !13
 ;; Probe 7 has two copies, since they don&#x27;t share the same inline context, they are not
 ;; considered sharing samples, thus their distribution factors are not fixed up.
@@ -29,8 +29,8 @@ T1:                                               ; preds = %0
 Merge:                                            ; preds = %0
   %v2 = call i32 @f2(), !prof !12
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 3, i32 0, i64 -1)
-;; The distribution factor 8513881922462547968 stands for 46.25%, which is from 6/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64 8513881922462547968)
+;; The distribution factor  -9223372036854775808 stands for 46.25%, which is from 6/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64  -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 8513881922462547968), !dbg !13
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 7, i32 0, i64 -1)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !18
@@ -77,4 +77,4 @@ attributes #0 = { inaccessiblememonly nounwind willreturn }
 !16 = distinct !DILocation(line: 10, column: 11, scope: !17)
 !17 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646551)
 !18 = !DILocation(line: 53, column: 3, scope: !15, inlinedAt: !19)
-!19 = !DILocation(line: 12, column: 3, scope: !4)
\ No newline at end of file
+!19 = !DILocation(line: 12, column: 3, scope: !4)
</pre>
</details>


https://github.com/llvm/llvm-project/pull/66285