[llvm] [PseudoProbe] Extend to skip instrumenting probe into the dests of invoke (PR #79919)

Lei Wang via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 27 16:19:26 PST 2024


https://github.com/wlei-llvm updated https://github.com/llvm/llvm-project/pull/79919

>From aa85400dbde73644bd49436c187526d712e516fb Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Mon, 29 Jan 2024 15:53:35 -0800
Subject: [PATCH 1/5] [PseudoProbe] Extend to skip instrumenting probe into the
 dests of invoke

---
 .../llvm/Transforms/IPO/SampleProfileProbe.h  |   7 +-
 .../lib/Transforms/IPO/SampleProfileProbe.cpp |  49 ++++--
 .../ThinLTO/X86/pseudo-probe-desc-import.ll   |   4 +-
 .../SampleProfile/pseudo-probe-eh.ll          |   2 +-
 .../SampleProfile/pseudo-probe-invoke.ll      | 155 ++++++++++++++++++
 5 files changed, 202 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll

diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
index 0f2729a9462de2..0db7b2dcc36600 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
@@ -81,8 +81,11 @@ class SampleProfileProber {
   uint64_t getFunctionHash() const { return FunctionHash; }
   uint32_t getBlockId(const BasicBlock *BB) const;
   uint32_t getCallsiteId(const Instruction *Call) const;
-  void computeCFGHash();
-  void computeProbeIdForBlocks();
+  void findInvokeNormalDests(DenseSet<BasicBlock *> &InvokeNormalDests);
+  void computeCFGHash(const DenseSet<BasicBlock *> &InvokeNormalDests,
+                      const DenseSet<BasicBlock *> &KnownColdBlocks);
+  void computeProbeIdForBlocks(const DenseSet<BasicBlock *> &InvokeNormalDests,
+                               const DenseSet<BasicBlock *> &KnownColdBlocks);
   void computeProbeIdForCallsites();
 
   Function *F;
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index 090e5560483edb..fd3c2f223b87b8 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -173,20 +173,49 @@ SampleProfileProber::SampleProfileProber(Function &Func,
   BlockProbeIds.clear();
   CallProbeIds.clear();
   LastProbeId = (uint32_t)PseudoProbeReservedId::Last;
-  computeProbeIdForBlocks();
+
+  DenseSet<BasicBlock *> InvokeNormalDests;
+  findInvokeNormalDests(InvokeNormalDests);
+  DenseSet<BasicBlock *> KnownColdBlocks;
+  computeEHOnlyBlocks(*F, KnownColdBlocks);
+
+  computeProbeIdForBlocks(InvokeNormalDests, KnownColdBlocks);
   computeProbeIdForCallsites();
-  computeCFGHash();
+  computeCFGHash(InvokeNormalDests, KnownColdBlocks);
+}
+
+void SampleProfileProber::findInvokeNormalDests(
+    DenseSet<BasicBlock *> &InvokeNormalDests) {
+  for (auto &BB : *F) {
+    auto *TI = BB.getTerminator();
+    if (auto *II = dyn_cast<InvokeInst>(TI))
+      InvokeNormalDests.insert(II->getNormalDest());
+  }
 }
 
 // Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
 // value of each BB in the CFG. The higher 32 bits record the number of edges
 // preceded by the number of indirect calls.
 // This is derived from FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash().
-void SampleProfileProber::computeCFGHash() {
+void SampleProfileProber::computeCFGHash(
+    const DenseSet<BasicBlock *> &InvokeNormalDests,
+    const DenseSet<BasicBlock *> &KnownColdBlocks) {
   std::vector<uint8_t> Indexes;
   JamCRC JC;
   for (auto &BB : *F) {
-    for (BasicBlock *Succ : successors(&BB)) {
+    // Skip the EH flow blocks.
+    if (InvokeNormalDests.contains(&BB) || KnownColdBlocks.contains(&BB))
+      continue;
+
+    // Find the original successors by skipping the EH flow succs.
+    auto *BBPtr = &BB;
+    auto *TI = BBPtr->getTerminator();
+    while (auto *II = dyn_cast<InvokeInst>(TI)) {
+      BBPtr = II->getNormalDest();
+      TI = BBPtr->getTerminator();
+    }
+
+    for (BasicBlock *Succ : successors(BBPtr)) {
       auto Index = getBlockId(Succ);
       for (int J = 0; J < 4; J++)
         Indexes.push_back((uint8_t)(Index >> (J * 8)));
@@ -207,15 +236,15 @@ void SampleProfileProber::computeCFGHash() {
                     << ", Hash = " << FunctionHash << "\n");
 }
 
-void SampleProfileProber::computeProbeIdForBlocks() {
-  DenseSet<BasicBlock *> KnownColdBlocks;
-  computeEHOnlyBlocks(*F, KnownColdBlocks);
+void SampleProfileProber::computeProbeIdForBlocks(
+    const DenseSet<BasicBlock *> &InvokeNormalDests,
+    const DenseSet<BasicBlock *> &KnownColdBlocks) {
   // Insert pseudo probe to non-cold blocks only. This will reduce IR size as
   // well as the binary size while retaining the profile quality.
   for (auto &BB : *F) {
-    ++LastProbeId;
-    if (!KnownColdBlocks.contains(&BB))
-      BlockProbeIds[&BB] = LastProbeId;
+    if (InvokeNormalDests.contains(&BB) || KnownColdBlocks.contains(&BB))
+      continue;
+    BlockProbeIds[&BB] = ++LastProbeId;
   }
 }
 
diff --git a/llvm/test/ThinLTO/X86/pseudo-probe-desc-import.ll b/llvm/test/ThinLTO/X86/pseudo-probe-desc-import.ll
index 21dd8c0fe92414..f915aaccc06e17 100644
--- a/llvm/test/ThinLTO/X86/pseudo-probe-desc-import.ll
+++ b/llvm/test/ThinLTO/X86/pseudo-probe-desc-import.ll
@@ -12,8 +12,8 @@
 ; RUN: llvm-lto -thinlto-action=import %t3.bc -thinlto-index=%t3.index.bc -o /dev/null 2>&1 | FileCheck %s  --check-prefix=WARN
 
 
-; CHECK-NOT: {i64 6699318081062747564, i64 4294967295, !"foo"
-; CHECK: !{i64 -2624081020897602054, i64 281479271677951, !"main"
+; CHECK-NOT: {i64 6699318081062747564, i64 [[#]], !"foo"
+; CHECK: !{i64 -2624081020897602054, i64 [[#]], !"main"
 
 ; WARN: warning: Pseudo-probe ignored: source module '{{.*}}' is compiled with -fpseudo-probe-for-profiling while destination module '{{.*}}' is not
 
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-eh.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-eh.ll
index 697ef44fb7ed71..9954914bca4380 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-eh.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-eh.ll
@@ -18,7 +18,7 @@ entry:
           to label %ret unwind label %lpad
 
 ret:
-; CHECK: call void @llvm.pseudoprobe
+; CHECK-NOT: call void @llvm.pseudoprobe
   ret void
 
 lpad:                                             ; preds = %entry
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
new file mode 100644
index 00000000000000..822ab403dee297
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
@@ -0,0 +1,155 @@
+; REQUIRES: x86_64-linux
+; RUN: opt < %s -passes=pseudo-probe -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__clang_call_terminate = comdat any
+
+ at x = dso_local global i32 0, align 4, !dbg !0
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z3barv() #0 personality ptr @__gxx_personality_v0 !dbg !14 {
+entry:
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 1
+  %0 = load volatile i32, ptr @x, align 4, !dbg !17, !tbaa !19
+  %tobool = icmp ne i32 %0, 0, !dbg !17
+  br i1 %tobool, label %if.then, label %if.else, !dbg !23
+
+if.then:                                          ; preds = %entry
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 2
+  invoke void @_Z3foov()
+          to label %invoke.cont unwind label %terminate.lpad, !dbg !24
+
+invoke.cont:                                      ; preds = %if.then
+; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
+  invoke void @_Z3bazv()
+          to label %invoke.cont1 unwind label %terminate.lpad, !dbg !26
+
+invoke.cont1:                                     ; preds = %invoke.cont
+; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
+  br label %if.end, !dbg !27
+
+if.else:                                          ; preds = %entry
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 3
+  invoke void @_Z3foov()
+          to label %invoke.cont2 unwind label %terminate.lpad, !dbg !28
+
+invoke.cont2:                                     ; preds = %if.else
+; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
+  br label %if.end
+
+if.end:                                           ; preds = %invoke.cont2, %invoke.cont1
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 4
+  invoke void @_Z3foov()
+          to label %invoke.cont3 unwind label %terminate.lpad, !dbg !29
+
+invoke.cont3:                                     ; preds = %if.end
+; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
+  %1 = load volatile i32, ptr @x, align 4, !dbg !30, !tbaa !19
+  %tobool4 = icmp ne i32 %1, 0, !dbg !30
+  br i1 %tobool4, label %if.then5, label %if.end6, !dbg !32
+
+if.then5:                                         ; preds = %invoke.cont3
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 5
+  %2 = load volatile i32, ptr @x, align 4, !dbg !33, !tbaa !19
+  %inc = add nsw i32 %2, 1, !dbg !33
+  store volatile i32 %inc, ptr @x, align 4, !dbg !33, !tbaa !19
+  br label %if.end6, !dbg !35
+
+if.end6:                                          ; preds = %if.then5, %invoke.cont3
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 6
+  ret void, !dbg !36
+
+terminate.lpad:                                   ; preds = %if.end, %if.else, %invoke.cont, %if.then
+; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
+  %3 = landingpad { ptr, i32 }
+          catch ptr null, !dbg !24
+  %4 = extractvalue { ptr, i32 } %3, 0, !dbg !24
+  call void @__clang_call_terminate(ptr %4) #3, !dbg !24
+  unreachable, !dbg !24
+}
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z3foov() #0 !dbg !37 {
+entry:
+  ret void, !dbg !38
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: noinline noreturn nounwind uwtable
+define linkonce_odr hidden void @__clang_call_terminate(ptr noundef %0) #1 comdat {
+  %2 = call ptr @__cxa_begin_catch(ptr %0) #4
+  call void @_ZSt9terminatev() #3
+  unreachable
+}
+
+declare ptr @__cxa_begin_catch(ptr)
+
+declare void @_ZSt9terminatev()
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z3bazv() #0 !dbg !39 {
+entry:
+  ret void, !dbg !40
+}
+
+; CHECK: ![[#]] = !{i64 -3270123626113159616, i64 4294967295, !"_Z3bazv"}
+
+attributes #0 = { mustprogress noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { noinline noreturn nounwind uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { mustprogress noinline norecurse nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #3 = { noreturn nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "test.cpp", directory: "/home", checksumkind: CSK_MD5, checksum: "a4c7b0392f3fd9c8ebb85065159dbb02")
+!4 = !{!0}
+!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6)
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 7, !"Dwarf Version", i32 5}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{i32 8, !"PIC Level", i32 2}
+!11 = !{i32 7, !"PIE Level", i32 2}
+!12 = !{i32 7, !"uwtable", i32 2}
+!13 = !{!"clang version 19.0.0"}
+!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !3, file: !3, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null}
+!17 = !DILocation(line: 5, column: 6, scope: !18)
+!18 = distinct !DILexicalBlock(scope: !14, file: !3, line: 5, column: 6)
+!19 = !{!20, !20, i64 0}
+!20 = !{!"int", !21, i64 0}
+!21 = !{!"omnipotent char", !22, i64 0}
+!22 = !{!"Simple C++ TBAA"}
+!23 = !DILocation(line: 5, column: 6, scope: !14)
+!24 = !DILocation(line: 6, column: 5, scope: !25)
+!25 = distinct !DILexicalBlock(scope: !18, file: !3, line: 5, column: 9)
+!26 = !DILocation(line: 7, column: 5, scope: !25)
+!27 = !DILocation(line: 8, column: 3, scope: !25)
+!28 = !DILocation(line: 9, column: 5, scope: !18)
+!29 = !DILocation(line: 11, column: 3, scope: !14)
+!30 = !DILocation(line: 12, column: 6, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !14, file: !3, line: 12, column: 6)
+!32 = !DILocation(line: 12, column: 6, scope: !14)
+!33 = !DILocation(line: 13, column: 5, scope: !34)
+!34 = distinct !DILexicalBlock(scope: !31, file: !3, line: 12, column: 9)
+!35 = !DILocation(line: 14, column: 5, scope: !34)
+!36 = !DILocation(line: 17, column: 1, scope: !14)
+!37 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !3, file: !3, line: 19, type: !15, scopeLine: 19, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!38 = !DILocation(line: 19, column: 13, scope: !37)
+!39 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !3, file: !3, line: 18, type: !15, scopeLine: 18, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!40 = !DILocation(line: 18, column: 13, scope: !39)
+!41 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 22, type: !42, scopeLine: 22, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!42 = !DISubroutineType(types: !43)
+!43 = !{!6}
+!44 = !DILocation(line: 23, column: 3, scope: !41)
+!45 = !DILocation(line: 24, column: 1, scope: !41)

>From 5178af233c10eebd56f1e6aa1c78cd3328c2661f Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Tue, 30 Jan 2024 00:15:10 -0800
Subject: [PATCH 2/5] Use single BlocksToIgnore and update comments

---
 llvm/include/llvm/Analysis/EHUtils.h          |  1 +
 .../llvm/Transforms/IPO/SampleProfileProbe.h  |  6 +--
 .../lib/Transforms/IPO/SampleProfileProbe.cpp | 46 +++++++++++--------
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/Analysis/EHUtils.h b/llvm/include/llvm/Analysis/EHUtils.h
index f2ff6cbd2e9036..9cf2f9244e84fb 100644
--- a/llvm/include/llvm/Analysis/EHUtils.h
+++ b/llvm/include/llvm/Analysis/EHUtils.h
@@ -16,6 +16,7 @@ namespace llvm {
 /// Compute a list of blocks that are only reachable via EH paths.
 template <typename FunctionT, typename BlockT>
 static void computeEHOnlyBlocks(FunctionT &F, DenseSet<BlockT *> &EHBlocks) {
+  assert(EHBlocks.empty() && "Output set should be empty");
   // A block can be unknown if its not reachable from anywhere
   // EH if its only reachable from start blocks via some path through EH pads
   // NonEH if it's reachable from Non EH blocks as well.
diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
index 0db7b2dcc36600..1f7acdc6276aa4 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
@@ -82,10 +82,8 @@ class SampleProfileProber {
   uint32_t getBlockId(const BasicBlock *BB) const;
   uint32_t getCallsiteId(const Instruction *Call) const;
   void findInvokeNormalDests(DenseSet<BasicBlock *> &InvokeNormalDests);
-  void computeCFGHash(const DenseSet<BasicBlock *> &InvokeNormalDests,
-                      const DenseSet<BasicBlock *> &KnownColdBlocks);
-  void computeProbeIdForBlocks(const DenseSet<BasicBlock *> &InvokeNormalDests,
-                               const DenseSet<BasicBlock *> &KnownColdBlocks);
+  void computeCFGHash(const DenseSet<BasicBlock *> &BlocksToIgnore);
+  void computeProbeIdForBlocks(const DenseSet<BasicBlock *> &BlocksToIgnore);
   void computeProbeIdForCallsites();
 
   Function *F;
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index fd3c2f223b87b8..bd823a0f2d2951 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -174,14 +174,23 @@ SampleProfileProber::SampleProfileProber(Function &Func,
   CallProbeIds.clear();
   LastProbeId = (uint32_t)PseudoProbeReservedId::Last;
 
-  DenseSet<BasicBlock *> InvokeNormalDests;
-  findInvokeNormalDests(InvokeNormalDests);
-  DenseSet<BasicBlock *> KnownColdBlocks;
-  computeEHOnlyBlocks(*F, KnownColdBlocks);
-
-  computeProbeIdForBlocks(InvokeNormalDests, KnownColdBlocks);
+  DenseSet<BasicBlock *> BlocksToIgnore;
+  // Ignore the cold EH blocks. This will reduce IR size as
+  // well as the binary size while retaining the profile quality.
+  computeEHOnlyBlocks(*F, BlocksToIgnore);
+  // While optimizing nounwind attribute, the frondend may generate unstable IR,
+  // e.g. some versions are optimized with the call-to-invoke conversion, while
+  // other versions do not. This discrepancy in probe ID could cause profile
+  // mismatching issues. To make the probe ID consistent, we can ignore all the
+  // EH flows. Specifically, we can ignore the normal dest block which
+  // originating from the same block as the call/invoke block and the unwind
+  // dest block(computed in computeEHOnlyBlocks), which is a cold block. It
+  // doesn't affect the profile quality.
+  findInvokeNormalDests(BlocksToIgnore);
+
+  computeProbeIdForBlocks(BlocksToIgnore);
   computeProbeIdForCallsites();
-  computeCFGHash(InvokeNormalDests, KnownColdBlocks);
+  computeCFGHash(BlocksToIgnore);
 }
 
 void SampleProfileProber::findInvokeNormalDests(
@@ -198,16 +207,18 @@ void SampleProfileProber::findInvokeNormalDests(
 // preceded by the number of indirect calls.
 // This is derived from FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash().
 void SampleProfileProber::computeCFGHash(
-    const DenseSet<BasicBlock *> &InvokeNormalDests,
-    const DenseSet<BasicBlock *> &KnownColdBlocks) {
+    const DenseSet<BasicBlock *> &BlocksToIgnore) {
   std::vector<uint8_t> Indexes;
   JamCRC JC;
   for (auto &BB : *F) {
-    // Skip the EH flow blocks.
-    if (InvokeNormalDests.contains(&BB) || KnownColdBlocks.contains(&BB))
+    if (BlocksToIgnore.contains(&BB))
       continue;
-
-    // Find the original successors by skipping the EH flow succs.
+    // To keep the CFG Hash consistent before and after the call-to-invoke
+    // conversion, we need to compute the hash using the original call BB's
+    // successors for the invoke BB. As the current invoke BB's
+    // successors(normal dest and unwind dest) are ignored, we keep searching to
+    // find the leaf normal dest, the leaf's successors are the original call's
+    // successors.
     auto *BBPtr = &BB;
     auto *TI = BBPtr->getTerminator();
     while (auto *II = dyn_cast<InvokeInst>(TI)) {
@@ -217,6 +228,8 @@ void SampleProfileProber::computeCFGHash(
 
     for (BasicBlock *Succ : successors(BBPtr)) {
       auto Index = getBlockId(Succ);
+      assert(Index &&
+             "Ignored block(zero ID) should not be used for hash computation");
       for (int J = 0; J < 4; J++)
         Indexes.push_back((uint8_t)(Index >> (J * 8)));
     }
@@ -237,12 +250,9 @@ void SampleProfileProber::computeCFGHash(
 }
 
 void SampleProfileProber::computeProbeIdForBlocks(
-    const DenseSet<BasicBlock *> &InvokeNormalDests,
-    const DenseSet<BasicBlock *> &KnownColdBlocks) {
-  // Insert pseudo probe to non-cold blocks only. This will reduce IR size as
-  // well as the binary size while retaining the profile quality.
+    const DenseSet<BasicBlock *> &BlocksToIgnore) {
   for (auto &BB : *F) {
-    if (InvokeNormalDests.contains(&BB) || KnownColdBlocks.contains(&BB))
+    if (BlocksToIgnore.contains(&BB))
       continue;
     BlockProbeIds[&BB] = ++LastProbeId;
   }

>From 95e2c8738f9e665df55458e4396ec2d4dc547bb8 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Tue, 30 Jan 2024 14:24:28 -0800
Subject: [PATCH 3/5] Update assertion message and remove EHBlocks.clear()

---
 llvm/include/llvm/Analysis/EHUtils.h           | 2 --
 llvm/lib/Transforms/IPO/SampleProfileProbe.cpp | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Analysis/EHUtils.h b/llvm/include/llvm/Analysis/EHUtils.h
index 9cf2f9244e84fb..3ad0878bd64f88 100644
--- a/llvm/include/llvm/Analysis/EHUtils.h
+++ b/llvm/include/llvm/Analysis/EHUtils.h
@@ -16,7 +16,6 @@ namespace llvm {
 /// Compute a list of blocks that are only reachable via EH paths.
 template <typename FunctionT, typename BlockT>
 static void computeEHOnlyBlocks(FunctionT &F, DenseSet<BlockT *> &EHBlocks) {
-  assert(EHBlocks.empty() && "Output set should be empty");
   // A block can be unknown if its not reachable from anywhere
   // EH if its only reachable from start blocks via some path through EH pads
   // NonEH if it's reachable from Non EH blocks as well.
@@ -80,7 +79,6 @@ static void computeEHOnlyBlocks(FunctionT &F, DenseSet<BlockT *> &EHBlocks) {
     }
   }
 
-  EHBlocks.clear();
   for (auto Entry : Statuses) {
     if (Entry.second == EH)
       EHBlocks.insert(Entry.first);
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index bd823a0f2d2951..fbf3ca933177f2 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -228,8 +228,8 @@ void SampleProfileProber::computeCFGHash(
 
     for (BasicBlock *Succ : successors(BBPtr)) {
       auto Index = getBlockId(Succ);
-      assert(Index &&
-             "Ignored block(zero ID) should not be used for hash computation");
+      assert(Index && "Ignored block(zero ID) is used for hash computation, it "
+                      "could cause profile checksum mismatch");
       for (int J = 0; J < 4; J++)
         Indexes.push_back((uint8_t)(Index >> (J * 8)));
     }

>From e3560e283eb9e882ce22f5433f3c7aaffbbcd1b3 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Mon, 12 Feb 2024 12:11:02 -0800
Subject: [PATCH 4/5] Extend to ignore new split blocks and unreacheable blocks

---
 .../llvm/Transforms/IPO/SampleProfileProbe.h  |  10 +-
 .../lib/Transforms/IPO/SampleProfileProbe.cpp | 110 ++++++++++++------
 2 files changed, 83 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
index 1f7acdc6276aa4..40b9c647ed7208 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
@@ -81,7 +81,15 @@ class SampleProfileProber {
   uint64_t getFunctionHash() const { return FunctionHash; }
   uint32_t getBlockId(const BasicBlock *BB) const;
   uint32_t getCallsiteId(const Instruction *Call) const;
-  void findInvokeNormalDests(DenseSet<BasicBlock *> &InvokeNormalDests);
+  void findNewSplitBlocks(DenseSet<BasicBlock *> &NewSplitBlocks);
+  void findUnreachableBlocks(DenseSet<BasicBlock *> &BlocksToIgnore);
+  void computeBlocksToIgnore(DenseSet<BasicBlock *> &BlocksToIgnoreProbe,
+                             DenseSet<BasicBlock *> &BlocksToIgnoreCall);
+  void
+  computeProbeIdForCallsites(const DenseSet<BasicBlock *> &BlocksToIgnoreCall);
+  const Instruction *
+  getOriginalTerminator(const BasicBlock *BB,
+                        const DenseSet<BasicBlock *> &BlocksToIgnore);
   void computeCFGHash(const DenseSet<BasicBlock *> &BlocksToIgnore);
   void computeProbeIdForBlocks(const DenseSet<BasicBlock *> &BlocksToIgnore);
   void computeProbeIdForCallsites();
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index fbf3ca933177f2..3de3a617c861e8 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -174,32 +174,76 @@ SampleProfileProber::SampleProfileProber(Function &Func,
   CallProbeIds.clear();
   LastProbeId = (uint32_t)PseudoProbeReservedId::Last;
 
-  DenseSet<BasicBlock *> BlocksToIgnore;
-  // Ignore the cold EH blocks. This will reduce IR size as
-  // well as the binary size while retaining the profile quality.
-  computeEHOnlyBlocks(*F, BlocksToIgnore);
-  // While optimizing nounwind attribute, the frondend may generate unstable IR,
-  // e.g. some versions are optimized with the call-to-invoke conversion, while
-  // other versions do not. This discrepancy in probe ID could cause profile
-  // mismatching issues. To make the probe ID consistent, we can ignore all the
-  // EH flows. Specifically, we can ignore the normal dest block which
-  // originating from the same block as the call/invoke block and the unwind
-  // dest block(computed in computeEHOnlyBlocks), which is a cold block. It
-  // doesn't affect the profile quality.
-  findInvokeNormalDests(BlocksToIgnore);
-
-  computeProbeIdForBlocks(BlocksToIgnore);
-  computeProbeIdForCallsites();
-  computeCFGHash(BlocksToIgnore);
+  DenseSet<BasicBlock *> BlocksToIgnoreProbe;
+  DenseSet<BasicBlock *> BlocksToIgnoreCall;
+  computeBlocksToIgnore(BlocksToIgnoreProbe, BlocksToIgnoreCall);
+
+  computeProbeIdForBlocks(BlocksToIgnoreProbe);
+  computeProbeIdForCallsites(BlocksToIgnoreCall);
+  computeCFGHash(BlocksToIgnoreProbe);
+}
+
+// Two purposes to compute the blocks to ignore:
+// 1. Reduce the IR size.
+// 2. Make the instrumentation(checksum mismatch) stable. e.g. the frondend may
+// generate unstable IR while optimizing nounwind attribute, some versions are
+// optimized with the call-to-invoke conversion, while other versions do not.
+// This discrepancy in probe ID could cause profile mismatching issues.
+// Note that those ignored blocks are either cold blocks or new split blocks
+// whose original blocks are instrumented, so it shouldn't degrade the profile
+// quailty.
+void SampleProfileProber::computeBlocksToIgnore(
+    DenseSet<BasicBlock *> &BlocksToIgnoreProbe,
+    DenseSet<BasicBlock *> &BlocksToIgnoreCall) {
+  // Ignore the cold EH blocks.
+  computeEHOnlyBlocks(*F, BlocksToIgnoreCall);
+  findUnreachableBlocks(BlocksToIgnoreCall);
+
+  BlocksToIgnoreProbe.insert(BlocksToIgnoreCall.begin(),
+                             BlocksToIgnoreCall.end());
+  findNewSplitBlocks(BlocksToIgnoreProbe);
+}
+
+void SampleProfileProber::findUnreachableBlocks(
+    DenseSet<BasicBlock *> &BlocksToIgnore) {
+  for (auto &BB : *F) {
+    if (&BB != &F->getEntryBlock() && pred_size(&BB) == 0)
+      BlocksToIgnore.insert(&BB);
+  }
 }
 
-void SampleProfileProber::findInvokeNormalDests(
-    DenseSet<BasicBlock *> &InvokeNormalDests) {
+// Basic block can be split into multiple blocks, e.g. due to the
+// call-to-invoke. If they are hotness-wise equal, we can optimize to only
+// instrument the leading block, ignore the other new split blocks.
+void SampleProfileProber::findNewSplitBlocks(
+    DenseSet<BasicBlock *> &NewSplitBlocks) {
   for (auto &BB : *F) {
+    // Blocks connected by unconditional branch are hotness-wise equal, ignore
+    // the second block.
+    if (pred_size(&BB) == 1 && succ_size(*pred_begin(&BB)) == 1)
+      NewSplitBlocks.insert(&BB);
+
+    // For call-to-invoke conversion, the unwind dest is usually cold, so ignore
+    // the normal dest of invoke as the new split BBs.
     auto *TI = BB.getTerminator();
     if (auto *II = dyn_cast<InvokeInst>(TI))
-      InvokeNormalDests.insert(II->getNormalDest());
+      NewSplitBlocks.insert(II->getNormalDest());
+  }
+}
+
+// To keep the CFG Hash consistent before and after the block split opt(such as
+// call-to-invoke conversion), we need to compute the hash using the original
+// BB's successors for the new split BB. It keep searching to find the leaf
+// new-split BB, the leaf's successors are the original BB's successors.
+const Instruction *SampleProfileProber::getOriginalTerminator(
+    const BasicBlock *BB, const DenseSet<BasicBlock *> &BlocksToIgnore) {
+  auto *TI = BB->getTerminator();
+  if (auto *II = dyn_cast<InvokeInst>(TI)) {
+    return getOriginalTerminator(II->getNormalDest(), BlocksToIgnore);
+  } else if (succ_size(BB) == 1 && BlocksToIgnore.contains(*succ_begin(BB))) {
+    return getOriginalTerminator(*succ_begin(BB), BlocksToIgnore);
   }
+  return TI;
 }
 
 // Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
@@ -213,23 +257,14 @@ void SampleProfileProber::computeCFGHash(
   for (auto &BB : *F) {
     if (BlocksToIgnore.contains(&BB))
       continue;
-    // To keep the CFG Hash consistent before and after the call-to-invoke
-    // conversion, we need to compute the hash using the original call BB's
-    // successors for the invoke BB. As the current invoke BB's
-    // successors(normal dest and unwind dest) are ignored, we keep searching to
-    // find the leaf normal dest, the leaf's successors are the original call's
-    // successors.
-    auto *BBPtr = &BB;
-    auto *TI = BBPtr->getTerminator();
-    while (auto *II = dyn_cast<InvokeInst>(TI)) {
-      BBPtr = II->getNormalDest();
-      TI = BBPtr->getTerminator();
-    }
 
-    for (BasicBlock *Succ : successors(BBPtr)) {
+    auto *TI = getOriginalTerminator(&BB, BlocksToIgnore);
+    for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
+      auto *Succ = TI->getSuccessor(I);
       auto Index = getBlockId(Succ);
-      assert(Index && "Ignored block(zero ID) is used for hash computation, it "
-                      "could cause profile checksum mismatch");
+      // Ingore ignored-block(zero ID) to avoid unstable checksum.
+      if (Index == 0)
+        continue;
       for (int J = 0; J < 4; J++)
         Indexes.push_back((uint8_t)(Index >> (J * 8)));
     }
@@ -258,11 +293,14 @@ void SampleProfileProber::computeProbeIdForBlocks(
   }
 }
 
-void SampleProfileProber::computeProbeIdForCallsites() {
+void SampleProfileProber::computeProbeIdForCallsites(
+    const DenseSet<BasicBlock *> &BlocksToIgnoreCall) {
   LLVMContext &Ctx = F->getContext();
   Module *M = F->getParent();
 
   for (auto &BB : *F) {
+    if (BlocksToIgnoreCall.contains(&BB))
+      continue;
     for (auto &I : BB) {
       if (!isa<CallBase>(I))
         continue;

>From cbc862ae88cc0835632e594598f36658fb7a3863 Mon Sep 17 00:00:00 2001
From: wlei <wlei at fb.com>
Date: Tue, 27 Feb 2024 15:06:38 -0800
Subject: [PATCH 5/5] Limit mergeable into predecessor case only to EH flow

---
 .../llvm/Transforms/IPO/SampleProfileProbe.h  | 12 +--
 .../lib/Transforms/IPO/SampleProfileProbe.cpp | 90 ++++++++++---------
 2 files changed, 56 insertions(+), 46 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
index 40b9c647ed7208..03aa93ce6bd387 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
@@ -81,14 +81,14 @@ class SampleProfileProber {
   uint64_t getFunctionHash() const { return FunctionHash; }
   uint32_t getBlockId(const BasicBlock *BB) const;
   uint32_t getCallsiteId(const Instruction *Call) const;
-  void findNewSplitBlocks(DenseSet<BasicBlock *> &NewSplitBlocks);
   void findUnreachableBlocks(DenseSet<BasicBlock *> &BlocksToIgnore);
-  void computeBlocksToIgnore(DenseSet<BasicBlock *> &BlocksToIgnoreProbe,
-                             DenseSet<BasicBlock *> &BlocksToIgnoreCall);
-  void
-  computeProbeIdForCallsites(const DenseSet<BasicBlock *> &BlocksToIgnoreCall);
+  void findInvokeNormalDests(DenseSet<BasicBlock *> &InvokeNormalDests);
+  void computeBlocksToIgnore(DenseSet<BasicBlock *> &BlocksToIgnore,
+                             DenseSet<BasicBlock *> &BlocksAndCallsToIgnore);
+  void computeProbeIdForCallsites(
+      const DenseSet<BasicBlock *> &BlocksAndCallsToIgnore);
   const Instruction *
-  getOriginalTerminator(const BasicBlock *BB,
+  getOriginalTerminator(const BasicBlock *Head,
                         const DenseSet<BasicBlock *> &BlocksToIgnore);
   void computeCFGHash(const DenseSet<BasicBlock *> &BlocksToIgnore);
   void computeProbeIdForBlocks(const DenseSet<BasicBlock *> &BlocksToIgnore);
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index 3de3a617c861e8..8ba75f8603e533 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -174,18 +174,18 @@ SampleProfileProber::SampleProfileProber(Function &Func,
   CallProbeIds.clear();
   LastProbeId = (uint32_t)PseudoProbeReservedId::Last;
 
-  DenseSet<BasicBlock *> BlocksToIgnoreProbe;
-  DenseSet<BasicBlock *> BlocksToIgnoreCall;
-  computeBlocksToIgnore(BlocksToIgnoreProbe, BlocksToIgnoreCall);
+  DenseSet<BasicBlock *> BlocksToIgnore;
+  DenseSet<BasicBlock *> BlocksAndCallsToIgnore;
+  computeBlocksToIgnore(BlocksToIgnore, BlocksAndCallsToIgnore);
 
-  computeProbeIdForBlocks(BlocksToIgnoreProbe);
-  computeProbeIdForCallsites(BlocksToIgnoreCall);
-  computeCFGHash(BlocksToIgnoreProbe);
+  computeProbeIdForBlocks(BlocksToIgnore);
+  computeProbeIdForCallsites(BlocksAndCallsToIgnore);
+  computeCFGHash(BlocksToIgnore);
 }
 
 // Two purposes to compute the blocks to ignore:
 // 1. Reduce the IR size.
-// 2. Make the instrumentation(checksum mismatch) stable. e.g. the frondend may
+// 2. Make the instrumentation(checksum) stable. e.g. the frondend may
 // generate unstable IR while optimizing nounwind attribute, some versions are
 // optimized with the call-to-invoke conversion, while other versions do not.
 // This discrepancy in probe ID could cause profile mismatching issues.
@@ -193,15 +193,17 @@ SampleProfileProber::SampleProfileProber(Function &Func,
 // whose original blocks are instrumented, so it shouldn't degrade the profile
 // quailty.
 void SampleProfileProber::computeBlocksToIgnore(
-    DenseSet<BasicBlock *> &BlocksToIgnoreProbe,
-    DenseSet<BasicBlock *> &BlocksToIgnoreCall) {
-  // Ignore the cold EH blocks.
-  computeEHOnlyBlocks(*F, BlocksToIgnoreCall);
-  findUnreachableBlocks(BlocksToIgnoreCall);
-
-  BlocksToIgnoreProbe.insert(BlocksToIgnoreCall.begin(),
-                             BlocksToIgnoreCall.end());
-  findNewSplitBlocks(BlocksToIgnoreProbe);
+    DenseSet<BasicBlock *> &BlocksToIgnore,
+    DenseSet<BasicBlock *> &BlocksAndCallsToIgnore) {
+  // Ignore the cold EH and unreachable blocks and calls.
+  computeEHOnlyBlocks(*F, BlocksAndCallsToIgnore);
+  findUnreachableBlocks(BlocksAndCallsToIgnore);
+
+  BlocksToIgnore.insert(BlocksAndCallsToIgnore.begin(),
+                        BlocksAndCallsToIgnore.end());
+  // Handle the call-to-invoke conversion case, ignore the normal dests of
+  // invoke.
+  findInvokeNormalDests(BlocksToIgnore);
 }
 
 void SampleProfileProber::findUnreachableBlocks(
@@ -212,36 +214,44 @@ void SampleProfileProber::findUnreachableBlocks(
   }
 }
 
-// Basic block can be split into multiple blocks, e.g. due to the
-// call-to-invoke. If they are hotness-wise equal, we can optimize to only
-// instrument the leading block, ignore the other new split blocks.
-void SampleProfileProber::findNewSplitBlocks(
-    DenseSet<BasicBlock *> &NewSplitBlocks) {
+// In call-to-invoke conversion, basic block can be split into multiple blocks,
+// only instrument probe in the head block, ignore the normal dests.
+void SampleProfileProber::findInvokeNormalDests(
+    DenseSet<BasicBlock *> &InvokeNormalDests) {
   for (auto &BB : *F) {
-    // Blocks connected by unconditional branch are hotness-wise equal, ignore
-    // the second block.
-    if (pred_size(&BB) == 1 && succ_size(*pred_begin(&BB)) == 1)
-      NewSplitBlocks.insert(&BB);
-
-    // For call-to-invoke conversion, the unwind dest is usually cold, so ignore
-    // the normal dest of invoke as the new split BBs.
     auto *TI = BB.getTerminator();
-    if (auto *II = dyn_cast<InvokeInst>(TI))
-      NewSplitBlocks.insert(II->getNormalDest());
+    if (auto *II = dyn_cast<InvokeInst>(TI)) {
+      auto *ND = II->getNormalDest();
+      InvokeNormalDests.insert(ND);
+
+      // The normal dest and the try/catch block are connected by an
+      // unconditional branch.
+      while (pred_size(ND) == 1) {
+        auto *Pred = *pred_begin(ND);
+        if (succ_size(Pred) == 1) {
+          InvokeNormalDests.insert(Pred);
+          ND = Pred;
+        } else
+          break;
+      }
+    }
   }
 }
 
-// To keep the CFG Hash consistent before and after the block split opt(such as
-// call-to-invoke conversion), we need to compute the hash using the original
-// BB's successors for the new split BB. It keep searching to find the leaf
-// new-split BB, the leaf's successors are the original BB's successors.
+// The call-to-invoke conversion splits the original block into a list of block,
+// we need to compute the hash using the original block's successors to keep the
+// CFG Hash consistent. For a given head block, we keep searching the
+// succesor(normal dest or unconditional branch dest) to find the tail block,
+// the tail block's successors are the original block's successors.
 const Instruction *SampleProfileProber::getOriginalTerminator(
-    const BasicBlock *BB, const DenseSet<BasicBlock *> &BlocksToIgnore) {
-  auto *TI = BB->getTerminator();
+    const BasicBlock *Head, const DenseSet<BasicBlock *> &BlocksToIgnore) {
+  auto *TI = Head->getTerminator();
   if (auto *II = dyn_cast<InvokeInst>(TI)) {
     return getOriginalTerminator(II->getNormalDest(), BlocksToIgnore);
-  } else if (succ_size(BB) == 1 && BlocksToIgnore.contains(*succ_begin(BB))) {
-    return getOriginalTerminator(*succ_begin(BB), BlocksToIgnore);
+  } else if (succ_size(Head) == 1 &&
+             BlocksToIgnore.contains(*succ_begin(Head))) {
+    // Go to the unconditional branch dest.
+    return getOriginalTerminator(*succ_begin(Head), BlocksToIgnore);
   }
   return TI;
 }
@@ -294,12 +304,12 @@ void SampleProfileProber::computeProbeIdForBlocks(
 }
 
 void SampleProfileProber::computeProbeIdForCallsites(
-    const DenseSet<BasicBlock *> &BlocksToIgnoreCall) {
+    const DenseSet<BasicBlock *> &BlocksAndCallsToIgnore) {
   LLVMContext &Ctx = F->getContext();
   Module *M = F->getParent();
 
   for (auto &BB : *F) {
-    if (BlocksToIgnoreCall.contains(&BB))
+    if (BlocksAndCallsToIgnore.contains(&BB))
       continue;
     for (auto &I : BB) {
       if (!isa<CallBase>(I))



More information about the llvm-commits mailing list