[llvm-branch-commits] [llvm] [BOLT] Match blocks with pseudo probes (PR #99891)

Amir Ayupov via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Sep 4 14:31:56 PDT 2024


https://github.com/aaupov updated https://github.com/llvm/llvm-project/pull/99891

>From 0274f697376264c2d77816190f9a434f64e79089 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Mon, 22 Jul 2024 11:56:23 -0700
Subject: [PATCH 01/39] Changed assignment of profiles with pseudo probe index

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp     | 85 +++++++++++++++----
 .../X86/match-blocks-with-pseudo-probes.test  | 25 ++----
 2 files changed, 78 insertions(+), 32 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 4105f626fb5b6a..c135ee5ff48373 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -195,11 +195,15 @@ class StaleMatcher {
   void init(const std::vector<FlowBlock *> &Blocks,
             const std::vector<BlendedBlockHash> &Hashes,
             const std::vector<uint64_t> &CallHashes,
-            std::optional<uint64_t> YamlBFGUID) {
+            const std::unordered_map<uint64_t,
+                                     std::vector<const MCDecodedPseudoProbe *>>
+                IndexToBinaryPseudoProbes,
+            const std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
+                BinaryPseudoProbeToBlock,
+            const uint64_t YamlBFGUID) {
     assert(Blocks.size() == Hashes.size() &&
            Hashes.size() == CallHashes.size() &&
            "incorrect matcher initialization");
-
     for (size_t I = 0; I < Blocks.size(); I++) {
       FlowBlock *Block = Blocks[I];
       uint16_t OpHash = Hashes[I].OpcodeHash;
@@ -209,6 +213,8 @@ class StaleMatcher {
             std::make_pair(Hashes[I], Block));
       this->Blocks.push_back(Block);
     }
+    this->IndexToBinaryPseudoProbes = IndexToBinaryPseudoProbes;
+    this->BinaryPseudoProbeToBlock = BinaryPseudoProbeToBlock;
     this->YamlBFGUID = YamlBFGUID;
   }
 
@@ -234,10 +240,14 @@ class StaleMatcher {
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
   std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
-  std::vector<FlowBlock *> Blocks;
+  std::unordered_map<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
+      IndexToBinaryPseudoProbes;
+  std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
+      BinaryPseudoProbeToBlock;
+  std::vector<const FlowBlock *> Blocks;
   // If the pseudo probe checksums of the profiled and binary functions are
   // equal, then the YamlBF's GUID is defined and used to match blocks.
-  std::optional<uint64_t> YamlBFGUID;
+  uint64_t YamlBFGUID;
 
   // Uses OpcodeHash to find the most similar block for a given hash.
   const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const {
@@ -284,7 +294,7 @@ class StaleMatcher {
     // Searches for the pseudo probe attached to the matched function's block,
     // ignoring pseudo probes attached to function calls and inlined functions'
     // blocks.
-    outs() << "match with pseudo probes\n";
+    std::vector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
     for (const auto &PseudoProbe : PseudoProbes) {
       // Ensures that pseudo probe information belongs to the appropriate
       // function and not an inlined function.
@@ -293,11 +303,30 @@ class StaleMatcher {
       // Skips pseudo probes attached to function calls.
       if (PseudoProbe.Type != static_cast<uint8_t>(PseudoProbeType::Block))
         continue;
-      assert(PseudoProbe.Index < Blocks.size() &&
-             "pseudo probe index out of range");
-      return Blocks[PseudoProbe.Index];
+
+      BlockPseudoProbes.push_back(&PseudoProbe);
     }
-    return nullptr;
+
+    // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo
+    // probe and binary pseudo probe.
+    if (BlockPseudoProbes.size() == 0 || BlockPseudoProbes.size() > 1)
+      return nullptr;
+
+    uint64_t Index = BlockPseudoProbes[0]->Index;
+    assert(Index < Blocks.size() && "Invalid pseudo probe index");
+
+    auto It = IndexToBinaryPseudoProbes.find(Index);
+    assert(It != IndexToBinaryPseudoProbes.end() &&
+           "All blocks should have a pseudo probe");
+    if (It->second.size() > 1)
+      return nullptr;
+
+    const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0];
+    auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe);
+    assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() &&
+           "All binary pseudo probes should belong a binary basic block");
+
+    return BinaryPseudoProbeIt->second;
   }
 };
 
@@ -491,6 +520,11 @@ size_t matchWeightsByHashes(
   std::vector<uint64_t> CallHashes;
   std::vector<FlowBlock *> Blocks;
   std::vector<BlendedBlockHash> BlendedHashes;
+  std::unordered_map<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
+      IndexToBinaryPseudoProbes;
+  std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
+      BinaryPseudoProbeToBlock;
+  const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
   for (uint64_t I = 0; I < BlockOrder.size(); I++) {
     const BinaryBasicBlock *BB = BlockOrder[I];
     assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
@@ -510,9 +544,27 @@ size_t matchWeightsByHashes(
     Blocks.push_back(&Func.Blocks[I + 1]);
     BlendedBlockHash BlendedHash(BB->getHash());
     BlendedHashes.push_back(BlendedHash);
+    if (PseudoProbeDecoder) {
+      const AddressProbesMap &ProbeMap =
+          PseudoProbeDecoder->getAddress2ProbesMap();
+      const uint64_t FuncAddr = BF.getAddress();
+      const std::pair<uint64_t, uint64_t> &BlockRange =
+          BB->getInputAddressRange();
+      const auto &BlockProbes =
+          llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first),
+                           ProbeMap.lower_bound(FuncAddr + BlockRange.second));
+      for (const auto &[_, Probes] : BlockProbes) {
+        for (const MCDecodedPseudoProbe &Probe : Probes) {
+          IndexToBinaryPseudoProbes[Probe.getIndex()].push_back(&Probe);
+          BinaryPseudoProbeToBlock[&Probe] = Blocks[I];
+        }
+      }
+    }
+
     LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                       << Twine::utohexstr(BB->getHash()) << "\n");
   }
+
   uint64_t BFPseudoProbeDescHash = 0;
   if (BF.hasPseudoProbe()) {
     const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
@@ -521,14 +573,15 @@ size_t matchWeightsByHashes(
     BFPseudoProbeDescHash =
         PseudoProbeDecoder->getFuncDescForGUID(BF.getGUID())->FuncHash;
   }
-  bool MatchWithPseudoProbes =
-      BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash
-          ? BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash
-          : false;
+  uint64_t YamlBFGUID =
+      BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash &&
+              BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash
+          ? static_cast<uint64_t>(YamlBF.GUID)
+          : 0;
+
   StaleMatcher Matcher;
-  Matcher.init(Blocks, BlendedHashes, CallHashes,
-               MatchWithPseudoProbes ? std::make_optional(YamlBF.GUID)
-                                     : std::nullopt);
+  Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBinaryPseudoProbes,
+               BinaryPseudoProbeToBlock, YamlBFGUID);
 
   // Index in yaml profile => corresponding (matched) block
   DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test
index e0adb6948e2065..1d74b92a11c569 100644
--- a/bolt/test/X86/match-blocks-with-pseudo-probes.test
+++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
 # RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \
-# RUN:   --print-cfg --funcs=main --profile-ignore-hash=0 2>&1 | FileCheck %s
+# RUN:   --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile 2>&1 | FileCheck %s
 
 # CHECK: BOLT-INFO: matched 0 functions with similar names
 
@@ -47,23 +47,16 @@ header:
   dfs-order:       false
   hash-func:       xxh3
 functions:
-  - name:            main
-    fid:             0
-    hash:            0x0000000000000001
-    exec:            1
-    nblocks:         6
+  - name:                   main
+    fid:                    0
+    hash:                   0x0000000000000001
+    exec:                   1
+    nblocks:                6
+    guid:                   0xDB956436E78DD5FA
+    pseudo_probe_desc_hash: 15822663052811949562    #lookup in code in a second
     blocks:
       - bid:             1
         hash:            0x0000000000000001
         insns:           1
         succ:            [ { bid: 3, cnt: 1} ]
-  - name:            foo
-    fid:             1
-    hash:            0x0000000000000002
-    exec:            1
-    nblocks:         6
-    blocks:
-      - bid:             1
-        hash:            0x0000000000000002
-        insns:           1
-        succ:            [ { bid: 3, cnt: 1} ]
+        pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 0, type: 0 } ]

>From 7e3d8d6b171954836c858f0814befc54f70bd3aa Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Mon, 22 Jul 2024 14:27:44 -0700
Subject: [PATCH 02/39] Edit test and assert

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp          | 2 +-
 bolt/test/X86/match-blocks-with-pseudo-probes.test | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index c135ee5ff48373..71e0579415fc69 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -313,7 +313,7 @@ class StaleMatcher {
       return nullptr;
 
     uint64_t Index = BlockPseudoProbes[0]->Index;
-    assert(Index < Blocks.size() && "Invalid pseudo probe index");
+    assert(Index <= Blocks.size() && "Invalid pseudo probe index");
 
     auto It = IndexToBinaryPseudoProbes.find(Index);
     assert(It != IndexToBinaryPseudoProbes.end() &&
diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test
index 1d74b92a11c569..6dc01eb492eaef 100644
--- a/bolt/test/X86/match-blocks-with-pseudo-probes.test
+++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test
@@ -53,10 +53,10 @@ functions:
     exec:                   1
     nblocks:                6
     guid:                   0xDB956436E78DD5FA
-    pseudo_probe_desc_hash: 15822663052811949562    #lookup in code in a second
+    pseudo_probe_desc_hash: 4294967295    #lookup in code in a second
     blocks:
       - bid:             1
         hash:            0x0000000000000001
         insns:           1
         succ:            [ { bid: 3, cnt: 1} ]
-        pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 0, type: 0 } ]
+        pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 1, type: 0 } ]

>From 780a07ee5a4b2bc3f5bd6e33fb072d67d1113c89 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 11:37:14 -0700
Subject: [PATCH 03/39] Fixed failing asserts, pruned prospective pseudo probes
 for matching

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 56 ++++++++++++++++-------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 71e0579415fc69..d45066ed66ef26 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -45,6 +45,7 @@ namespace opts {
 
 extern cl::opt<bool> TimeRewrite;
 extern cl::OptionCategory BoltOptCategory;
+extern cl::opt<unsigned> Verbosity;
 
 cl::opt<bool>
     InferStaleProfile("infer-stale-profile",
@@ -197,9 +198,9 @@ class StaleMatcher {
             const std::vector<uint64_t> &CallHashes,
             const std::unordered_map<uint64_t,
                                      std::vector<const MCDecodedPseudoProbe *>>
-                IndexToBinaryPseudoProbes,
+                &IndexToBinaryPseudoProbes,
             const std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
-                BinaryPseudoProbeToBlock,
+                &BinaryPseudoProbeToBlock,
             const uint64_t YamlBFGUID) {
     assert(Blocks.size() == Hashes.size() &&
            Hashes.size() == CallHashes.size() &&
@@ -294,6 +295,9 @@ class StaleMatcher {
     // Searches for the pseudo probe attached to the matched function's block,
     // ignoring pseudo probes attached to function calls and inlined functions'
     // blocks.
+    if (opts::Verbosity >= 2)
+      outs() << "BOLT-INFO: attempting to match block with pseudo probes\n";
+
     std::vector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
     for (const auto &PseudoProbe : PseudoProbes) {
       // Ensures that pseudo probe information belongs to the appropriate
@@ -306,26 +310,41 @@ class StaleMatcher {
 
       BlockPseudoProbes.push_back(&PseudoProbe);
     }
-
     // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo
     // probe and binary pseudo probe.
-    if (BlockPseudoProbes.size() == 0 || BlockPseudoProbes.size() > 1)
+    if (BlockPseudoProbes.size() == 0) {
+      if (opts::Verbosity >= 2)
+        errs() << "BOLT-WARNING: no pseudo probes in profile block\n";
       return nullptr;
-
+    }
+    if (BlockPseudoProbes.size() > 1) {
+      if (opts::Verbosity >= 2)
+        errs() << "BOLT-WARNING: more than 1 pseudo probes in profile block\n";
+      return nullptr;
+    }
     uint64_t Index = BlockPseudoProbes[0]->Index;
-    assert(Index <= Blocks.size() && "Invalid pseudo probe index");
-
+    if (Index > Blocks.size()) {
+      if (opts::Verbosity >= 2)
+        errs() << "BOLT-WARNING: invalid index block pseudo probe index\n";
+      return nullptr;
+    }
     auto It = IndexToBinaryPseudoProbes.find(Index);
-    assert(It != IndexToBinaryPseudoProbes.end() &&
-           "All blocks should have a pseudo probe");
-    if (It->second.size() > 1)
+    if (It == IndexToBinaryPseudoProbes.end()) {
+      if (opts::Verbosity >= 2)
+        errs() << "BOLT-WARNING: no block pseudo probes found within binary "
+                  "block at index\n";
       return nullptr;
-
+    }
+    if (It->second.size() > 1) {
+      if (opts::Verbosity >= 2)
+        errs() << "BOLT-WARNING: more than 1 block pseudo probes in binary "
+                  "block at index\n";
+      return nullptr;
+    }
     const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0];
     auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe);
     assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() &&
            "All binary pseudo probes should belong a binary basic block");
-
     return BinaryPseudoProbeIt->second;
   }
 };
@@ -555,6 +574,10 @@ size_t matchWeightsByHashes(
                            ProbeMap.lower_bound(FuncAddr + BlockRange.second));
       for (const auto &[_, Probes] : BlockProbes) {
         for (const MCDecodedPseudoProbe &Probe : Probes) {
+          if (Probe.getInlineTreeNode()->hasInlineSite())
+            continue;
+          if (Probe.getType() != static_cast<uint8_t>(PseudoProbeType::Block))
+            continue;
           IndexToBinaryPseudoProbes[Probe.getIndex()].push_back(&Probe);
           BinaryPseudoProbeToBlock[&Probe] = Blocks[I];
         }
@@ -566,12 +589,13 @@ size_t matchWeightsByHashes(
   }
 
   uint64_t BFPseudoProbeDescHash = 0;
-  if (BF.hasPseudoProbe()) {
-    const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
+  if (BF.getGUID() != 0) {
     assert(PseudoProbeDecoder &&
            "If BF has pseudo probe, BC should have a pseudo probe decoder");
-    BFPseudoProbeDescHash =
-        PseudoProbeDecoder->getFuncDescForGUID(BF.getGUID())->FuncHash;
+    auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap();
+    auto It = GUID2FuncDescMap.find(BF.getGUID());
+    if (It != GUID2FuncDescMap.end())
+      BFPseudoProbeDescHash = It->second.FuncHash;
   }
   uint64_t YamlBFGUID =
       BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash &&

>From 1638ac1dacec63d9099ae3c19f2fee7c0797ed71 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 14:24:02 -0700
Subject: [PATCH 04/39] Added logging for pseudo probe block matching

Created using spr 1.3.4
---
 bolt/include/bolt/Core/BinaryContext.h    | 12 ++++++---
 bolt/lib/Passes/BinaryPasses.cpp          | 18 +++++++++++---
 bolt/lib/Profile/StaleProfileMatching.cpp | 30 +++++++++++++++++------
 3 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index b3cf9f834cc083..39f2ac512d3056 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -717,12 +717,16 @@ class BinaryContext {
     /// Stats for stale profile matching:
     ///   the total number of basic blocks in the profile
     uint32_t NumStaleBlocks{0};
-    ///   the number of matched basic blocks
-    uint32_t NumMatchedBlocks{0};
+    ///   the number of exactly matched basic blocks
+    uint32_t NumExactMatchedBlocks{0};
+    ///   the number of pseudo probe matched basic blocks
+    uint32_t NumPseudoProbeMatchedBlocks{0};
     ///   the total count of samples in the profile
     uint64_t StaleSampleCount{0};
-    ///   the count of matched samples
-    uint64_t MatchedSampleCount{0};
+    ///   the count of exactly matched samples
+    uint64_t ExactMatchedSampleCount{0};
+    ///   the count of pseudo probe matched samples
+    uint64_t PseudoProbeMatchedSampleCount{0};
     ///   the number of stale functions that have matching number of blocks in
     ///   the profile
     uint64_t NumStaleFuncsWithEqualBlockCount{0};
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index fa95ad7324ac1c..b786f07a6a6651 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1519,10 +1519,20 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
         " (%zu out of %zu stale) responsible for %.2f%% samples"
         " (%zu out of %zu stale)\n",
-        100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
-        BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
-        100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
-        BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
+        100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks,
+        BC.Stats.NumExactMatchedBlocks, BC.Stats.NumStaleBlocks,
+        100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount,
+        BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount);
+    BC.outs() << format(
+        "BOLT-INFO: inference found a pseudo probe match for %.2f%% of basic "
+        "blocks"
+        " (%zu out of %zu stale) responsible for %.2f%% samples"
+        " (%zu out of %zu stale)\n",
+        100.0 * BC.Stats.NumPseudoProbeMatchedBlocks / BC.Stats.NumStaleBlocks,
+        BC.Stats.NumPseudoProbeMatchedBlocks, BC.Stats.NumStaleBlocks,
+        100.0 * BC.Stats.PseudoProbeMatchedSampleCount /
+            BC.Stats.StaleSampleCount,
+        BC.Stats.PseudoProbeMatchedSampleCount, BC.Stats.StaleSampleCount);
   }
 
   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index d45066ed66ef26..919f3a732b3552 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -220,13 +220,14 @@ class StaleMatcher {
   }
 
   /// Find the most similar block for a given hash.
-  const FlowBlock *matchBlock(
-      BlendedBlockHash BlendedHash, uint64_t CallHash,
-      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
+  const FlowBlock *
+  matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash,
+             const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) {
     const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash);
     BestBlock = BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash);
-    return BestBlock || !YamlBFGUID ? BestBlock
-                                    : matchWithPseudoProbes(PseudoProbes);
+    return BestBlock || !YamlBFGUID
+               ? BestBlock
+               : matchWithPseudoProbes(BlendedHash, PseudoProbes);
   }
 
   /// Returns true if the two basic blocks (in the binary and in the profile)
@@ -237,6 +238,11 @@ class StaleMatcher {
     return Hash1.InstrHash == Hash2.InstrHash;
   }
 
+  bool isPseudoProbeMatch(BlendedBlockHash YamlBBHash) {
+    return MatchedWithPseudoProbes.find(YamlBBHash.combine()) !=
+           MatchedWithPseudoProbes.end();
+  }
+
 private:
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
@@ -245,6 +251,7 @@ class StaleMatcher {
       IndexToBinaryPseudoProbes;
   std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
       BinaryPseudoProbeToBlock;
+  std::unordered_set<uint64_t> MatchedWithPseudoProbes;
   std::vector<const FlowBlock *> Blocks;
   // If the pseudo probe checksums of the profiled and binary functions are
   // equal, then the YamlBF's GUID is defined and used to match blocks.
@@ -291,7 +298,8 @@ class StaleMatcher {
   // Uses pseudo probe information to attach the profile to the appropriate
   // block.
   const FlowBlock *matchWithPseudoProbes(
-      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
+      BlendedBlockHash BlendedHash,
+      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) {
     // Searches for the pseudo probe attached to the matched function's block,
     // ignoring pseudo probes attached to function calls and inlined functions'
     // blocks.
@@ -345,6 +353,8 @@ class StaleMatcher {
     auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe);
     assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() &&
            "All binary pseudo probes should belong a binary basic block");
+
+    MatchedWithPseudoProbes.insert(BlendedHash.combine());
     return BinaryPseudoProbeIt->second;
   }
 };
@@ -639,9 +649,13 @@ size_t matchWeightsByHashes(
                         << "\n");
       // Update matching stats accounting for the matched block.
       if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
-        ++BC.Stats.NumMatchedBlocks;
-        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
+        ++BC.Stats.NumExactMatchedBlocks;
+        BC.Stats.ExactMatchedSampleCount += YamlBB.ExecCount;
         LLVM_DEBUG(dbgs() << "  exact match\n");
+      } else if (Matcher.isPseudoProbeMatch(YamlHash)) {
+        ++BC.Stats.NumPseudoProbeMatchedBlocks;
+        BC.Stats.PseudoProbeMatchedSampleCount += YamlBB.ExecCount;
+        LLVM_DEBUG(dbgs() << "  pseudo probe match\n");
       } else {
         LLVM_DEBUG(dbgs() << "  loose match\n");
       }

>From 144716be84d2207ee98fb238b88c6495942dec21 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 15:41:31 -0700
Subject: [PATCH 05/39] Changed pseudo probe matching failure logging to v=3

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 919f3a732b3552..2d1a73bd60e8f1 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -255,7 +255,7 @@ class StaleMatcher {
   std::vector<const FlowBlock *> Blocks;
   // If the pseudo probe checksums of the profiled and binary functions are
   // equal, then the YamlBF's GUID is defined and used to match blocks.
-  uint64_t YamlBFGUID;
+  uint64_t YamlBFGUID{0};
 
   // Uses OpcodeHash to find the most similar block for a given hash.
   const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const {
@@ -321,30 +321,30 @@ class StaleMatcher {
     // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo
     // probe and binary pseudo probe.
     if (BlockPseudoProbes.size() == 0) {
-      if (opts::Verbosity >= 2)
+      if (opts::Verbosity >= 3)
         errs() << "BOLT-WARNING: no pseudo probes in profile block\n";
       return nullptr;
     }
     if (BlockPseudoProbes.size() > 1) {
-      if (opts::Verbosity >= 2)
+      if (opts::Verbosity >= 3)
         errs() << "BOLT-WARNING: more than 1 pseudo probes in profile block\n";
       return nullptr;
     }
     uint64_t Index = BlockPseudoProbes[0]->Index;
     if (Index > Blocks.size()) {
-      if (opts::Verbosity >= 2)
+      if (opts::Verbosity >= 3)
         errs() << "BOLT-WARNING: invalid index block pseudo probe index\n";
       return nullptr;
     }
     auto It = IndexToBinaryPseudoProbes.find(Index);
     if (It == IndexToBinaryPseudoProbes.end()) {
-      if (opts::Verbosity >= 2)
+      if (opts::Verbosity >= 3)
         errs() << "BOLT-WARNING: no block pseudo probes found within binary "
                   "block at index\n";
       return nullptr;
     }
     if (It->second.size() > 1) {
-      if (opts::Verbosity >= 2)
+      if (opts::Verbosity >= 3)
         errs() << "BOLT-WARNING: more than 1 block pseudo probes in binary "
                   "block at index\n";
       return nullptr;

>From 29347109ada65c82fef3aa0803b18c413d9c4e6b Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 15:48:14 -0700
Subject: [PATCH 06/39] More logging

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 2d1a73bd60e8f1..3762d91ea9489b 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -238,11 +238,17 @@ class StaleMatcher {
     return Hash1.InstrHash == Hash2.InstrHash;
   }
 
+  /// Returns true if a profiled block was matched with its pseudo probe.
   bool isPseudoProbeMatch(BlendedBlockHash YamlBBHash) {
     return MatchedWithPseudoProbes.find(YamlBBHash.combine()) !=
            MatchedWithPseudoProbes.end();
   }
 
+  /// Returns the number of blocks matched with pseudo probes.
+  size_t getNumBlocksMatchedWithPseudoProbes() const {
+    return MatchedWithPseudoProbes.size();
+  } 
+
 private:
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
@@ -303,7 +309,7 @@ class StaleMatcher {
     // Searches for the pseudo probe attached to the matched function's block,
     // ignoring pseudo probes attached to function calls and inlined functions'
     // blocks.
-    if (opts::Verbosity >= 2)
+    if (opts::Verbosity >= 3)
       outs() << "BOLT-INFO: attempting to match block with pseudo probes\n";
 
     std::vector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
@@ -672,6 +678,11 @@ size_t matchWeightsByHashes(
     BC.Stats.StaleSampleCount += YamlBB.ExecCount;
   }
 
+  if (opts::Verbosity >= 2)
+    outs() << "BOLT-INFO: " 
+      << StaleMatcher.getNumBlocksMatchedWithPseudoProbes()
+      << " blocks matched with pseudo probes\n";
+
   // Match jumps from the profile to the jumps from CFG
   std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
   std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);

>From b74fc8b2f200b776dcf0e51d505e4e43267ef938 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 16:03:21 -0700
Subject: [PATCH 07/39] Logging blocks matched with opcodes

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 3762d91ea9489b..b31bddd47edf9c 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -224,6 +224,8 @@ class StaleMatcher {
   matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash,
              const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) {
     const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash);
+    if (BestBlock)
+      ++MatchedWithOpcodes;
     BestBlock = BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash);
     return BestBlock || !YamlBFGUID
                ? BestBlock
@@ -247,7 +249,10 @@ class StaleMatcher {
   /// Returns the number of blocks matched with pseudo probes.
   size_t getNumBlocksMatchedWithPseudoProbes() const {
     return MatchedWithPseudoProbes.size();
-  } 
+  }
+
+  /// Returns the number of blocks matched with opcodes.
+  size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; }
 
 private:
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
@@ -259,9 +264,8 @@ class StaleMatcher {
       BinaryPseudoProbeToBlock;
   std::unordered_set<uint64_t> MatchedWithPseudoProbes;
   std::vector<const FlowBlock *> Blocks;
-  // If the pseudo probe checksums of the profiled and binary functions are
-  // equal, then the YamlBF's GUID is defined and used to match blocks.
   uint64_t YamlBFGUID{0};
+  uint64_t MatchedWithOpcodes{0};
 
   // Uses OpcodeHash to find the most similar block for a given hash.
   const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const {
@@ -678,10 +682,13 @@ size_t matchWeightsByHashes(
     BC.Stats.StaleSampleCount += YamlBB.ExecCount;
   }
 
-  if (opts::Verbosity >= 2)
-    outs() << "BOLT-INFO: " 
-      << StaleMatcher.getNumBlocksMatchedWithPseudoProbes()
-      << " blocks matched with pseudo probes\n";
+  if (opts::Verbosity >= 2) {
+    outs() << "BOLT-INFO: "
+           << StaleMatcher.getNumBlocksMatchedWithPseudoProbes()
+           << " blocks matched with pseudo probes\n"
+           << "BOLT-INFO: " << StaleMatcher.getNumBlocksMatchedWithOpcodes()
+           << " blocks matched with opcodes\n";
+  }
 
   // Match jumps from the profile to the jumps from CFG
   std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);

>From c38fb98fb287d881ce8162fde0522d60b43da56f Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 16:10:09 -0700
Subject: [PATCH 08/39] Updated test

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp          | 4 ++--
 bolt/test/X86/match-blocks-with-pseudo-probes.test | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index b31bddd47edf9c..c621c29a0db83c 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -684,9 +684,9 @@ size_t matchWeightsByHashes(
 
   if (opts::Verbosity >= 2) {
     outs() << "BOLT-INFO: "
-           << StaleMatcher.getNumBlocksMatchedWithPseudoProbes()
+           << Matcher.getNumBlocksMatchedWithPseudoProbes()
            << " blocks matched with pseudo probes\n"
-           << "BOLT-INFO: " << StaleMatcher.getNumBlocksMatchedWithOpcodes()
+           << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithOpcodes()
            << " blocks matched with opcodes\n";
   }
 
diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test
index 6dc01eb492eaef..83f9c20f31ba6f 100644
--- a/bolt/test/X86/match-blocks-with-pseudo-probes.test
+++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test
@@ -7,7 +7,7 @@
 # RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \
 # RUN:   --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile 2>&1 | FileCheck %s
 
-# CHECK: BOLT-INFO: matched 0 functions with similar names
+# CHECK: BOLT-INFO: inference found a pseudo probe match for 100.00% of basic blocks (1 out of 1 stale) responsible for -nan% samples (0 out of 0 stale)
 
 #--- main.s
  .text

>From b2a3ca7fd532828ae7320da6f888f20a1717bb92 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 16:14:34 -0700
Subject: [PATCH 09/39] Name changes in prep for inlined block pseudo probe
 block matching

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 35 +++++++++++------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index c621c29a0db83c..4410fddaf0b21e 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -198,9 +198,9 @@ class StaleMatcher {
             const std::vector<uint64_t> &CallHashes,
             const std::unordered_map<uint64_t,
                                      std::vector<const MCDecodedPseudoProbe *>>
-                &IndexToBinaryPseudoProbes,
+                &IndexToBBPseudoProbes,
             const std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
-                &BinaryPseudoProbeToBlock,
+                &BBPseudoProbeToBlock,
             const uint64_t YamlBFGUID) {
     assert(Blocks.size() == Hashes.size() &&
            Hashes.size() == CallHashes.size() &&
@@ -214,8 +214,8 @@ class StaleMatcher {
             std::make_pair(Hashes[I], Block));
       this->Blocks.push_back(Block);
     }
-    this->IndexToBinaryPseudoProbes = IndexToBinaryPseudoProbes;
-    this->BinaryPseudoProbeToBlock = BinaryPseudoProbeToBlock;
+    this->IndexToBBPseudoProbes = IndexToBBPseudoProbes;
+    this->BBPseudoProbeToBlock = BBPseudoProbeToBlock;
     this->YamlBFGUID = YamlBFGUID;
   }
 
@@ -259,9 +259,9 @@ class StaleMatcher {
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
   std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
   std::unordered_map<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
-      IndexToBinaryPseudoProbes;
+      IndexToBBPseudoProbes;
   std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
-      BinaryPseudoProbeToBlock;
+      BBPseudoProbeToBlock;
   std::unordered_set<uint64_t> MatchedWithPseudoProbes;
   std::vector<const FlowBlock *> Blocks;
   uint64_t YamlBFGUID{0};
@@ -346,8 +346,8 @@ class StaleMatcher {
         errs() << "BOLT-WARNING: invalid index block pseudo probe index\n";
       return nullptr;
     }
-    auto It = IndexToBinaryPseudoProbes.find(Index);
-    if (It == IndexToBinaryPseudoProbes.end()) {
+    auto It = IndexToBBPseudoProbes.find(Index);
+    if (It == IndexToBBPseudoProbes.end()) {
       if (opts::Verbosity >= 3)
         errs() << "BOLT-WARNING: no block pseudo probes found within binary "
                   "block at index\n";
@@ -360,8 +360,8 @@ class StaleMatcher {
       return nullptr;
     }
     const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0];
-    auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe);
-    assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() &&
+    auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe);
+    assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() &&
            "All binary pseudo probes should belong a binary basic block");
 
     MatchedWithPseudoProbes.insert(BlendedHash.combine());
@@ -560,9 +560,9 @@ size_t matchWeightsByHashes(
   std::vector<FlowBlock *> Blocks;
   std::vector<BlendedBlockHash> BlendedHashes;
   std::unordered_map<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
-      IndexToBinaryPseudoProbes;
+      IndexToBBPseudoProbes;
   std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
-      BinaryPseudoProbeToBlock;
+      BBPseudoProbeToBlock;
   const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
   for (uint64_t I = 0; I < BlockOrder.size(); I++) {
     const BinaryBasicBlock *BB = BlockOrder[I];
@@ -598,8 +598,8 @@ size_t matchWeightsByHashes(
             continue;
           if (Probe.getType() != static_cast<uint8_t>(PseudoProbeType::Block))
             continue;
-          IndexToBinaryPseudoProbes[Probe.getIndex()].push_back(&Probe);
-          BinaryPseudoProbeToBlock[&Probe] = Blocks[I];
+          IndexToBBPseudoProbes[Probe.getIndex()].push_back(&Probe);
+          BBPseudoProbeToBlock[&Probe] = Blocks[I];
         }
       }
     }
@@ -624,8 +624,8 @@ size_t matchWeightsByHashes(
           : 0;
 
   StaleMatcher Matcher;
-  Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBinaryPseudoProbes,
-               BinaryPseudoProbeToBlock, YamlBFGUID);
+  Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBBPseudoProbes,
+               BBPseudoProbeToBlock, YamlBFGUID);
 
   // Index in yaml profile => corresponding (matched) block
   DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
@@ -683,8 +683,7 @@ size_t matchWeightsByHashes(
   }
 
   if (opts::Verbosity >= 2) {
-    outs() << "BOLT-INFO: "
-           << Matcher.getNumBlocksMatchedWithPseudoProbes()
+    outs() << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithPseudoProbes()
            << " blocks matched with pseudo probes\n"
            << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithOpcodes()
            << " blocks matched with opcodes\n";

>From 2eb7bf2cff7c974a3327879fd46df7348fdb43e3 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 17:16:19 -0700
Subject: [PATCH 10/39] Rm unnecessary Blocks vec in StaleMatcher

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 4410fddaf0b21e..6ee14ef0194bde 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -212,7 +212,6 @@ class StaleMatcher {
       if (CallHashes[I])
         CallHashToBlocks[CallHashes[I]].push_back(
             std::make_pair(Hashes[I], Block));
-      this->Blocks.push_back(Block);
     }
     this->IndexToBBPseudoProbes = IndexToBBPseudoProbes;
     this->BBPseudoProbeToBlock = BBPseudoProbeToBlock;
@@ -263,7 +262,6 @@ class StaleMatcher {
   std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
       BBPseudoProbeToBlock;
   std::unordered_set<uint64_t> MatchedWithPseudoProbes;
-  std::vector<const FlowBlock *> Blocks;
   uint64_t YamlBFGUID{0};
   uint64_t MatchedWithOpcodes{0};
 
@@ -341,11 +339,6 @@ class StaleMatcher {
       return nullptr;
     }
     uint64_t Index = BlockPseudoProbes[0]->Index;
-    if (Index > Blocks.size()) {
-      if (opts::Verbosity >= 3)
-        errs() << "BOLT-WARNING: invalid index block pseudo probe index\n";
-      return nullptr;
-    }
     auto It = IndexToBBPseudoProbes.find(Index);
     if (It == IndexToBBPseudoProbes.end()) {
       if (opts::Verbosity >= 3)

>From 212bd005b53b85596ffe84012546247db99e898f Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 17:28:01 -0700
Subject: [PATCH 11/39] Improved matched block counting

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 6ee14ef0194bde..06557b3c3a3887 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -223,12 +223,19 @@ class StaleMatcher {
   matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash,
              const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) {
     const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash);
-    if (BestBlock)
+    if (BestBlock) {
       ++MatchedWithOpcodes;
-    BestBlock = BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash);
-    return BestBlock || !YamlBFGUID
-               ? BestBlock
-               : matchWithPseudoProbes(BlendedHash, PseudoProbes);
+      return BestBlock;
+    }
+    BestBlock = matchWithCalls(BlendedHash, CallHash);
+    if (BestBlock) {
+      return BestBlock;
+    }
+    BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes);
+    if (BestBlock) {
+      MatchedWithPseudoProbes.insert(BlendedHash.combine());
+    }
+    return BestBlock;
   }
 
   /// Returns true if the two basic blocks (in the binary and in the profile)
@@ -307,7 +314,7 @@ class StaleMatcher {
   // block.
   const FlowBlock *matchWithPseudoProbes(
       BlendedBlockHash BlendedHash,
-      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) {
+      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
     // Searches for the pseudo probe attached to the matched function's block,
     // ignoring pseudo probes attached to function calls and inlined functions'
     // blocks.
@@ -357,7 +364,6 @@ class StaleMatcher {
     assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() &&
            "All binary pseudo probes should belong a binary basic block");
 
-    MatchedWithPseudoProbes.insert(BlendedHash.combine());
     return BinaryPseudoProbeIt->second;
   }
 };

>From eb6dfb973126a245aea21fae5369e06d628ddcdd Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 17:33:55 -0700
Subject: [PATCH 12/39] Removed comment from test

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp          | 6 +++---
 bolt/test/X86/match-blocks-with-pseudo-probes.test | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 06557b3c3a3887..f3e19e8fb100d0 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -252,14 +252,14 @@ class StaleMatcher {
            MatchedWithPseudoProbes.end();
   }
 
+  /// Returns the number of blocks matched with opcodes.
+  size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; }
+
   /// Returns the number of blocks matched with pseudo probes.
   size_t getNumBlocksMatchedWithPseudoProbes() const {
     return MatchedWithPseudoProbes.size();
   }
 
-  /// Returns the number of blocks matched with opcodes.
-  size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; }
-
 private:
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test
index 83f9c20f31ba6f..4a6f2f1cf129af 100644
--- a/bolt/test/X86/match-blocks-with-pseudo-probes.test
+++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test
@@ -53,7 +53,7 @@ functions:
     exec:                   1
     nblocks:                6
     guid:                   0xDB956436E78DD5FA
-    pseudo_probe_desc_hash: 4294967295    #lookup in code in a second
+    pseudo_probe_desc_hash: 4294967295
     blocks:
       - bid:             1
         hash:            0x0000000000000001

>From 16b5cfbbbb37820b00ad07b086481c46aefd0142 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Tue, 23 Jul 2024 17:48:12 -0700
Subject: [PATCH 13/39] Added comments and check for null YamlBFGUID in
 StaleMatcher before PseudoProbe matching

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index f3e19e8fb100d0..b74c1c5071815a 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -218,7 +218,8 @@ class StaleMatcher {
     this->YamlBFGUID = YamlBFGUID;
   }
 
-  /// Find the most similar block for a given hash.
+  /// Find the most similar flow block for a profile block given its hashes and
+  /// pseudo probe information.
   const FlowBlock *
   matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash,
              const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) {
@@ -315,6 +316,8 @@ class StaleMatcher {
   const FlowBlock *matchWithPseudoProbes(
       BlendedBlockHash BlendedHash,
       const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
+    if (!YamlBFGUID)
+      return nullptr;
     // Searches for the pseudo probe attached to the matched function's block,
     // ignoring pseudo probes attached to function calls and inlined functions'
     // blocks.
@@ -582,6 +585,7 @@ size_t matchWeightsByHashes(
     Blocks.push_back(&Func.Blocks[I + 1]);
     BlendedBlockHash BlendedHash(BB->getHash());
     BlendedHashes.push_back(BlendedHash);
+    // Collects pseudo probes attached to the BB for use in the StaleMatcher.
     if (PseudoProbeDecoder) {
       const AddressProbesMap &ProbeMap =
           PseudoProbeDecoder->getAddress2ProbesMap();
@@ -607,6 +611,10 @@ size_t matchWeightsByHashes(
                       << Twine::utohexstr(BB->getHash()) << "\n");
   }
 
+  // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or
+  // binary function dne or they are not equal, to zero, as not to perform
+  // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for
+  // pseudo probe block matching.
   uint64_t BFPseudoProbeDescHash = 0;
   if (BF.getGUID() != 0) {
     assert(PseudoProbeDecoder &&

>From 799f20cf7ed8dfc30d89beadd90d91758cdc9485 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 07:42:26 -0700
Subject: [PATCH 14/39] Omitting braces in one line if

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index b74c1c5071815a..e2fd85373485b6 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -229,9 +229,8 @@ class StaleMatcher {
       return BestBlock;
     }
     BestBlock = matchWithCalls(BlendedHash, CallHash);
-    if (BestBlock) {
+    if (BestBlock)
       return BestBlock;
-    }
     BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes);
     if (BestBlock) {
       MatchedWithPseudoProbes.insert(BlendedHash.combine());

>From 33f1b2ad7809786449b328e3eca93bb9e46694f4 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 09:06:37 -0700
Subject: [PATCH 15/39] Omit unnecessary braces

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index b5ddef210d5e9a..26cbd8250ce588 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -233,9 +233,8 @@ class StaleMatcher {
     if (BestBlock)
       return BestBlock;
     BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes);
-    if (BestBlock) {
+    if (BestBlock)
       MatchedWithPseudoProbes.insert(BlendedHash.combine());
-    }
     return BestBlock;
   }
 

>From 9889f8903c85d1eae4a1cd536bda03d6959c8ba4 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 09:51:22 -0700
Subject: [PATCH 16/39] Change initialization of index -> probe and probe ->
 block mappings

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 81 ++++++++++++-----------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 26cbd8250ce588..6e7525b66aaae4 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -193,16 +193,12 @@ struct BlendedBlockHash {
 /// release.
 class StaleMatcher {
 public:
+  StaleMatcher(const uint64_t YamlBFGUID) : YamlBFGUID(YamlBFGUID) {}
+
   /// Initialize stale matcher.
   void init(const std::vector<FlowBlock *> &Blocks,
             const std::vector<BlendedBlockHash> &Hashes,
-            const std::vector<uint64_t> &CallHashes,
-            const std::unordered_map<uint64_t,
-                                     std::vector<const MCDecodedPseudoProbe *>>
-                &IndexToBBPseudoProbes,
-            const std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
-                &BBPseudoProbeToBlock,
-            const uint64_t YamlBFGUID) {
+            const std::vector<uint64_t> &CallHashes) {
     assert(Blocks.size() == Hashes.size() &&
            Hashes.size() == CallHashes.size() &&
            "incorrect matcher initialization");
@@ -214,9 +210,17 @@ class StaleMatcher {
         CallHashToBlocks[CallHashes[I]].push_back(
             std::make_pair(Hashes[I], Block));
     }
-    this->IndexToBBPseudoProbes = IndexToBBPseudoProbes;
-    this->BBPseudoProbeToBlock = BBPseudoProbeToBlock;
-    this->YamlBFGUID = YamlBFGUID;
+  }
+
+  /// Creates a mapping from a pseudo probe index to block pseudo probe in the
+  /// binary.
+  void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) {
+    IndexToBBPseudoProbes[Index].push_back(Probe);
+  }
+
+  /// Creates a mapping from a pseudo probe to a flow block.
+  void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) {
+    BBPseudoProbeToBlock[Probe] = Block;
   }
 
   /// Find the most similar flow block for a profile block given its hashes and
@@ -269,7 +273,7 @@ class StaleMatcher {
   std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
       BBPseudoProbeToBlock;
   std::unordered_set<uint64_t> MatchedWithPseudoProbes;
-  uint64_t YamlBFGUID{0};
+  const uint64_t YamlBFGUID{0};
   uint64_t MatchedWithOpcodes{0};
 
   // Uses OpcodeHash to find the most similar block for a given hash.
@@ -557,14 +561,32 @@ size_t matchWeightsByHashes(
 
   assert(Func.Blocks.size() == BlockOrder.size() + 2);
 
+  // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or
+  // binary function dne or they are not equal, to zero, as not to perform
+  // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for
+  // pseudo probe block matching.
+  const MCPseudoProbeDecoder *PseudoProbeDecoder =
+      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
+  uint64_t BFPseudoProbeDescHash = 0;
+  if (opts::ProfileUsePseudoProbes && BF.getGUID() != 0) {
+    assert(PseudoProbeDecoder &&
+           "If BF has pseudo probe, BC should have a pseudo probe decoder");
+    auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap();
+    auto It = GUID2FuncDescMap.find(BF.getGUID());
+    if (It != GUID2FuncDescMap.end())
+      BFPseudoProbeDescHash = It->second.FuncHash;
+  }
+  uint64_t YamlBFGUID =
+      BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash &&
+              BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash
+          ? static_cast<uint64_t>(YamlBF.GUID)
+          : 0;
+
+  StaleMatcher Matcher(YamlBFGUID);
   std::vector<uint64_t> CallHashes;
   std::vector<FlowBlock *> Blocks;
   std::vector<BlendedBlockHash> BlendedHashes;
-  std::unordered_map<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
-      IndexToBBPseudoProbes;
-  std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
-      BBPseudoProbeToBlock;
-  const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
+
   for (uint64_t I = 0; I < BlockOrder.size(); I++) {
     const BinaryBasicBlock *BB = BlockOrder[I];
     assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
@@ -600,8 +622,8 @@ size_t matchWeightsByHashes(
             continue;
           if (Probe.getType() != static_cast<uint8_t>(PseudoProbeType::Block))
             continue;
-          IndexToBBPseudoProbes[Probe.getIndex()].push_back(&Probe);
-          BBPseudoProbeToBlock[&Probe] = Blocks[I];
+          Matcher.mapIndexToProbe(Probe.getIndex(), &Probe);
+          Matcher.mapProbeToBB(&Probe, Blocks[I]);
         }
       }
     }
@@ -610,28 +632,7 @@ size_t matchWeightsByHashes(
                       << Twine::utohexstr(BB->getHash()) << "\n");
   }
 
-  // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or
-  // binary function dne or they are not equal, to zero, as not to perform
-  // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for
-  // pseudo probe block matching.
-  uint64_t BFPseudoProbeDescHash = 0;
-  if (opts::ProfileUsePseudoProbes && BF.getGUID() != 0) {
-    assert(PseudoProbeDecoder &&
-           "If BF has pseudo probe, BC should have a pseudo probe decoder");
-    auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap();
-    auto It = GUID2FuncDescMap.find(BF.getGUID());
-    if (It != GUID2FuncDescMap.end())
-      BFPseudoProbeDescHash = It->second.FuncHash;
-  }
-  uint64_t YamlBFGUID =
-      BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash &&
-              BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash
-          ? static_cast<uint64_t>(YamlBF.GUID)
-          : 0;
-
-  StaleMatcher Matcher;
-  Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBBPseudoProbes,
-               BBPseudoProbeToBlock, YamlBFGUID);
+  Matcher.init(Blocks, BlendedHashes, CallHashes);
 
   // Index in yaml profile => corresponding (matched) block
   DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;

>From 022c517af0278979b92b48d75503b4278880d040 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 09:54:02 -0700
Subject: [PATCH 17/39] Formatting

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 6e7525b66aaae4..37140a7dd1e573 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -586,7 +586,6 @@ size_t matchWeightsByHashes(
   std::vector<uint64_t> CallHashes;
   std::vector<FlowBlock *> Blocks;
   std::vector<BlendedBlockHash> BlendedHashes;
-
   for (uint64_t I = 0; I < BlockOrder.size(); I++) {
     const BinaryBasicBlock *BB = BlockOrder[I];
     assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
@@ -631,7 +630,6 @@ size_t matchWeightsByHashes(
     LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                       << Twine::utohexstr(BB->getHash()) << "\n");
   }
-
   Matcher.init(Blocks, BlendedHashes, CallHashes);
 
   // Index in yaml profile => corresponding (matched) block

>From 5109893be3eaf62e756e4837b6a49b9aa4b0824b Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 10:12:25 -0700
Subject: [PATCH 18/39] Comments

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 37140a7dd1e573..ba271cdbb2dc02 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -212,8 +212,7 @@ class StaleMatcher {
     }
   }
 
-  /// Creates a mapping from a pseudo probe index to block pseudo probe in the
-  /// binary.
+  /// Creates a mapping from a pseudo probe index to pseudo probe.
   void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) {
     IndexToBBPseudoProbes[Index].push_back(Probe);
   }
@@ -321,12 +320,13 @@ class StaleMatcher {
       const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
     if (!YamlBFGUID)
       return nullptr;
-    // Searches for the pseudo probe attached to the matched function's block,
-    // ignoring pseudo probes attached to function calls and inlined functions'
-    // blocks.
+
     if (opts::Verbosity >= 3)
       outs() << "BOLT-INFO: attempting to match block with pseudo probes\n";
 
+    // Searches for the pseudo probe attached to the matched function's block,
+    // ignoring pseudo probes attached to function calls and inlined functions'
+    // blocks.
     std::vector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
     for (const auto &PseudoProbe : PseudoProbes) {
       // Ensures that pseudo probe information belongs to the appropriate

>From 5bf42207453830f9690895d9dd4efa78365c7bb1 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 16:09:05 -0700
Subject: [PATCH 19/39] Changed std ADTs to LLVM

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index ba271cdbb2dc02..65eb3b0a419ceb 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -267,11 +267,10 @@ class StaleMatcher {
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
   std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
-  std::unordered_map<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
+  DenseMap<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
       IndexToBBPseudoProbes;
-  std::unordered_map<const MCDecodedPseudoProbe *, FlowBlock *>
-      BBPseudoProbeToBlock;
-  std::unordered_set<uint64_t> MatchedWithPseudoProbes;
+  DenseMap<const MCDecodedPseudoProbe *, FlowBlock *> BBPseudoProbeToBlock;
+  DenseSet<uint64_t> MatchedWithPseudoProbes;
   const uint64_t YamlBFGUID{0};
   uint64_t MatchedWithOpcodes{0};
 
@@ -327,7 +326,7 @@ class StaleMatcher {
     // Searches for the pseudo probe attached to the matched function's block,
     // ignoring pseudo probes attached to function calls and inlined functions'
     // blocks.
-    std::vector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
+    SmallVector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
     for (const auto &PseudoProbe : PseudoProbes) {
       // Ensures that pseudo probe information belongs to the appropriate
       // function and not an inlined function.

>From f1179b11812841b6cdf78fc3c2e166279246cd08 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 16:16:03 -0700
Subject: [PATCH 20/39] In matchWithPseudoProbes, hoist
 BlockPseudoProbes.size(), added logging helper

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 28 +++++++++++++----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 65eb3b0a419ceb..7f45077bf7c86f 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -338,30 +338,34 @@ class StaleMatcher {
 
       BlockPseudoProbes.push_back(&PseudoProbe);
     }
+
+    auto LogPseudoProbeBlockMatchFail = [&](std::string Message) {
+      if (opts::Verbosity >= 3)
+        errs() << Message;
+    };
     // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo
     // probe and binary pseudo probe.
-    if (BlockPseudoProbes.size() == 0) {
-      if (opts::Verbosity >= 3)
-        errs() << "BOLT-WARNING: no pseudo probes in profile block\n";
+    size_t NBlockPseudoProbes = BlockPseudoProbes.size();
+    if (NBlockPseudoProbes == 0) {
+      LogPseudoProbeBlockMatchFail(
+        "BOLT-WARNING: no pseudo probes in profile block\n");
       return nullptr;
     }
-    if (BlockPseudoProbes.size() > 1) {
-      if (opts::Verbosity >= 3)
-        errs() << "BOLT-WARNING: more than 1 pseudo probes in profile block\n";
+    if (BNBlockPseudoProbes > 1) {
+      LogPseudoProbeBlockMatchFail(
+        "BOLT-WARNING: more than 1 pseudo probes in profile block\n");
       return nullptr;
     }
     uint64_t Index = BlockPseudoProbes[0]->Index;
     auto It = IndexToBBPseudoProbes.find(Index);
     if (It == IndexToBBPseudoProbes.end()) {
-      if (opts::Verbosity >= 3)
-        errs() << "BOLT-WARNING: no block pseudo probes found within binary "
-                  "block at index\n";
+      LogPseudoProbeBlockMatchFail(
+        "BOLT-WARNING: no block pseudo probes found within BB at index\n");
       return nullptr;
     }
     if (It->second.size() > 1) {
-      if (opts::Verbosity >= 3)
-        errs() << "BOLT-WARNING: more than 1 block pseudo probes in binary "
-                  "block at index\n";
+      LogPseudoProbeBlockMatchFail(
+        "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n");
       return nullptr;
     }
     const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0];

>From 5076bab518abdf7994111026898d4d6c08f21e2b Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Wed, 24 Jul 2024 16:23:42 -0700
Subject: [PATCH 21/39] A more beautiful helper function for
 matchWithPseudoProbes

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 32 ++++++++++-------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 7f45077bf7c86f..9d2584767c478f 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -339,35 +339,31 @@ class StaleMatcher {
       BlockPseudoProbes.push_back(&PseudoProbe);
     }
 
-    auto LogPseudoProbeBlockMatchFail = [&](std::string Message) {
-      if (opts::Verbosity >= 3)
+    auto LogErrIfExpr = [&](bool Expr, std::string Message) -> bool {
+      if (Expr)
         errs() << Message;
+      return Expr;
     };
     // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo
     // probe and binary pseudo probe.
     size_t NBlockPseudoProbes = BlockPseudoProbes.size();
-    if (NBlockPseudoProbes == 0) {
-      LogPseudoProbeBlockMatchFail(
-        "BOLT-WARNING: no pseudo probes in profile block\n");
+    if (LogErrIfExpr(NBlockPseudoProbes == 0,
+                     "BOLT-WARNING: no pseudo probes in profile block\n"))
       return nullptr;
-    }
-    if (BNBlockPseudoProbes > 1) {
-      LogPseudoProbeBlockMatchFail(
-        "BOLT-WARNING: more than 1 pseudo probes in profile block\n");
+    if (LogErrIfExpr(
+            NBlockPseudoProbes > 1,
+            "BOLT-WARNING: more than 1 pseudo probes in profile block\n"))
       return nullptr;
-    }
     uint64_t Index = BlockPseudoProbes[0]->Index;
     auto It = IndexToBBPseudoProbes.find(Index);
-    if (It == IndexToBBPseudoProbes.end()) {
-      LogPseudoProbeBlockMatchFail(
-        "BOLT-WARNING: no block pseudo probes found within BB at index\n");
+    if (LogErrIfExpr(
+            It == IndexToBBPseudoProbes.end(),
+            "BOLT-WARNING: no block pseudo probes found within BB at index\n"))
       return nullptr;
-    }
-    if (It->second.size() > 1) {
-      LogPseudoProbeBlockMatchFail(
-        "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n");
+    if (LogErrIfExpr(
+            It->second.size() > 1,
+            "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n"))
       return nullptr;
-    }
     const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0];
     auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe);
     assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() &&

>From 4f2f64211c3aabe740e013e5d1be0dc6771f7f60 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Thu, 25 Jul 2024 10:02:04 -0700
Subject: [PATCH 22/39] Added inlined block pseudo probe matching

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 142 ++++++++++++++++------
 1 file changed, 105 insertions(+), 37 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 9d2584767c478f..fd9be5f44fe61a 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -212,9 +212,15 @@ class StaleMatcher {
     }
   }
 
+  /// Creates a mapping from a inlined pseudo probe's guid and index to probe.
+  void mapGUIDAndIndexToProbe(uint64_t Guid, uint64_t Index,
+                              const MCDecodedPseudoProbe *Probe) {
+    IndexAndGUIDToInlinedProbes[Guid][Index].push_back(Probe);
+  }
+
   /// Creates a mapping from a pseudo probe index to pseudo probe.
   void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) {
-    IndexToBBPseudoProbes[Index].push_back(Probe);
+    IndexToProbes[Index].push_back(Probe);
   }
 
   /// Creates a mapping from a pseudo probe to a flow block.
@@ -267,8 +273,10 @@ class StaleMatcher {
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
   std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
-  DenseMap<uint64_t, std::vector<const MCDecodedPseudoProbe *>>
-      IndexToBBPseudoProbes;
+  DenseMap<uint64_t, std::vector<const MCDecodedPseudoProbe *>> IndexToProbes;
+  DenseMap<uint64_t,
+           DenseMap<uint64_t, std::vector<const MCDecodedPseudoProbe *>>>
+      IndexAndGUIDToInlinedProbes;
   DenseMap<const MCDecodedPseudoProbe *, FlowBlock *> BBPseudoProbeToBlock;
   DenseSet<uint64_t> MatchedWithPseudoProbes;
   const uint64_t YamlBFGUID{0};
@@ -312,40 +320,68 @@ class StaleMatcher {
     }
     return BestBlock;
   }
-  // Uses pseudo probe information to attach the profile to the appropriate
-  // block.
-  const FlowBlock *matchWithPseudoProbes(
-      BlendedBlockHash BlendedHash,
-      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
-    if (!YamlBFGUID)
-      return nullptr;
 
+  /// A helper function for logging.
+  static bool LogErrIfExpr(bool Expr, std::string Message) {
+    if (Expr)
+      errs() << Message;
+    return Expr;
+  }
+
+  /// Matches an inlined profile block with an inlined binary block based on
+  /// pseudo probes.
+  const FlowBlock *matchWithInlinedBlockPseudoProbes(
+      SmallVector<const yaml::bolt::PseudoProbeInfo *>
+          &InlinedBlockPseudoProbes) const {
     if (opts::Verbosity >= 3)
-      outs() << "BOLT-INFO: attempting to match block with pseudo probes\n";
+      outs() << "BOLT-INFO: attempting to match block with inlined block "
+                "pseudo probes\n";
 
-    // Searches for the pseudo probe attached to the matched function's block,
-    // ignoring pseudo probes attached to function calls and inlined functions'
-    // blocks.
-    SmallVector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
-    for (const auto &PseudoProbe : PseudoProbes) {
-      // Ensures that pseudo probe information belongs to the appropriate
-      // function and not an inlined function.
-      if (PseudoProbe.GUID != YamlBFGUID)
-        continue;
-      // Skips pseudo probes attached to function calls.
-      if (PseudoProbe.Type != static_cast<uint8_t>(PseudoProbeType::Block))
-        continue;
+    size_t NInlinedBlockPseudoProbes = InlinedBlockPseudoProbes.size();
+    if (LogErrIfExpr(NInlinedBlockPseudoProbes == 0,
+                     "BOLT-WARNING: no pseudo probes in profile block\n"))
+      return nullptr;
+    if (LogErrIfExpr(
+            NInlinedBlockPseudoProbes > 1,
+            "BOLT-WARNING: more than 1 pseudo probes in profile block\n"))
+      return nullptr;
 
-      BlockPseudoProbes.push_back(&PseudoProbe);
-    }
+    const auto *InlinedPseudoProbe = InlinedBlockPseudoProbes[0];
+    uint64_t Guid = InlinedPseudoProbe->GUID;
+    uint64_t Index = InlinedPseudoProbe->Index;
+
+    auto GuidIt = IndexAndGUIDToInlinedProbes.find(Guid);
+    if (LogErrIfExpr(
+            GuidIt == IndexAndGUIDToInlinedProbes.end(),
+            "BOLT-WARNING: no pseudo probes found within BB at index\n"))
+      return nullptr;
+    auto IndexIt = GuidIt->second.find(Index);
+    if (LogErrIfExpr(
+            IndexIt == GuidIt->second.end(),
+            "BOLT-WARNING: no pseudo probes found within BB at index\n"))
+      return nullptr;
+
+    if (LogErrIfExpr(
+            IndexIt->second.size() > 1,
+            "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n"))
+      return nullptr;
+
+    const MCDecodedPseudoProbe *BinaryPseudoProbe = IndexIt->second[0];
+    auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe);
+    assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() &&
+           "All binary pseudo probes should belong a binary basic block");
+
+    return BinaryPseudoProbeIt->second;
+  }
+
+  /// Matches a profile block with an binary block based on pseudo probes.
+  const FlowBlock *matchWithNonInlinedBlockPseudoProbes(
+      SmallVector<const yaml::bolt::PseudoProbeInfo *> &BlockPseudoProbes)
+      const {
+    if (opts::Verbosity >= 3)
+      outs() << "BOLT-INFO: attempting to match block with inlined block "
+                "pseudo probes\n";
 
-    auto LogErrIfExpr = [&](bool Expr, std::string Message) -> bool {
-      if (Expr)
-        errs() << Message;
-      return Expr;
-    };
-    // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo
-    // probe and binary pseudo probe.
     size_t NBlockPseudoProbes = BlockPseudoProbes.size();
     if (LogErrIfExpr(NBlockPseudoProbes == 0,
                      "BOLT-WARNING: no pseudo probes in profile block\n"))
@@ -355,9 +391,9 @@ class StaleMatcher {
             "BOLT-WARNING: more than 1 pseudo probes in profile block\n"))
       return nullptr;
     uint64_t Index = BlockPseudoProbes[0]->Index;
-    auto It = IndexToBBPseudoProbes.find(Index);
+    auto It = IndexToProbes.find(Index);
     if (LogErrIfExpr(
-            It == IndexToBBPseudoProbes.end(),
+            It == IndexToProbes.end(),
             "BOLT-WARNING: no block pseudo probes found within BB at index\n"))
       return nullptr;
     if (LogErrIfExpr(
@@ -371,6 +407,36 @@ class StaleMatcher {
 
     return BinaryPseudoProbeIt->second;
   }
+
+  /// Uses pseudo probe information to attach the profile to the appropriate
+  /// block.
+  const FlowBlock *matchWithPseudoProbes(
+      BlendedBlockHash BlendedHash,
+      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
+    if (!YamlBFGUID)
+      return nullptr;
+
+    // Searches for the pseudo probe attached to the matched function's block.
+    SmallVector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
+    SmallVector<const yaml::bolt::PseudoProbeInfo *> InlinedBlockPseudoProbes;
+    for (const auto &PseudoProbe : PseudoProbes) {
+      // Skips pseudo probes attached to function calls.
+      if (PseudoProbe.Type != static_cast<uint8_t>(PseudoProbeType::Block))
+        continue;
+      if (PseudoProbe.GUID != YamlBFGUID)
+        InlinedBlockPseudoProbes.push_back(&PseudoProbe);
+      else
+        BlockPseudoProbes.push_back(&PseudoProbe);
+    }
+
+    // Returns nullptr if there is not a 1:1 mapping of the profile block pseudo
+    // probe and a binary block pseudo probe.
+    const FlowBlock *MatchedInlinedBlock =
+        matchWithInlinedBlockPseudoProbes(InlinedBlockPseudoProbes);
+    return MatchedInlinedBlock
+               ? MatchedInlinedBlock
+               : matchWithNonInlinedBlockPseudoProbes(BlockPseudoProbes);
+  }
 };
 
 void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
@@ -616,11 +682,13 @@ size_t matchWeightsByHashes(
                            ProbeMap.lower_bound(FuncAddr + BlockRange.second));
       for (const auto &[_, Probes] : BlockProbes) {
         for (const MCDecodedPseudoProbe &Probe : Probes) {
-          if (Probe.getInlineTreeNode()->hasInlineSite())
-            continue;
           if (Probe.getType() != static_cast<uint8_t>(PseudoProbeType::Block))
             continue;
-          Matcher.mapIndexToProbe(Probe.getIndex(), &Probe);
+          if (Probe.getInlineTreeNode()->hasInlineSite())
+            Matcher.mapGUIDAndIndexToProbe(Probe.getGuid(), Probe.getIndex(),
+                                           &Probe);
+          else
+            Matcher.mapIndexToProbe(Probe.getIndex(), &Probe);
           Matcher.mapProbeToBB(&Probe, Blocks[I]);
         }
       }

>From 327eb81f46912cf3a52f3228d6a40ba8925e7fa0 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Thu, 25 Jul 2024 10:31:56 -0700
Subject: [PATCH 23/39] Added flag to trigger pseudo probe block matching

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp          | 10 ++++++++--
 bolt/test/X86/match-blocks-with-pseudo-probes.test |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index fd9be5f44fe61a..99c1523ee71630 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -118,6 +118,11 @@ cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
         "The cost of increasing an unknown fall-through jump count by one."),
     cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));
 
+cl::opt<bool> StaleMatchingWithBlockPseudoProbes(
+    "stale-matching-with-block-pseudo-probes",
+    cl::desc("Turns on stale matching with block pseudo probes."), cl::init(3),
+    cl::ReallyHidden, cl::cat(BoltOptCategory));
+
 } // namespace opts
 
 namespace llvm {
@@ -413,7 +418,7 @@ class StaleMatcher {
   const FlowBlock *matchWithPseudoProbes(
       BlendedBlockHash BlendedHash,
       const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
-    if (!YamlBFGUID)
+    if (!opts::StaleMatchingWithBlockPseudoProbes || !YamlBFGUID)
       return nullptr;
 
     // Searches for the pseudo probe attached to the matched function's block.
@@ -671,7 +676,8 @@ size_t matchWeightsByHashes(
     BlendedBlockHash BlendedHash(BB->getHash());
     BlendedHashes.push_back(BlendedHash);
     // Collects pseudo probes attached to the BB for use in the StaleMatcher.
-    if (opts::ProfileUsePseudoProbes && PseudoProbeDecoder) {
+    if (opts::ProfileUsePseudoProbes &&
+        opts::StaleMatchingWithBlockPseudoProbes && PseudoProbeDecoder) {
       const AddressProbesMap &ProbeMap =
           PseudoProbeDecoder->getAddress2ProbesMap();
       const uint64_t FuncAddr = BF.getAddress();
diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test
index 9b73117271b55c..9bb1334876e090 100644
--- a/bolt/test/X86/match-blocks-with-pseudo-probes.test
+++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
 # RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \
-# RUN:   --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile --profile-use-pseudo-probes 2>&1 | FileCheck %s
+# RUN:   --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile --profile-use-pseudo-probes --stale-matching-with-block-pseudo-probes 2>&1 | FileCheck %s
 
 # CHECK: BOLT-INFO: inference found a pseudo probe match for 100.00% of basic blocks (1 out of 1 stale) responsible for -nan% samples (0 out of 0 stale)
 

>From 37793aaa4c371cda1201c5d18f9bc233df1895dd Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Thu, 25 Jul 2024 14:30:54 -0700
Subject: [PATCH 24/39] Added flag for pseudo probe block matching

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 99c1523ee71630..c5558bf923ea0f 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -433,7 +433,6 @@ class StaleMatcher {
       else
         BlockPseudoProbes.push_back(&PseudoProbe);
     }
-
     // Returns nullptr if there is not a 1:1 mapping of the profile block pseudo
     // probe and a binary block pseudo probe.
     const FlowBlock *MatchedInlinedBlock =
@@ -636,9 +635,12 @@ size_t matchWeightsByHashes(
   // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for
   // pseudo probe block matching.
   const MCPseudoProbeDecoder *PseudoProbeDecoder =
-      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
+      opts::ProfileUsePseudoProbes && opts::StaleMatchingWithBlockPseudoProbes
+          ? BC.getPseudoProbeDecoder()
+          : nullptr;
   uint64_t BFPseudoProbeDescHash = 0;
-  if (opts::ProfileUsePseudoProbes && BF.getGUID() != 0) {
+  if (opts::ProfileUsePseudoProbes &&
+      opts::StaleMatchingWithBlockPseudoProbes && BF.getGUID() != 0) {
     assert(PseudoProbeDecoder &&
            "If BF has pseudo probe, BC should have a pseudo probe decoder");
     auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap();
@@ -646,13 +648,8 @@ size_t matchWeightsByHashes(
     if (It != GUID2FuncDescMap.end())
       BFPseudoProbeDescHash = It->second.FuncHash;
   }
-  uint64_t YamlBFGUID =
-      BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash &&
-              BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash
-          ? static_cast<uint64_t>(YamlBF.GUID)
-          : 0;
 
-  StaleMatcher Matcher(YamlBFGUID);
+  StaleMatcher Matcher(YamlBF.GUID);
   std::vector<uint64_t> CallHashes;
   std::vector<FlowBlock *> Blocks;
   std::vector<BlendedBlockHash> BlendedHashes;
@@ -677,7 +674,11 @@ size_t matchWeightsByHashes(
     BlendedHashes.push_back(BlendedHash);
     // Collects pseudo probes attached to the BB for use in the StaleMatcher.
     if (opts::ProfileUsePseudoProbes &&
-        opts::StaleMatchingWithBlockPseudoProbes && PseudoProbeDecoder) {
+        opts::StaleMatchingWithBlockPseudoProbes && BFPseudoProbeDescHash &&
+        YamlBF.PseudoProbeDescHash &&
+        BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash) {
+      assert(PseudoProbeDecoder &&
+             "If pseudo probes are in use, psuedo probe decoder should exist");
       const AddressProbesMap &ProbeMap =
           PseudoProbeDecoder->getAddress2ProbesMap();
       const uint64_t FuncAddr = BF.getAddress();

>From ba00b22aaec16f3b50f7b5e084e1c8683221ba20 Mon Sep 17 00:00:00 2001
From: shawbyoung <shawbyoung at gmail.com>
Date: Thu, 25 Jul 2024 14:38:29 -0700
Subject: [PATCH 25/39] Set flag init val, changed std::string to StringRef

Created using spr 1.3.4
---
 bolt/lib/Profile/StaleProfileMatching.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index c5558bf923ea0f..ef9320ae168fe7 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -120,8 +120,8 @@ cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
 
 cl::opt<bool> StaleMatchingWithBlockPseudoProbes(
     "stale-matching-with-block-pseudo-probes",
-    cl::desc("Turns on stale matching with block pseudo probes."), cl::init(3),
-    cl::ReallyHidden, cl::cat(BoltOptCategory));
+    cl::desc("Turns on stale matching with block pseudo probes."),
+    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));
 
 } // namespace opts
 
@@ -327,7 +327,7 @@ class StaleMatcher {
   }
 
   /// A helper function for logging.
-  static bool LogErrIfExpr(bool Expr, std::string Message) {
+  static bool LogErrIfExpr(bool Expr, StringRef Message) {
     if (Expr)
       errs() << Message;
     return Expr;

>From 5e47249c00c6f0825c19496e628d1f31d56894c9 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 31 Jul 2024 22:12:34 -0700
Subject: [PATCH 26/39] [BOLT][NFC] Add timers for MetadataManager invocations

Test Plan: added bolt/test/timers.c

Reviewers: ayermolo, maksfb, rafaelauler, dcci

Reviewed By: dcci

Pull Request: https://github.com/llvm/llvm-project/pull/101267
---
 bolt/lib/Rewrite/RewriteInstance.cpp | 10 ++++++++++
 bolt/test/timers.c                   | 15 +++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 bolt/test/timers.c

diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 33ebae3b6e6de2..b7e361c35088a2 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -3131,18 +3131,24 @@ void RewriteInstance::initializeMetadataManager() {
 }
 
 void RewriteInstance::processSectionMetadata() {
+  NamedRegionTimer T("processmetadata-section", "process section metadata",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   initializeMetadataManager();
 
   MetadataManager.runSectionInitializers();
 }
 
 void RewriteInstance::processMetadataPreCFG() {
+  NamedRegionTimer T("processmetadata-precfg", "process metadata pre-CFG",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runInitializersPreCFG();
 
   processProfileDataPreCFG();
 }
 
 void RewriteInstance::processMetadataPostCFG() {
+  NamedRegionTimer T("processmetadata-postcfg", "process metadata post-CFG",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runInitializersPostCFG();
 }
 
@@ -3536,10 +3542,14 @@ void RewriteInstance::emitAndLink() {
 }
 
 void RewriteInstance::finalizeMetadataPreEmit() {
+  NamedRegionTimer T("finalizemetadata-preemit", "finalize metadata pre-emit",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runFinalizersPreEmit();
 }
 
 void RewriteInstance::updateMetadata() {
+  NamedRegionTimer T("updatemetadata-postemit", "update metadata post-emit",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
   MetadataManager.runFinalizersAfterEmit();
 
   if (opts::UpdateDebugSections) {
diff --git a/bolt/test/timers.c b/bolt/test/timers.c
new file mode 100644
index 00000000000000..b16218dd7ea76d
--- /dev/null
+++ b/bolt/test/timers.c
@@ -0,0 +1,15 @@
+/* This test checks timers for metadata manager phases.
+# RUN: %clang %cflags %s -o %t.exe
+# RUN: link_fdata %s %t.exe %t.fdata
+# RUN: llvm-bolt %t.exe -o %t.null --data %t.fdata -w %t.yaml --time-rewrite \
+# RUN:   2>&1 | FileCheck %s
+
+# CHECK-DAG: update metadata post-emit
+# CHECK-DAG: process section metadata
+# CHECK-DAG: process metadata pre-CFG
+# CHECK-DAG: process metadata post-CFG
+# CHECK-DAG: finalize metadata pre-emit
+
+# FDATA: 0 [unknown] 0 1 main 0 1 0
+*/
+int main() { return 0; }

>From 3902effbfc181bdac5e2131e8583dca99a33d573 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Mon, 26 Aug 2024 09:05:34 -0700
Subject: [PATCH 27/39] [MC][NFC] Count pseudo probes and function records
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-parse the pseudo probes section, counting the number of probes and
function records. These numbers are used in a follow-up diff to
pre-allocate vectors for decoded probes and inline tree nodes.

An additional benefit is avoiding error handling during parsing.

This pre-parsing is fast: for a 404MiB .pseudo_probe section with
43373881 probes and 25228770 function records, it only takes 0.68±0.01s.
The total time of buildAddress2ProbeMap is 21s.

Reviewers: dcci, maksfb, rafaelauler, wlei-llvm, ayermolo

Reviewed By: wlei-llvm

Pull Request: https://github.com/llvm/llvm-project/pull/102774
---
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp |   1 -
 llvm/include/llvm/MC/MCPseudoProbe.h     |   6 +
 llvm/lib/MC/MCPseudoProbe.cpp            | 143 +++++++++++++++++------
 3 files changed, 113 insertions(+), 37 deletions(-)

diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 886bbdbf9d686e..37a5b937ebcaa3 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -143,7 +143,6 @@ void PseudoProbeRewriter::parsePseudoProbe() {
   if (!ProbeDecoder.buildAddress2ProbeMap(
           reinterpret_cast<const uint8_t *>(Contents.data()), Contents.size(),
           GuidFilter, FuncStartAddrs)) {
-    ProbeDecoder.getAddress2ProbesMap().clear();
     errs() << "BOLT-WARNING: fail in building Address2ProbeMap\n";
     return;
   }
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 5344dea4141b3e..44692bb183d5a4 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -371,6 +371,12 @@ class MCPseudoProbeDecoder {
   // Decode pseudo_probe_desc section to build GUID to PseudoProbeFuncDesc map.
   bool buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size);
 
+  // Decode pseudo_probe section to count the number of probes and inlined
+  // function records for each function record.
+  template <bool IsTopLevelFunc>
+  bool countRecords(bool &Discard, uint32_t &ProbeCount, uint32_t &InlinedCount,
+                    const Uint64Set &GuidFilter);
+
   // Decode pseudo_probe section to build address to probes map for specifed
   // functions only.
   bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size,
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index a5a030e19b849e..21c6c4f766c30f 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -18,6 +18,7 @@
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/raw_ostream.h"
@@ -429,17 +430,11 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
     Index = Cur->getChildren().size();
   } else {
     // Read inline site for inlinees
-    auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
-    if (!ErrorOrIndex)
-      return false;
-    Index = std::move(*ErrorOrIndex);
+    Index = cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
   }
 
   // Read guid
-  auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
-  if (!ErrorOrCurGuid)
-    return false;
-  uint64_t Guid = std::move(*ErrorOrCurGuid);
+  uint64_t Guid = cantFail(errorOrToExpected(readUnencodedNumber<uint64_t>()));
 
   // Decide if top-level node should be disgarded.
   if (IsTopLevelFunc && !GuidFilter.empty() && !GuidFilter.count(Guid))
@@ -457,41 +452,27 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
   }
 
   // Read number of probes in the current node.
-  auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
-  if (!ErrorOrNodeCount)
-    return false;
-  uint32_t NodeCount = std::move(*ErrorOrNodeCount);
+  uint32_t NodeCount =
+      cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
   // Read number of direct inlinees
-  auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
-  if (!ErrorOrCurChildrenToProcess)
-    return false;
+  uint32_t ChildrenToProcess =
+      cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
   // Read all probes in this node
   for (std::size_t I = 0; I < NodeCount; I++) {
     // Read index
-    auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
-    if (!ErrorOrIndex)
-      return false;
-    uint32_t Index = std::move(*ErrorOrIndex);
+    uint32_t Index =
+        cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
     // Read type | flag.
-    auto ErrorOrValue = readUnencodedNumber<uint8_t>();
-    if (!ErrorOrValue)
-      return false;
-    uint8_t Value = std::move(*ErrorOrValue);
+    uint8_t Value = cantFail(errorOrToExpected(readUnencodedNumber<uint8_t>()));
     uint8_t Kind = Value & 0xf;
     uint8_t Attr = (Value & 0x70) >> 4;
     // Read address
     uint64_t Addr = 0;
     if (Value & 0x80) {
-      auto ErrorOrOffset = readSignedNumber<int64_t>();
-      if (!ErrorOrOffset)
-        return false;
-      int64_t Offset = std::move(*ErrorOrOffset);
+      int64_t Offset = cantFail(errorOrToExpected(readSignedNumber<int64_t>()));
       Addr = LastAddr + Offset;
     } else {
-      auto ErrorOrAddr = readUnencodedNumber<int64_t>();
-      if (!ErrorOrAddr)
-        return false;
-      Addr = std::move(*ErrorOrAddr);
+      Addr = cantFail(errorOrToExpected(readUnencodedNumber<int64_t>()));
       if (isSentinelProbe(Attr)) {
         // For sentinel probe, the addr field actually stores the GUID of the
         // split function. Convert it to the real address.
@@ -508,10 +489,8 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
 
     uint32_t Discriminator = 0;
     if (hasDiscriminator(Attr)) {
-      auto ErrorOrDiscriminator = readUnsignedNumber<uint32_t>();
-      if (!ErrorOrDiscriminator)
-        return false;
-      Discriminator = std::move(*ErrorOrDiscriminator);
+      Discriminator =
+          cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
     }
 
     if (Cur && !isSentinelProbe(Attr)) {
@@ -524,17 +503,109 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
     LastAddr = Addr;
   }
 
-  uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
   for (uint32_t I = 0; I < ChildrenToProcess; I++) {
     buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs);
   }
+  return true;
+}
+
+template <bool IsTopLevelFunc>
+bool MCPseudoProbeDecoder::countRecords(bool &Discard, uint32_t &ProbeCount,
+                                        uint32_t &InlinedCount,
+                                        const Uint64Set &GuidFilter) {
+  if (!IsTopLevelFunc)
+    // Read inline site for inlinees
+    if (!readUnsignedNumber<uint32_t>())
+      return false;
+
+  // Read guid
+  auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
+  if (!ErrorOrCurGuid)
+    return false;
+  uint64_t Guid = std::move(*ErrorOrCurGuid);
+
+  // Decide if top-level node should be disgarded.
+  if (IsTopLevelFunc) {
+    Discard = !GuidFilter.empty() && !GuidFilter.count(Guid);
+    if (!Discard)
+      // Allocate an entry for top-level function record.
+      ++InlinedCount;
+  }
+
+  // Read number of probes in the current node.
+  auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
+  if (!ErrorOrNodeCount)
+    return false;
+  uint32_t NodeCount = std::move(*ErrorOrNodeCount);
+  uint32_t CurrentProbeCount = 0;
+
+  // Read number of direct inlinees
+  auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
+  if (!ErrorOrCurChildrenToProcess)
+    return false;
+  uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
+
+  // Read all probes in this node
+  for (std::size_t I = 0; I < NodeCount; I++) {
+    // Read index
+    if (!readUnsignedNumber<uint32_t>())
+      return false;
+
+    // Read type | flag.
+    auto ErrorOrValue = readUnencodedNumber<uint8_t>();
+    if (!ErrorOrValue)
+      return false;
+    uint8_t Value = std::move(*ErrorOrValue);
+
+    uint8_t Attr = (Value & 0x70) >> 4;
+    if (Value & 0x80) {
+      // Offset
+      if (!readSignedNumber<int64_t>())
+        return false;
+    } else {
+      // Addr
+      if (!readUnencodedNumber<int64_t>())
+        return false;
+    }
+
+    if (hasDiscriminator(Attr))
+      // Discriminator
+      if (!readUnsignedNumber<uint32_t>())
+        return false;
+
+    if (!Discard && !isSentinelProbe(Attr))
+      ++CurrentProbeCount;
+  }
 
+  if (!Discard) {
+    ProbeCount += CurrentProbeCount;
+    InlinedCount += ChildrenToProcess;
+  }
+
+  for (uint32_t I = 0; I < ChildrenToProcess; I++)
+    if (!countRecords<false>(Discard, ProbeCount, InlinedCount, GuidFilter))
+      return false;
   return true;
 }
 
 bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
     const uint8_t *Start, std::size_t Size, const Uint64Set &GuidFilter,
     const Uint64Map &FuncStartAddrs) {
+  // For function records in the order of their appearance in the encoded data
+  // (DFS), count the number of contained probes and inlined function records.
+  uint32_t ProbeCount = 0;
+  uint32_t InlinedCount = 0;
+  uint32_t TopLevelFuncs = 0;
+  Data = Start;
+  End = Data + Size;
+  bool Discard = false;
+  while (Data < End) {
+    if (!countRecords<true>(Discard, ProbeCount, InlinedCount, GuidFilter))
+      return false;
+    TopLevelFuncs += !Discard;
+  }
+  assert(Data == End && "Have unprocessed data in pseudo_probe section");
+
   Data = Start;
   End = Data + Size;
   uint64_t LastAddr = 0;

>From d20d4d6598c3546be964a2df638f4418645bc0b7 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Thu, 25 Jul 2024 16:36:43 -0700
Subject: [PATCH 28/39] [MC][NFC] Drop unused
 MCDecodedPseudoProbeInlineTree::ChildrenToProcess (#100576)

The usage was removed in 3f97016857b0305294f3a55ea220884fb50ce033.

Results in a slight peak RSS reduction in
`perf2bolt --profile-use-pseudo-probes` from 17.24 to 16.85 GiB.
---
 llvm/include/llvm/MC/MCPseudoProbe.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 44692bb183d5a4..f3539b23b8a358 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -280,8 +280,6 @@ class MCDecodedPseudoProbeInlineTree
                                          MCDecodedPseudoProbeInlineTree> {
 public:
   InlineSite ISite;
-  // Used for decoding
-  uint32_t ChildrenToProcess = 0;
 
   MCDecodedPseudoProbeInlineTree() = default;
   MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){};

>From a857d324de090fe9723e999eadc6cb29d8141a93 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Sat, 10 Aug 2024 23:45:47 -0700
Subject: [PATCH 29/39] [profgen][NFC] Pass parameter as const_ref

Pass `ProbeNode` parameter of `trackInlineesOptimizedAway` as const
reference.

Reviewers: wlei-llvm, WenleiHe

Reviewed By: WenleiHe

Pull Request: https://github.com/llvm/llvm-project/pull/102787
---
 llvm/include/llvm/MC/MCPseudoProbe.h       | 1 +
 llvm/tools/llvm-profgen/ProfiledBinary.cpp | 3 ++-
 llvm/tools/llvm-profgen/ProfiledBinary.h   | 7 ++++---
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index f3539b23b8a358..3dd10c0717679b 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -241,6 +241,7 @@ class MCPseudoProbeInlineTreeBase {
   InlinedProbeTreeMap &getChildren() { return Children; }
   const InlinedProbeTreeMap &getChildren() const { return Children; }
   std::vector<ProbeType> &getProbes() { return Probes; }
+  const std::vector<ProbeType> &getProbes() const { return Probes; }
   void addProbes(ProbeType Probe) { Probes.push_back(Probe); }
   // Caller node of the inline site
   MCPseudoProbeInlineTreeBase<ProbeType, DerivedProbeInlineTreeType> *Parent =
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index 632ddc7b50f54a..574a9c9f52bf18 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -137,7 +137,8 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway(
 
 void BinarySizeContextTracker::trackInlineesOptimizedAway(
     MCPseudoProbeDecoder &ProbeDecoder,
-    MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) {
+    const MCDecodedPseudoProbeInlineTree &ProbeNode,
+    ProbeFrameStack &ProbeContext) {
   StringRef FuncName =
       ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName;
   ProbeContext.emplace_back(FuncName, 0);
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index f2eeca45454592..0588cb48b2af62 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -167,9 +167,10 @@ class BinarySizeContextTracker {
   void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder);
 
   using ProbeFrameStack = SmallVector<std::pair<StringRef, uint32_t>>;
-  void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder,
-                                  MCDecodedPseudoProbeInlineTree &ProbeNode,
-                                  ProbeFrameStack &Context);
+  void
+  trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder,
+                             const MCDecodedPseudoProbeInlineTree &ProbeNode,
+                             ProbeFrameStack &Context);
 
   void dump() { RootContext.dumpTree(); }
 

>From cddea6a015b94140e96dee4d0fa902f8536c0a81 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Mon, 26 Aug 2024 09:09:13 -0700
Subject: [PATCH 30/39] [MC][NFC] Statically allocate storage for decoded
 pseudo probes and function records

Use #102774 to allocate storage for decoded probes (`PseudoProbeVec`)
and function records (`InlineTreeVec`).

Leverage that to also shrink sizes of `MCDecodedPseudoProbe`:
- Drop Guid since it's accessible via `InlineTree`.

`MCDecodedPseudoProbeInlineTree`:
- Keep track of probes and inlinees using `ArrayRef`s now that probes
  and function records belonging to the same function are allocated
  contiguously.

This reduces peak RSS from 13.7 GiB to 9.7 GiB and pseudo probe parsing
time (as part of perf2bolt) from 15.3s to 9.6s for a large binary with
400MiB .pseudo_probe section containing 43M probes and 25M function
records.

Depends on:
#102774
#102787
#102788

Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm

Reviewed By: wlei-llvm

Pull Request: https://github.com/llvm/llvm-project/pull/102789
---
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp     |  30 ++--
 llvm/include/llvm/MC/MCPseudoProbe.h         | 136 +++++++++++++------
 llvm/lib/MC/MCPseudoProbe.cpp                |  58 +++++---
 llvm/tools/llvm-profgen/ProfileGenerator.cpp |   6 +-
 llvm/tools/llvm-profgen/ProfiledBinary.cpp   |  10 +-
 5 files changed, 164 insertions(+), 76 deletions(-)

diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 37a5b937ebcaa3..9677530919b90d 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -200,7 +200,9 @@ void PseudoProbeRewriter::updatePseudoProbes() {
     }
 
     unsigned ProbeTrack = AP.second.size();
-    std::list<MCDecodedPseudoProbe>::iterator Probe = AP.second.begin();
+    auto Probe = llvm::map_iterator(
+        AP.second.begin(),
+        [](auto RW) -> MCDecodedPseudoProbe & { return RW.get(); });
     while (ProbeTrack != 0) {
       if (Probe->isBlock()) {
         Probe->setAddress(BlkOutputAddress);
@@ -218,9 +220,7 @@ void PseudoProbeRewriter::updatePseudoProbes() {
         }
 
         while (CallOutputAddress != CallOutputAddresses.second) {
-          AP.second.push_back(*Probe);
-          AP.second.back().setAddress(CallOutputAddress->second);
-          Probe->getInlineTreeNode()->addProbes(&(AP.second.back()));
+          ProbeDecoder.addInjectedProbe(*Probe, CallOutputAddress->second);
           CallOutputAddress = std::next(CallOutputAddress);
         }
       }
@@ -332,7 +332,7 @@ void PseudoProbeRewriter::encodePseudoProbes() {
       ProbeDecoder.getDummyInlineRoot();
   for (auto Child = Root.getChildren().begin();
        Child != Root.getChildren().end(); ++Child)
-    Inlinees[Child->first] = Child->second.get();
+    Inlinees[Child->getInlineSite()] = &*Child;
 
   for (auto Inlinee : Inlinees)
     // INT64_MAX is "placeholder" of unused callsite index field in the pair
@@ -358,25 +358,37 @@ void PseudoProbeRewriter::encodePseudoProbes() {
     EmitInt(Cur->Guid, 8);
     // Emit number of probes in this node
     uint64_t Deleted = 0;
-    for (MCDecodedPseudoProbe *&Probe : Cur->getProbes())
+    for (MCDecodedPseudoProbe *&Probe :
+         llvm::make_pointer_range(Cur->getProbes()))
       if (Probe->getAddress() == INT64_MAX)
         Deleted++;
     LLVM_DEBUG(dbgs() << "Deleted Probes:" << Deleted << "\n");
-    uint64_t ProbesSize = Cur->getProbes().size() - Deleted;
+    size_t InjectedProbes = ProbeDecoder.getNumInjectedProbes(Cur);
+    uint64_t ProbesSize = Cur->getProbes().size() - Deleted + InjectedProbes;
     EmitULEB128IntValue(ProbesSize);
     // Emit number of direct inlinees
     EmitULEB128IntValue(Cur->getChildren().size());
     // Emit probes in this group
-    for (MCDecodedPseudoProbe *&Probe : Cur->getProbes()) {
+    for (MCDecodedPseudoProbe *&Probe :
+         llvm::make_pointer_range(Cur->getProbes())) {
       if (Probe->getAddress() == INT64_MAX)
         continue;
       EmitDecodedPseudoProbe(Probe);
       LastProbe = Probe;
     }
+    if (InjectedProbes) {
+      for (MCDecodedPseudoProbe *&Probe :
+           llvm::make_pointer_range(ProbeDecoder.getInjectedProbes(Cur))) {
+        if (Probe->getAddress() == INT64_MAX)
+          continue;
+        EmitDecodedPseudoProbe(Probe);
+        LastProbe = Probe;
+      }
+    }
 
     for (auto Child = Cur->getChildren().begin();
          Child != Cur->getChildren().end(); ++Child)
-      Inlinees[Child->first] = Child->second.get();
+      Inlinees[Child->getInlineSite()] = &*Child;
     for (const auto &Inlinee : Inlinees) {
       assert(Cur->Guid != 0 && "non root tree node must have nonzero Guid");
       NextNodes.push_back({std::get<1>(Inlinee.first), Inlinee.second});
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 3dd10c0717679b..66ad9db4860d8a 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -54,20 +54,21 @@
 #ifndef LLVM_MC_MCPSEUDOPROBE_H
 #define LLVM_MC_MCPSEUDOPROBE_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator.h"
 #include "llvm/IR/PseudoProbe.h"
 #include "llvm/Support/ErrorOr.h"
-#include <list>
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <type_traits>
 #include <unordered_map>
-#include <unordered_set>
 #include <vector>
 
 namespace llvm {
@@ -103,14 +104,15 @@ using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>;
 using GUIDProbeFunctionMap =
     std::unordered_map<uint64_t, MCPseudoProbeFuncDesc>;
 // Address to pseudo probes map.
-using AddressProbesMap = std::map<uint64_t, std::list<MCDecodedPseudoProbe>>;
+using AddressProbesMap =
+    std::map<uint64_t,
+             std::vector<std::reference_wrapper<MCDecodedPseudoProbe>>>;
 
 class MCDecodedPseudoProbeInlineTree;
 
 class MCPseudoProbeBase {
 protected:
-  uint64_t Guid;
-  uint64_t Index;
+  uint32_t Index;
   uint32_t Discriminator;
   uint8_t Attributes;
   uint8_t Type;
@@ -120,14 +122,12 @@ class MCPseudoProbeBase {
   const static uint32_t PseudoProbeFirstId = 1;
 
 public:
-  MCPseudoProbeBase(uint64_t G, uint64_t I, uint64_t At, uint8_t T, uint32_t D)
-      : Guid(G), Index(I), Discriminator(D), Attributes(At), Type(T) {}
+  MCPseudoProbeBase(uint64_t I, uint64_t At, uint8_t T, uint32_t D)
+      : Index(I), Discriminator(D), Attributes(At), Type(T) {}
 
   bool isEntry() const { return Index == PseudoProbeFirstId; }
 
-  uint64_t getGuid() const { return Guid; }
-
-  uint64_t getIndex() const { return Index; }
+  uint32_t getIndex() const { return Index; }
 
   uint32_t getDiscriminator() const { return Discriminator; }
 
@@ -157,18 +157,20 @@ class MCPseudoProbeBase {
 /// uses an address from a temporary label created at the current address in the
 /// current section.
 class MCPseudoProbe : public MCPseudoProbeBase {
+  uint64_t Guid;
   MCSymbol *Label;
 
 public:
   MCPseudoProbe(MCSymbol *Label, uint64_t Guid, uint64_t Index, uint64_t Type,
                 uint64_t Attributes, uint32_t Discriminator)
-      : MCPseudoProbeBase(Guid, Index, Attributes, Type, Discriminator),
+      : MCPseudoProbeBase(Index, Attributes, Type, Discriminator), Guid(Guid),
         Label(Label) {
     assert(Type <= 0xFF && "Probe type too big to encode, exceeding 2^8");
     assert(Attributes <= 0xFF &&
            "Probe attributes too big to encode, exceeding 2^16");
   }
 
+  uint64_t getGuid() const { return Guid; };
   MCSymbol *getLabel() const { return Label; }
   void emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const;
 };
@@ -181,11 +183,11 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase {
   MCDecodedPseudoProbeInlineTree *InlineTree;
 
 public:
-  MCDecodedPseudoProbe(uint64_t Ad, uint64_t G, uint32_t I, PseudoProbeType K,
-                       uint8_t At, uint32_t D,
-                       MCDecodedPseudoProbeInlineTree *Tree)
-      : MCPseudoProbeBase(G, I, At, static_cast<uint8_t>(K), D), Address(Ad),
+  MCDecodedPseudoProbe(uint64_t Ad, uint32_t I, PseudoProbeType K, uint8_t At,
+                       uint32_t D, MCDecodedPseudoProbeInlineTree *Tree)
+      : MCPseudoProbeBase(I, At, static_cast<uint8_t>(K), D), Address(Ad),
         InlineTree(Tree){};
+  uint64_t getGuid() const;
 
   uint64_t getAddress() const { return Address; }
 
@@ -211,21 +213,14 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase {
              bool ShowName) const;
 };
 
-template <typename ProbeType, typename DerivedProbeInlineTreeType>
+template <typename ProbesType, typename DerivedProbeInlineTreeType,
+          typename InlinedProbeTreeMap>
 class MCPseudoProbeInlineTreeBase {
-  struct InlineSiteHash {
-    uint64_t operator()(const InlineSite &Site) const {
-      return std::get<0>(Site) ^ std::get<1>(Site);
-    }
-  };
-
 protected:
   // Track children (e.g. inlinees) of current context
-  using InlinedProbeTreeMap = std::unordered_map<
-      InlineSite, std::unique_ptr<DerivedProbeInlineTreeType>, InlineSiteHash>;
   InlinedProbeTreeMap Children;
   // Set of probes that come with the function.
-  std::vector<ProbeType> Probes;
+  ProbesType Probes;
   MCPseudoProbeInlineTreeBase() {
     static_assert(std::is_base_of<MCPseudoProbeInlineTreeBase,
                                   DerivedProbeInlineTreeType>::value,
@@ -240,12 +235,10 @@ class MCPseudoProbeInlineTreeBase {
   bool isRoot() const { return Guid == 0; }
   InlinedProbeTreeMap &getChildren() { return Children; }
   const InlinedProbeTreeMap &getChildren() const { return Children; }
-  std::vector<ProbeType> &getProbes() { return Probes; }
-  const std::vector<ProbeType> &getProbes() const { return Probes; }
-  void addProbes(ProbeType Probe) { Probes.push_back(Probe); }
+  const ProbesType &getProbes() const { return Probes; }
   // Caller node of the inline site
-  MCPseudoProbeInlineTreeBase<ProbeType, DerivedProbeInlineTreeType> *Parent =
-      nullptr;
+  MCPseudoProbeInlineTreeBase<ProbesType, DerivedProbeInlineTreeType,
+                              InlinedProbeTreeMap> *Parent = nullptr;
   DerivedProbeInlineTreeType *getOrAddNode(const InlineSite &Site) {
     auto Ret = Children.emplace(
         Site, std::make_unique<DerivedProbeInlineTreeType>(Site));
@@ -259,9 +252,17 @@ class MCPseudoProbeInlineTreeBase {
 // instance is created as the root of a tree.
 // A real instance of this class is created for each function, either a
 // not inlined function that has code in .text section or an inlined function.
+struct InlineSiteHash {
+  uint64_t operator()(const InlineSite &Site) const {
+    return std::get<0>(Site) ^ std::get<1>(Site);
+  }
+};
 class MCPseudoProbeInlineTree
-    : public MCPseudoProbeInlineTreeBase<MCPseudoProbe,
-                                         MCPseudoProbeInlineTree> {
+    : public MCPseudoProbeInlineTreeBase<
+          std::vector<MCPseudoProbe>, MCPseudoProbeInlineTree,
+          std::unordered_map<InlineSite,
+                             std::unique_ptr<MCPseudoProbeInlineTree>,
+                             InlineSiteHash>> {
 public:
   MCPseudoProbeInlineTree() = default;
   MCPseudoProbeInlineTree(uint64_t Guid) { this->Guid = Guid; }
@@ -277,16 +278,31 @@ class MCPseudoProbeInlineTree
 
 // inline tree node for the decoded pseudo probe
 class MCDecodedPseudoProbeInlineTree
-    : public MCPseudoProbeInlineTreeBase<MCDecodedPseudoProbe *,
-                                         MCDecodedPseudoProbeInlineTree> {
-public:
-  InlineSite ISite;
+    : public MCPseudoProbeInlineTreeBase<
+          MCDecodedPseudoProbe *, MCDecodedPseudoProbeInlineTree,
+          MutableArrayRef<MCDecodedPseudoProbeInlineTree>> {
+  uint32_t NumProbes = 0;
+  uint32_t ProbeId = 0;
 
+public:
   MCDecodedPseudoProbeInlineTree() = default;
-  MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){};
+  MCDecodedPseudoProbeInlineTree(const InlineSite &Site,
+                                 MCDecodedPseudoProbeInlineTree *Parent)
+      : ProbeId(std::get<1>(Site)) {
+    this->Guid = std::get<0>(Site);
+    this->Parent = Parent;
+  }
 
   // Return false if it's a dummy inline site
   bool hasInlineSite() const { return !isRoot() && !Parent->isRoot(); }
+  InlineSite getInlineSite() const { return InlineSite(Guid, ProbeId); }
+  void setProbes(MutableArrayRef<MCDecodedPseudoProbe> ProbesRef) {
+    Probes = ProbesRef.data();
+    NumProbes = ProbesRef.size();
+  }
+  auto getProbes() const {
+    return MutableArrayRef<MCDecodedPseudoProbe>(Probes, NumProbes);
+  }
 };
 
 /// Instances of this class represent the pseudo probes inserted into a compile
@@ -336,6 +352,20 @@ class MCPseudoProbeTable {
 };
 
 class MCPseudoProbeDecoder {
+  // Decoded pseudo probes vector.
+  std::vector<MCDecodedPseudoProbe> PseudoProbeVec;
+  // Injected pseudo probes, identified by the containing inline tree node.
+  // Need to keep injected probes separately for two reasons:
+  // 1) Probes cannot be added to the PseudoProbeVec: appending may cause
+  //    reallocation so that pointers to its elements will become invalid.
+  // 2) Probes belonging to function record must be contiguous in PseudoProbeVec
+  //    as owning InlineTree references them with an ArrayRef to save space.
+  std::unordered_map<const MCDecodedPseudoProbeInlineTree *,
+                     std::vector<MCDecodedPseudoProbe>>
+      InjectedProbeMap;
+  // Decoded inline records vector.
+  std::vector<MCDecodedPseudoProbeInlineTree> InlineTreeVec;
+
   // GUID to PseudoProbeFuncDesc map.
   GUIDProbeFunctionMap GUID2FuncDescMap;
 
@@ -382,10 +412,6 @@ class MCPseudoProbeDecoder {
                              const Uint64Set &GuildFilter,
                              const Uint64Map &FuncStartAddrs);
 
-  bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur,
-                             uint64_t &LastAddr, const Uint64Set &GuildFilter,
-                             const Uint64Map &FuncStartAddrs);
-
   // Print pseudo_probe_desc section info
   void printGUID2FuncDescMap(raw_ostream &OS);
 
@@ -428,6 +454,34 @@ class MCPseudoProbeDecoder {
   const MCDecodedPseudoProbeInlineTree &getDummyInlineRoot() const {
     return DummyInlineRoot;
   }
+
+  void addInjectedProbe(const MCDecodedPseudoProbe &Probe, uint64_t Address) {
+    const MCDecodedPseudoProbeInlineTree *Parent = Probe.getInlineTreeNode();
+    InjectedProbeMap[Parent].emplace_back(Probe).setAddress(Address);
+  }
+
+  size_t
+  getNumInjectedProbes(const MCDecodedPseudoProbeInlineTree *Parent) const {
+    auto It = InjectedProbeMap.find(Parent);
+    if (It == InjectedProbeMap.end())
+      return 0;
+    return It->second.size();
+  }
+
+  auto getInjectedProbes(MCDecodedPseudoProbeInlineTree *Parent) {
+    auto It = InjectedProbeMap.find(Parent);
+    assert(It != InjectedProbeMap.end());
+    return iterator_range(It->second);
+  }
+
+private:
+  // Recursively parse an inlining tree encoded in pseudo_probe section. Returns
+  // whether the top-level node was kept (true) rather than skipped (false).
+  template <bool IsTopLevelFunc>
+  bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur,
+                             uint64_t &LastAddr, const Uint64Set &GuildFilter,
+                             const Uint64Map &FuncStartAddrs,
+                             const uint32_t CurChildIndex);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index 21c6c4f766c30f..3dc443a503f722 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -49,6 +49,8 @@ static const MCExpr *buildSymbolDiff(MCObjectStreamer *MCOS, const MCSymbol *A,
   return AddrDelta;
 }
 
+uint64_t MCDecodedPseudoProbe::getGuid() const { return InlineTree->Guid; }
+
 void MCPseudoProbe::emit(MCObjectStreamer *MCOS,
                          const MCPseudoProbe *LastProbe) const {
   bool IsSentinel = isSentinelProbe(getAttributes());
@@ -289,8 +291,8 @@ void MCDecodedPseudoProbe::getInlineContext(
   // Note that it won't include the probe's belonging function(leaf location)
   while (Cur->hasInlineSite()) {
     StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Cur->Parent->Guid);
-    ContextStack.emplace_back(
-        MCPseudoProbeFrameLocation(FuncName, std::get<1>(Cur->ISite)));
+    ContextStack.emplace_back(MCPseudoProbeFrameLocation(
+        FuncName, std::get<1>(Cur->getInlineSite())));
     Cur = static_cast<MCDecodedPseudoProbeInlineTree *>(Cur->Parent);
   }
   // Make the ContextStack in caller-callee order
@@ -318,10 +320,10 @@ void MCDecodedPseudoProbe::print(raw_ostream &OS,
                                  bool ShowName) const {
   OS << "FUNC: ";
   if (ShowName) {
-    StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Guid);
+    StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, getGuid());
     OS << FuncName.str() << " ";
   } else {
-    OS << Guid << " ";
+    OS << getGuid() << " ";
   }
   OS << "Index: " << Index << "  ";
   if (Discriminator)
@@ -417,17 +419,18 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
   return true;
 }
 
+template <bool IsTopLevelFunc>
 bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
     MCDecodedPseudoProbeInlineTree *Cur, uint64_t &LastAddr,
-    const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs) {
+    const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs,
+    const uint32_t CurChildIndex) {
   // The pseudo_probe section encodes an inline forest and each tree has a
   // format defined in MCPseudoProbe.h
 
   uint32_t Index = 0;
-  bool IsTopLevelFunc = Cur == &DummyInlineRoot;
   if (IsTopLevelFunc) {
     // Use a sequential id for top level inliner.
-    Index = Cur->getChildren().size();
+    Index = CurChildIndex;
   } else {
     // Read inline site for inlinees
     Index = cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
@@ -443,8 +446,9 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
   // If the incoming node is null, all its children nodes should be disgarded.
   if (Cur) {
     // Switch/add to a new tree node(inlinee)
-    Cur = Cur->getOrAddNode(std::make_tuple(Guid, Index));
-    Cur->Guid = Guid;
+    Cur->getChildren()[CurChildIndex] =
+        MCDecodedPseudoProbeInlineTree(InlineSite(Guid, Index), Cur);
+    Cur = &Cur->getChildren()[CurChildIndex];
     if (IsTopLevelFunc && !EncodingIsAddrBased) {
       if (auto V = FuncStartAddrs.lookup(Guid))
         LastAddr = V;
@@ -454,6 +458,7 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
   // Read number of probes in the current node.
   uint32_t NodeCount =
       cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
+  uint32_t CurrentProbeCount = 0;
   // Read number of direct inlinees
   uint32_t ChildrenToProcess =
       cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
@@ -494,19 +499,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
     }
 
     if (Cur && !isSentinelProbe(Attr)) {
-      // Populate Address2ProbesMap
-      auto &Probes = Address2ProbesMap[Addr];
-      Probes.emplace_back(Addr, Cur->Guid, Index, PseudoProbeType(Kind), Attr,
-                          Discriminator, Cur);
-      Cur->addProbes(&Probes.back());
+      PseudoProbeVec.emplace_back(Addr, Index, PseudoProbeType(Kind), Attr,
+                                  Discriminator, Cur);
+      Address2ProbesMap[Addr].emplace_back(PseudoProbeVec.back());
+      ++CurrentProbeCount;
     }
     LastAddr = Addr;
   }
 
+  if (Cur) {
+    Cur->setProbes(
+        MutableArrayRef(PseudoProbeVec).take_back(CurrentProbeCount));
+    InlineTreeVec.resize(InlineTreeVec.size() + ChildrenToProcess);
+    Cur->getChildren() =
+        MutableArrayRef(InlineTreeVec).take_back(ChildrenToProcess);
+  }
   for (uint32_t I = 0; I < ChildrenToProcess; I++) {
-    buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs);
+    buildAddress2ProbeMap<false>(Cur, LastAddr, GuidFilter, FuncStartAddrs, I);
   }
-  return true;
+  return Cur;
 }
 
 template <bool IsTopLevelFunc>
@@ -605,14 +616,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
     TopLevelFuncs += !Discard;
   }
   assert(Data == End && "Have unprocessed data in pseudo_probe section");
+  PseudoProbeVec.reserve(ProbeCount);
+  InlineTreeVec.reserve(InlinedCount);
+
+  // Allocate top-level function records as children of DummyInlineRoot.
+  InlineTreeVec.resize(TopLevelFuncs);
+  DummyInlineRoot.getChildren() = MutableArrayRef(InlineTreeVec);
 
   Data = Start;
   End = Data + Size;
   uint64_t LastAddr = 0;
+  uint32_t CurChildIndex = 0;
   while (Data < End)
-    buildAddress2ProbeMap(&DummyInlineRoot, LastAddr, GuidFilter,
-                          FuncStartAddrs);
+    CurChildIndex += buildAddress2ProbeMap<true>(
+        &DummyInlineRoot, LastAddr, GuidFilter, FuncStartAddrs, CurChildIndex);
   assert(Data == End && "Have unprocessed data in pseudo_probe section");
+  assert(PseudoProbeVec.size() == ProbeCount &&
+         "Mismatching probe count pre- and post-parsing");
+  assert(InlineTreeVec.size() == InlinedCount &&
+         "Mismatching function records count pre- and post-parsing");
   return true;
 }
 
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 175556c2220e6d..2c6875281047d3 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -1302,9 +1302,9 @@ void CSProfileGenerator::populateBodySamplesWithProbes(
   // and will be inferred by the compiler.
   for (auto &I : FrameSamples) {
     for (auto *FunctionProfile : I.second) {
-      for (auto *Probe : I.first->getProbes()) {
-        FunctionProfile->addBodySamples(Probe->getIndex(),
-                                        Probe->getDiscriminator(), 0);
+      for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) {
+        FunctionProfile->addBodySamples(Probe.getIndex(),
+                                        Probe.getDiscriminator(), 0);
       }
     }
   }
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index 574a9c9f52bf18..fe7d3ffa476eb5 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -132,7 +132,7 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway(
     MCPseudoProbeDecoder &ProbeDecoder) {
   ProbeFrameStack ProbeContext;
   for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren())
-    trackInlineesOptimizedAway(ProbeDecoder, *Child.second, ProbeContext);
+    trackInlineesOptimizedAway(ProbeDecoder, Child, ProbeContext);
 }
 
 void BinarySizeContextTracker::trackInlineesOptimizedAway(
@@ -160,9 +160,9 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway(
 
   // DFS down the probe inline tree
   for (const auto &ChildNode : ProbeNode.getChildren()) {
-    InlineSite Location = ChildNode.first;
+    InlineSite Location = ChildNode.getInlineSite();
     ProbeContext.back().second = std::get<1>(Location);
-    trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second, ProbeContext);
+    trackInlineesOptimizedAway(ProbeDecoder, ChildNode, ProbeContext);
   }
 
   ProbeContext.pop_back();
@@ -454,8 +454,8 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
   // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe
   // is available
   if (TrackFuncContextSize) {
-    for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) {
-      auto *Frame = Child.second.get();
+    for (auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) {
+      auto *Frame = &Child;
       StringRef FuncName =
           ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName;
       TopLevelProbeFrameMap[FuncName] = Frame;

>From 9746055b0a1ae1e7c6aff50fc217dc216605c277 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Sat, 10 Aug 2024 23:48:40 -0700
Subject: [PATCH 31/39] [MC][profgen][NFC] Expand auto for MCDecodedPseudoProbe

Expand autos in select places in preparation for #102789.

Reviewers: dcci, maksfb, WenleiHe, rafaelauler, ayermolo, wlei-llvm

Reviewed By: WenleiHe, wlei-llvm

Pull Request: https://github.com/llvm/llvm-project/pull/102788
---
 llvm/lib/MC/MCPseudoProbe.cpp                | 4 ++--
 llvm/tools/llvm-profgen/ProfileGenerator.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index 3dc443a503f722..1031dac331bb1c 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -652,7 +652,7 @@ void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS,
                                                 uint64_t Address) {
   auto It = Address2ProbesMap.find(Address);
   if (It != Address2ProbesMap.end()) {
-    for (auto &Probe : It->second) {
+    for (const MCDecodedPseudoProbe &Probe : It->second) {
       OS << " [Probe]:\t";
       Probe.print(OS, GUID2FuncDescMap, true);
     }
@@ -679,7 +679,7 @@ MCPseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const {
   const auto &Probes = It->second;
 
   const MCDecodedPseudoProbe *CallProbe = nullptr;
-  for (const auto &Probe : Probes) {
+  for (const MCDecodedPseudoProbe &Probe : Probes) {
     if (Probe.isCall()) {
       // Disabling the assert and returning first call probe seen so far.
       // Subsequent call probes, if any, are ignored. Due to the the way
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 2c6875281047d3..d9283271b03c03 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -1194,7 +1194,7 @@ void ProfileGeneratorBase::extractProbesFromRange(
           Binary->getAddress2ProbesMap();
       auto It = Address2ProbesMap.find(IP.Address);
       if (It != Address2ProbesMap.end()) {
-        for (const auto &Probe : It->second) {
+        for (const MCDecodedPseudoProbe &Probe : It->second) {
           ProbeCounter[&Probe] += Count;
         }
       }

>From 3dcef4813afc966aa7bb73d733556c369d3a8011 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Mon, 26 Aug 2024 09:14:35 -0700
Subject: [PATCH 32/39] [MC][NFC] Reduce Address2ProbesMap size

Replace the map from addresses to list of probes with a flat vector
containing probe references sorted by their addresses.

Reduces pseudo probe parsing time from 9.56s to 8.59s and peak RSS from
9.66 GiB to 9.08 GiB as part of perf2bolt processing a large binary.

Test Plan:
```
bin/llvm-lit -sv test/tools/llvm-profgen
```

Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm

Reviewed By: wlei-llvm

Pull Request: https://github.com/llvm/llvm-project/pull/102904
---
 bolt/lib/Profile/DataAggregator.cpp          | 14 ++--
 bolt/lib/Profile/YAMLProfileWriter.cpp       | 11 +--
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp     | 83 ++++++++------------
 llvm/include/llvm/MC/MCPseudoProbe.h         | 30 +++++--
 llvm/lib/MC/MCPseudoProbe.cpp                | 43 +++++-----
 llvm/tools/llvm-profgen/ProfileGenerator.cpp |  8 +-
 6 files changed, 94 insertions(+), 95 deletions(-)

diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index a300e5b2b1dabd..813d825f8b570c 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2415,17 +2415,15 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
         Fragments.insert(BF);
         for (const BinaryFunction *F : Fragments) {
           const uint64_t FuncAddr = F->getAddress();
-          const auto &FragmentProbes =
-              llvm::make_range(ProbeMap.lower_bound(FuncAddr),
-                               ProbeMap.lower_bound(FuncAddr + F->getSize()));
-          for (const auto &[OutputAddress, Probes] : FragmentProbes) {
+          for (const MCDecodedPseudoProbe &Probe :
+               ProbeMap.find(FuncAddr, FuncAddr + F->getSize())) {
+            const uint32_t OutputAddress = Probe.getAddress();
             const uint32_t InputOffset = BAT->translate(
                 FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true);
             const unsigned BlockIndex = getBlock(InputOffset).second;
-            for (const MCDecodedPseudoProbe &Probe : Probes)
-              YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back(
-                  yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(),
-                                              Probe.getType()});
+            YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back(
+                yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(),
+                                            Probe.getType()});
           }
         }
       }
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 84777741d611a3..f74cf60e076d0a 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -193,13 +193,10 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
       const uint64_t FuncAddr = BF.getAddress();
       const std::pair<uint64_t, uint64_t> &BlockRange =
           BB->getInputAddressRange();
-      const auto &BlockProbes =
-          llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first),
-                           ProbeMap.lower_bound(FuncAddr + BlockRange.second));
-      for (const auto &[_, Probes] : BlockProbes)
-        for (const MCDecodedPseudoProbe &Probe : Probes)
-          YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{
-              Probe.getGuid(), Probe.getIndex(), Probe.getType()});
+      for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(
+               FuncAddr + BlockRange.first, FuncAddr + BlockRange.second))
+        YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{
+            Probe.getGuid(), Probe.getIndex(), Probe.getType()});
     }
 
     YamlBF.Blocks.emplace_back(YamlBB);
diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 9677530919b90d..7516918b2389fc 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -173,13 +173,13 @@ void PseudoProbeRewriter::updatePseudoProbes() {
   AddressProbesMap &Address2ProbesMap = ProbeDecoder.getAddress2ProbesMap();
   const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap();
 
-  for (auto &AP : Address2ProbesMap) {
-    BinaryFunction *F = BC.getBinaryFunctionContainingAddress(AP.first);
+  for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) {
+    uint64_t Address = Probe.getAddress();
+    BinaryFunction *F = BC.getBinaryFunctionContainingAddress(Address);
     // If F is removed, eliminate all probes inside it from inline tree
     // Setting probes' addresses as INT64_MAX means elimination
     if (!F) {
-      for (MCDecodedPseudoProbe &Probe : AP.second)
-        Probe.setAddress(INT64_MAX);
+      Probe.setAddress(INT64_MAX);
       continue;
     }
     // If F is not emitted, the function will remain in the same address as its
@@ -187,45 +187,36 @@ void PseudoProbeRewriter::updatePseudoProbes() {
     if (!F->isEmitted())
       continue;
 
-    uint64_t Offset = AP.first - F->getAddress();
+    uint64_t Offset = Address - F->getAddress();
     const BinaryBasicBlock *BB = F->getBasicBlockContainingOffset(Offset);
     uint64_t BlkOutputAddress = BB->getOutputAddressRange().first;
     // Check if block output address is defined.
     // If not, such block is removed from binary. Then remove the probes from
     // inline tree
     if (BlkOutputAddress == 0) {
-      for (MCDecodedPseudoProbe &Probe : AP.second)
-        Probe.setAddress(INT64_MAX);
+      Probe.setAddress(INT64_MAX);
       continue;
     }
 
-    unsigned ProbeTrack = AP.second.size();
-    auto Probe = llvm::map_iterator(
-        AP.second.begin(),
-        [](auto RW) -> MCDecodedPseudoProbe & { return RW.get(); });
-    while (ProbeTrack != 0) {
-      if (Probe->isBlock()) {
-        Probe->setAddress(BlkOutputAddress);
-      } else if (Probe->isCall()) {
-        // A call probe may be duplicated due to ICP
-        // Go through output of InputOffsetToAddressMap to collect all related
-        // probes
-        auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(AP.first);
-        auto CallOutputAddress = CallOutputAddresses.first;
-        if (CallOutputAddress == CallOutputAddresses.second) {
-          Probe->setAddress(INT64_MAX);
-        } else {
-          Probe->setAddress(CallOutputAddress->second);
-          CallOutputAddress = std::next(CallOutputAddress);
-        }
-
-        while (CallOutputAddress != CallOutputAddresses.second) {
-          ProbeDecoder.addInjectedProbe(*Probe, CallOutputAddress->second);
-          CallOutputAddress = std::next(CallOutputAddress);
-        }
+    if (Probe.isBlock()) {
+      Probe.setAddress(BlkOutputAddress);
+    } else if (Probe.isCall()) {
+      // A call probe may be duplicated due to ICP
+      // Go through output of InputOffsetToAddressMap to collect all related
+      // probes
+      auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(Address);
+      auto CallOutputAddress = CallOutputAddresses.first;
+      if (CallOutputAddress == CallOutputAddresses.second) {
+        Probe.setAddress(INT64_MAX);
+      } else {
+        Probe.setAddress(CallOutputAddress->second);
+        CallOutputAddress = std::next(CallOutputAddress);
+      }
+
+      while (CallOutputAddress != CallOutputAddresses.second) {
+        ProbeDecoder.addInjectedProbe(Probe, CallOutputAddress->second);
+        CallOutputAddress = std::next(CallOutputAddress);
       }
-      Probe = std::next(Probe);
-      ProbeTrack--;
     }
   }
 
@@ -241,22 +232,16 @@ void PseudoProbeRewriter::updatePseudoProbes() {
             BinaryBlock.getName();
 
     // scan all addresses -> correlate probe to block when print out
-    std::vector<uint64_t> Addresses;
-    for (auto &Entry : Address2ProbesMap)
-      Addresses.push_back(Entry.first);
-    llvm::sort(Addresses);
-    for (uint64_t Key : Addresses) {
-      for (MCDecodedPseudoProbe &Probe : Address2ProbesMap[Key]) {
-        if (Probe.getAddress() == INT64_MAX)
-          outs() << "Deleted Probe: ";
-        else
-          outs() << "Address: " << format_hex(Probe.getAddress(), 8) << " ";
-        Probe.print(outs(), GUID2Func, true);
-        // print block name only if the probe is block type and undeleted.
-        if (Probe.isBlock() && Probe.getAddress() != INT64_MAX)
-          outs() << format_hex(Probe.getAddress(), 8) << " Probe is in "
-                 << Addr2BlockNames[Probe.getAddress()] << "\n";
-      }
+    for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) {
+      if (Probe.getAddress() == INT64_MAX)
+        outs() << "Deleted Probe: ";
+      else
+        outs() << "Address: " << format_hex(Probe.getAddress(), 8) << " ";
+      Probe.print(outs(), GUID2Func, true);
+      // print block name only if the probe is block type and undeleted.
+      if (Probe.isBlock() && Probe.getAddress() != INT64_MAX)
+        outs() << format_hex(Probe.getAddress(), 8) << " Probe is in "
+               << Addr2BlockNames[Probe.getAddress()] << "\n";
     }
     outs() << "=======================================\n";
   }
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 66ad9db4860d8a..854f1209c39346 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -63,7 +63,6 @@
 #include "llvm/IR/PseudoProbe.h"
 #include "llvm/Support/ErrorOr.h"
 #include <functional>
-#include <map>
 #include <memory>
 #include <string>
 #include <tuple>
@@ -103,10 +102,6 @@ using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>;
 // GUID to PseudoProbeFuncDesc map
 using GUIDProbeFunctionMap =
     std::unordered_map<uint64_t, MCPseudoProbeFuncDesc>;
-// Address to pseudo probes map.
-using AddressProbesMap =
-    std::map<uint64_t,
-             std::vector<std::reference_wrapper<MCDecodedPseudoProbe>>>;
 
 class MCDecodedPseudoProbeInlineTree;
 
@@ -213,6 +208,31 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase {
              bool ShowName) const;
 };
 
+// Address to pseudo probes map.
+class AddressProbesMap
+    : public std::vector<std::reference_wrapper<MCDecodedPseudoProbe>> {
+  auto getIt(uint64_t Addr) const {
+    auto CompareProbe = [](const MCDecodedPseudoProbe &Probe, uint64_t Addr) {
+      return Probe.getAddress() < Addr;
+    };
+    return llvm::lower_bound(*this, Addr, CompareProbe);
+  }
+
+public:
+  // Returns range of probes within [\p From, \p To) address range.
+  auto find(uint64_t From, uint64_t To) const {
+    return llvm::make_range(getIt(From), getIt(To));
+  }
+  // Returns range of probes with given \p Address.
+  auto find(uint64_t Address) const {
+    auto FromIt = getIt(Address);
+    if (FromIt == end() || FromIt->get().getAddress() != Address)
+      return llvm::make_range(end(), end());
+    auto ToIt = getIt(Address + 1);
+    return llvm::make_range(FromIt, ToIt);
+  }
+};
+
 template <typename ProbesType, typename DerivedProbeInlineTreeType,
           typename InlinedProbeTreeMap>
 class MCPseudoProbeInlineTreeBase {
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index 1031dac331bb1c..5951499c0cb280 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -501,7 +501,6 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
     if (Cur && !isSentinelProbe(Attr)) {
       PseudoProbeVec.emplace_back(Addr, Index, PseudoProbeType(Kind), Attr,
                                   Discriminator, Cur);
-      Address2ProbesMap[Addr].emplace_back(PseudoProbeVec.back());
       ++CurrentProbeCount;
     }
     LastAddr = Addr;
@@ -635,6 +634,15 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
          "Mismatching probe count pre- and post-parsing");
   assert(InlineTreeVec.size() == InlinedCount &&
          "Mismatching function records count pre- and post-parsing");
+
+  std::vector<std::pair<uint64_t, uint32_t>> SortedA2P(ProbeCount);
+  for (const auto &[I, Probe] : llvm::enumerate(PseudoProbeVec))
+    SortedA2P[I] = {Probe.getAddress(), I};
+  llvm::sort(SortedA2P);
+  Address2ProbesMap.reserve(ProbeCount);
+  for (const uint32_t I : llvm::make_second_range(SortedA2P))
+    Address2ProbesMap.emplace_back(PseudoProbeVec[I]);
+  SortedA2P.clear();
   return true;
 }
 
@@ -650,36 +658,29 @@ void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) {
 
 void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS,
                                                 uint64_t Address) {
-  auto It = Address2ProbesMap.find(Address);
-  if (It != Address2ProbesMap.end()) {
-    for (const MCDecodedPseudoProbe &Probe : It->second) {
-      OS << " [Probe]:\t";
-      Probe.print(OS, GUID2FuncDescMap, true);
-    }
+  for (const MCDecodedPseudoProbe &Probe : Address2ProbesMap.find(Address)) {
+    OS << " [Probe]:\t";
+    Probe.print(OS, GUID2FuncDescMap, true);
   }
 }
 
 void MCPseudoProbeDecoder::printProbesForAllAddresses(raw_ostream &OS) {
-  auto Entries = make_first_range(Address2ProbesMap);
-  SmallVector<uint64_t, 0> Addresses(Entries.begin(), Entries.end());
-  llvm::sort(Addresses);
-  for (auto K : Addresses) {
-    OS << "Address:\t";
-    OS << K;
-    OS << "\n";
-    printProbeForAddress(OS, K);
+  uint64_t PrevAddress = INT64_MAX;
+  for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) {
+    uint64_t Address = Probe.getAddress();
+    if (Address != PrevAddress) {
+      PrevAddress = Address;
+      OS << "Address:\t" << Address << '\n';
+    }
+    OS << " [Probe]:\t";
+    Probe.print(OS, GUID2FuncDescMap, true);
   }
 }
 
 const MCDecodedPseudoProbe *
 MCPseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const {
-  auto It = Address2ProbesMap.find(Address);
-  if (It == Address2ProbesMap.end())
-    return nullptr;
-  const auto &Probes = It->second;
-
   const MCDecodedPseudoProbe *CallProbe = nullptr;
-  for (const MCDecodedPseudoProbe &Probe : Probes) {
+  for (const MCDecodedPseudoProbe &Probe : Address2ProbesMap.find(Address)) {
     if (Probe.isCall()) {
       // Disabling the assert and returning first call probe seen so far.
       // Subsequent call probes, if any, are ignored. Due to the the way
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index d9283271b03c03..3b12f536be55d2 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -1192,11 +1192,9 @@ void ProfileGeneratorBase::extractProbesFromRange(
     do {
       const AddressProbesMap &Address2ProbesMap =
           Binary->getAddress2ProbesMap();
-      auto It = Address2ProbesMap.find(IP.Address);
-      if (It != Address2ProbesMap.end()) {
-        for (const MCDecodedPseudoProbe &Probe : It->second) {
-          ProbeCounter[&Probe] += Count;
-        }
+      for (const MCDecodedPseudoProbe &Probe :
+           Address2ProbesMap.find(IP.Address)) {
+        ProbeCounter[&Probe] += Count;
       }
     } while (IP.advance() && IP.Address <= RangeEnd);
   }

>From ba149d99c8dc1d813226b660f3d14b5d879a721c Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Mon, 26 Aug 2024 09:15:53 -0700
Subject: [PATCH 33/39] [MC][NFC] Use vector for GUIDProbeFunctionMap

Replace unordered_map with a vector. Pre-parse the section to statically
allocate storage. Use BumpPtrAllocator for FuncName strings, keep
StringRef in FuncDesc.

Reduces peak RSS of pseudo probe parsing from 9.08 GiB to 8.89 GiB as
part of perf2bolt with a large binary.

Test Plan:
```
bin/llvm-lit -sv test/tools/llvm-profgen
```

Reviewers: wlei-llvm, rafaelauler, dcci, maksfb, ayermolo

Reviewed By: wlei-llvm

Pull Request: https://github.com/llvm/llvm-project/pull/102905
---
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp |  3 +-
 llvm/include/llvm/MC/MCPseudoProbe.h     | 19 +++++++--
 llvm/lib/MC/MCPseudoProbe.cpp            | 52 ++++++++++++++----------
 3 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 7516918b2389fc..4925b4b385d9b1 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -155,7 +155,8 @@ void PseudoProbeRewriter::parsePseudoProbe() {
     ProbeDecoder.printProbesForAllAddresses(outs());
   }
 
-  for (const auto &[GUID, FuncDesc] : ProbeDecoder.getGUID2FuncDescMap()) {
+  for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) {
+    uint64_t GUID = FuncDesc.FuncGUID;
     if (!FuncStartAddrs.contains(GUID))
       continue;
     BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncStartAddrs[GUID]);
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 854f1209c39346..32905c1e9a424a 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -61,6 +61,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/IR/PseudoProbe.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/ErrorOr.h"
 #include <functional>
 #include <memory>
@@ -86,7 +87,7 @@ enum class MCPseudoProbeFlag {
 struct MCPseudoProbeFuncDesc {
   uint64_t FuncGUID = 0;
   uint64_t FuncHash = 0;
-  std::string FuncName;
+  StringRef FuncName;
 
   MCPseudoProbeFuncDesc(uint64_t GUID, uint64_t Hash, StringRef Name)
       : FuncGUID(GUID), FuncHash(Hash), FuncName(Name){};
@@ -100,8 +101,18 @@ class MCDecodedPseudoProbe;
 using InlineSite = std::tuple<uint64_t, uint32_t>;
 using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>;
 // GUID to PseudoProbeFuncDesc map
-using GUIDProbeFunctionMap =
-    std::unordered_map<uint64_t, MCPseudoProbeFuncDesc>;
+class GUIDProbeFunctionMap : public std::vector<MCPseudoProbeFuncDesc> {
+public:
+  auto find(uint64_t GUID) const { // Requires vector sorted by FuncGUID.
+    auto CompareDesc = [](const MCPseudoProbeFuncDesc &Desc, uint64_t GUID) {
+      return Desc.FuncGUID < GUID;
+    };
+    auto It = llvm::lower_bound(*this, GUID, CompareDesc);
+    if (It == end() || It->FuncGUID != GUID) // lower_bound may yield end().
+      return end();
+    return It;
+  }
+};
 
 class MCDecodedPseudoProbeInlineTree;
 
@@ -389,6 +400,8 @@ class MCPseudoProbeDecoder {
   // GUID to PseudoProbeFuncDesc map.
   GUIDProbeFunctionMap GUID2FuncDescMap;
 
+  BumpPtrAllocator FuncNameAllocator;
+
   // Address to probes map.
   AddressProbesMap Address2ProbesMap;
 
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index 5951499c0cb280..90d7588407068a 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -274,7 +274,7 @@ static StringRef getProbeFNameForGUID(const GUIDProbeFunctionMap &GUID2FuncMAP,
   auto It = GUID2FuncMAP.find(GUID);
   assert(It != GUID2FuncMAP.end() &&
          "Probe function must exist for a valid GUID");
-  return It->second.FuncName;
+  return It->FuncName;
 }
 
 void MCPseudoProbeFuncDesc::print(raw_ostream &OS) {
@@ -390,32 +390,46 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
   Data = Start;
   End = Data + Size;
 
+  uint32_t FuncDescCount = 0;
   while (Data < End) {
-    auto ErrorOrGUID = readUnencodedNumber<uint64_t>();
-    if (!ErrorOrGUID)
+    // GUID
+    if (!readUnencodedNumber<uint64_t>())
       return false;
-
-    auto ErrorOrHash = readUnencodedNumber<uint64_t>();
-    if (!ErrorOrHash)
+    // Hash
+    if (!readUnencodedNumber<uint64_t>())
       return false;
 
     auto ErrorOrNameSize = readUnsignedNumber<uint32_t>();
     if (!ErrorOrNameSize)
       return false;
-    uint32_t NameSize = std::move(*ErrorOrNameSize);
-
-    auto ErrorOrName = readString(NameSize);
-    if (!ErrorOrName)
+    // Function name
+    if (!readString(*ErrorOrNameSize))
       return false;
+    ++FuncDescCount;
+  }
+  assert(Data == End && "Have unprocessed data in pseudo_probe_desc section");
+  GUID2FuncDescMap.reserve(FuncDescCount);
 
-    uint64_t GUID = std::move(*ErrorOrGUID);
-    uint64_t Hash = std::move(*ErrorOrHash);
-    StringRef Name = std::move(*ErrorOrName);
+  Data = Start;
+  End = Data + Size;
+  while (Data < End) {
+    uint64_t GUID =
+        cantFail(errorOrToExpected(readUnencodedNumber<uint64_t>()));
+    uint64_t Hash =
+        cantFail(errorOrToExpected(readUnencodedNumber<uint64_t>()));
+    uint32_t NameSize =
+        cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
+    StringRef Name = cantFail(errorOrToExpected(readString(NameSize)));
 
     // Initialize PseudoProbeFuncDesc and populate it into GUID2FuncDescMap
-    GUID2FuncDescMap.emplace(GUID, MCPseudoProbeFuncDesc(GUID, Hash, Name));
+    GUID2FuncDescMap.emplace_back(GUID, Hash, Name.copy(FuncNameAllocator));
   }
   assert(Data == End && "Have unprocessed data in pseudo_probe_desc section");
+  assert(GUID2FuncDescMap.size() == FuncDescCount &&
+         "Mismatching function description count pre- and post-parsing");
+  llvm::sort(GUID2FuncDescMap, [](const auto &LHS, const auto &RHS) {
+    return LHS.FuncGUID < RHS.FuncGUID;
+  });
   return true;
 }
 
@@ -648,12 +662,8 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
 
 void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) {
   OS << "Pseudo Probe Desc:\n";
-  // Make the output deterministic
-  std::map<uint64_t, MCPseudoProbeFuncDesc> OrderedMap(GUID2FuncDescMap.begin(),
-                                                       GUID2FuncDescMap.end());
-  for (auto &I : OrderedMap) {
-    I.second.print(OS);
-  }
+  for (auto &I : GUID2FuncDescMap)
+    I.print(OS);
 }
 
 void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS,
@@ -705,7 +715,7 @@ const MCPseudoProbeFuncDesc *
 MCPseudoProbeDecoder::getFuncDescForGUID(uint64_t GUID) const {
   auto It = GUID2FuncDescMap.find(GUID);
   assert(It != GUID2FuncDescMap.end() && "Function descriptor doesn't exist");
-  return &It->second;
+  return &*It;
 }
 
 void MCPseudoProbeDecoder::getInlineContextForProbe(

>From c35e8acd11e67fb9a5cd0a66ae51066f24df524a Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Tue, 30 Jul 2024 11:24:24 -0700
Subject: [PATCH 34/39] buildAddress2ProbeMap timers

---
 llvm/lib/MC/MCPseudoProbe.cpp | 50 +++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index 90d7588407068a..af7fe7edff1e70 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -19,8 +19,10 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MD5.h"
+#include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -376,6 +378,8 @@ ErrorOr<StringRef> MCPseudoProbeDecoder::readString(uint32_t Size) {
 
 bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
                                                  std::size_t Size) {
+  Timer T("buildGUID2FDMap", "build GUID to FuncDesc map");
+  T.startTimer();
   // The pseudo_probe_desc section has a format like:
   // .section .pseudo_probe_desc,"", at progbits
   // .quad -5182264717993193164   // GUID
@@ -430,6 +434,12 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
   llvm::sort(GUID2FuncDescMap, [](const auto &LHS, const auto &RHS) {
     return LHS.FuncGUID < RHS.FuncGUID;
   });
+  T.stopTimer();
+  auto TT = T.getTotalTime();
+  T.clear();
+  dbgs() << "func desc ";
+  TT.print(TT, dbgs());
+  dbgs() << '\n';
   return true;
 }
 
@@ -623,12 +633,20 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
   Data = Start;
   End = Data + Size;
   bool Discard = false;
+  Timer T("countRecords", "pre-parsing function records");
+  T.startTimer();
   while (Data < End) {
     if (!countRecords<true>(Discard, ProbeCount, InlinedCount, GuidFilter))
       return false;
     TopLevelFuncs += !Discard;
   }
+  T.stopTimer();
+  auto TT = T.getTotalTime();
+  T.clear();
+  dbgs() << "pre-parsing ";
+  TT.print(TT, dbgs());
   assert(Data == End && "Have unprocessed data in pseudo_probe section");
+  T.startTimer();
   PseudoProbeVec.reserve(ProbeCount);
   InlineTreeVec.reserve(InlinedCount);
 
@@ -636,6 +654,13 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
   InlineTreeVec.resize(TopLevelFuncs);
   DummyInlineRoot.getChildren() = MutableArrayRef(InlineTreeVec);
 
+  T.stopTimer();
+  TT = T.getTotalTime();
+  T.clear();
+  dbgs() << "\nalloc ";
+  TT.print(TT, dbgs());
+
+  T.startTimer();
   Data = Start;
   End = Data + Size;
   uint64_t LastAddr = 0;
@@ -643,12 +668,18 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
   while (Data < End)
     CurChildIndex += buildAddress2ProbeMap<true>(
         &DummyInlineRoot, LastAddr, GuidFilter, FuncStartAddrs, CurChildIndex);
+  T.stopTimer();
+  TT = T.getTotalTime();
+  T.clear();
+  dbgs() << "\nparsing ";
+  TT.print(TT, dbgs());
   assert(Data == End && "Have unprocessed data in pseudo_probe section");
   assert(PseudoProbeVec.size() == ProbeCount &&
          "Mismatching probe count pre- and post-parsing");
   assert(InlineTreeVec.size() == InlinedCount &&
          "Mismatching function records count pre- and post-parsing");
 
+  T.startTimer();
   std::vector<std::pair<uint64_t, uint32_t>> SortedA2P(ProbeCount);
   for (const auto &[I, Probe] : llvm::enumerate(PseudoProbeVec))
     SortedA2P[I] = {Probe.getAddress(), I};
@@ -657,6 +688,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
   for (const uint32_t I : llvm::make_second_range(SortedA2P))
     Address2ProbesMap.emplace_back(PseudoProbeVec[I]);
   SortedA2P.clear();
+  T.stopTimer();
+  TT = T.getTotalTime();
+  T.clear();
+  dbgs() << "\nsorting ";
+  TT.print(TT, dbgs());
+  dbgs() << '\n';
+  size_t PPVecSize = 32 * PseudoProbeVec.capacity();
+  size_t ITVecSize = 48 * InlineTreeVec.capacity();
+  size_t G2FDMapSize = 32 * GUID2FuncDescMap.capacity();
+  size_t StringSize = FuncNameAllocator.getBytesAllocated();
+  size_t A2PSize = 8 * Address2ProbesMap.capacity();
+  dbgs() << formatv("PPVec size: {0} GiB\n", 1.f * PPVecSize / (1 << 30))
+         << formatv("ITVec size: {0} GiB\n", 1.f * ITVecSize / (1 << 30))
+         << formatv("G2FDMap size: {0} GiB\n", 1.f * G2FDMapSize / (1 << 30))
+         << formatv("  (strings {0} GiB)\n", 1.f * StringSize / (1 << 30))
+         << formatv("A2P size: {0} GiB\n", 1.f * A2PSize / (1 << 30))
+         << formatv("Total size: {0} GiB\n",
+                    1.f * (PPVecSize + ITVecSize + G2FDMapSize + A2PSize) /
+                        (1 << 30));
   return true;
 }
 

>From 1c469cf2dd59241b65e07c5f1030af1d371d881b Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Tue, 27 Aug 2024 14:44:39 -0700
Subject: [PATCH 35/39] [BOLT][NFC] Rename profile-use-pseudo-probes

---
 bolt/lib/Profile/DataAggregator.cpp            |  4 ++--
 bolt/lib/Profile/YAMLProfileReader.cpp         |  5 -----
 bolt/lib/Profile/YAMLProfileWriter.cpp         | 11 ++++++++---
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp       |  6 +++---
 bolt/test/X86/pseudoprobe-decoding-inline.test |  6 +++---
 5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 813d825f8b570c..10d745cc69824b 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -88,7 +88,7 @@ MaxSamples("max-samples",
   cl::cat(AggregatorCategory));
 
 extern cl::opt<opts::ProfileFormatKind> ProfileFormat;
-extern cl::opt<bool> ProfileUsePseudoProbes;
+extern cl::opt<bool> ProfileWritePseudoProbes;
 extern cl::opt<std::string> SaveProfile;
 
 cl::opt<bool> ReadPreAggregated(
@@ -2300,7 +2300,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
   yaml::bolt::BinaryProfile BP;
 
   const MCPseudoProbeDecoder *PseudoProbeDecoder =
-      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
+      opts::ProfileWritePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
 
   // Fill out the header info.
   BP.Header.Version = 1;
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 3eca5e972fa5ba..604a9fb4813be4 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -49,11 +49,6 @@ llvm::cl::opt<bool>
 llvm::cl::opt<bool> ProfileUseDFS("profile-use-dfs",
                                   cl::desc("use DFS order for YAML profile"),
                                   cl::Hidden, cl::cat(BoltOptCategory));
-
-llvm::cl::opt<bool> ProfileUsePseudoProbes(
-    "profile-use-pseudo-probes",
-    cl::desc("Use pseudo probes for profile generation and matching"),
-    cl::Hidden, cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index f74cf60e076d0a..ffbf2388e912fb 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -13,6 +13,7 @@
 #include "bolt/Profile/DataAggregator.h"
 #include "bolt/Profile/ProfileReaderBase.h"
 #include "bolt/Rewrite/RewriteInstance.h"
+#include "bolt/Utils/CommandLineOpts.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
@@ -21,8 +22,12 @@
 #define DEBUG_TYPE "bolt-prof"
 
 namespace opts {
-extern llvm::cl::opt<bool> ProfileUseDFS;
-extern llvm::cl::opt<bool> ProfileUsePseudoProbes;
+using namespace llvm;
+extern cl::opt<bool> ProfileUseDFS;
+cl::opt<bool> ProfileWritePseudoProbes(
+    "profile-write-pseudo-probes",
+    cl::desc("Use pseudo probes in profile generation"), cl::Hidden,
+    cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
@@ -59,7 +64,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
   yaml::bolt::BinaryFunctionProfile YamlBF;
   const BinaryContext &BC = BF.getBinaryContext();
   const MCPseudoProbeDecoder *PseudoProbeDecoder =
-      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
+      opts::ProfileWritePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
 
   const uint16_t LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR;
 
diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 4925b4b385d9b1..fef721167869dd 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -49,7 +49,7 @@ static cl::opt<PrintPseudoProbesOptions> PrintPseudoProbes(
                clEnumValN(PPP_All, "all", "enable all debugging printout")),
     cl::Hidden, cl::cat(BoltCategory));
 
-extern cl::opt<bool> ProfileUsePseudoProbes;
+extern cl::opt<bool> ProfileWritePseudoProbes;
 } // namespace opts
 
 namespace {
@@ -90,14 +90,14 @@ class PseudoProbeRewriter final : public MetadataRewriter {
 };
 
 Error PseudoProbeRewriter::preCFGInitializer() {
-  if (opts::ProfileUsePseudoProbes)
+  if (opts::ProfileWritePseudoProbes)
     parsePseudoProbe();
 
   return Error::success();
 }
 
 Error PseudoProbeRewriter::postEmitFinalizer() {
-  if (!opts::ProfileUsePseudoProbes)
+  if (!opts::ProfileWritePseudoProbes)
     parsePseudoProbe();
   updatePseudoProbes();
 
diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test
index b361551e5711ea..1fdd00c7ef6c4b 100644
--- a/bolt/test/X86/pseudoprobe-decoding-inline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-inline.test
@@ -6,11 +6,11 @@
 # PREAGG: B X:0 #main# 1 0
 ## Check pseudo-probes in regular YAML profile (non-BOLTed binary)
 # RUN: link_fdata %s %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin %t.preagg PREAGG
-# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-use-pseudo-probes
+# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-write-pseudo-probes
 # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-YAML
 ## Check pseudo-probes in BAT YAML profile (BOLTed binary)
 # RUN: link_fdata %s %t.bolt %t.preagg2 PREAGG
-# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-use-pseudo-probes
+# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-write-pseudo-probes
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
@@ -30,7 +30,7 @@
 # CHECK-YAML: guid: 0xDB956436E78DD5FA
 # CHECK-YAML: pseudo_probe_desc_hash: 0x10000FFFFFFFF
 #
-## Check that without --profile-use-pseudo-probes option, no pseudo probes are
+## Check that without --profile-write-pseudo-probes option, no pseudo probes are
 ## generated
 # RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata
 # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-NO-OPT

>From 97f81017f04f23a5ec209e89b3800a34868c7c9a Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Tue, 27 Aug 2024 09:26:44 -0700
Subject: [PATCH 36/39] [BOLT][NFCI] Strip suffix in getLTOCommonName

Also provide a mechanism to override the list of suffixes to consider.
Override LTOSuffixes for getGUID in pseudo probe parsing.

Test Plan:

Reviewers:
Subscribers:

Tasks:

Tags:

Differential Revision: https://phabricator.intern.facebook.com/D61857819

Pull Request: https://github.com/llvm/llvm-project/pull/106243
---
 bolt/include/bolt/Utils/Utils.h          |  5 ++++
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 37 ++++++++++++++++++------
 bolt/lib/Utils/Utils.cpp                 | 12 ++++++--
 3 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/bolt/include/bolt/Utils/Utils.h b/bolt/include/bolt/Utils/Utils.h
index 3886c5f8757c08..9baee7d94066de 100644
--- a/bolt/include/bolt/Utils/Utils.h
+++ b/bolt/include/bolt/Utils/Utils.h
@@ -41,6 +41,11 @@ std::string getEscapedName(const StringRef &Name);
 /// Return the unescaped name
 std::string getUnescapedName(const StringRef &Name);
 
+/// Return a common part for a given \p Name wrt a given \p Suffixes list.
+/// Preserve the suffix if \p KeepSuffix is set, only dropping characters
+/// following it, otherwise drop the suffix as well.
+std::optional<StringRef> getCommonName(const StringRef Name, bool KeepSuffix,
+                                       ArrayRef<StringRef> Suffixes);
 /// LTO-generated function names take a form:
 ///
 ///   <function_name>.lto_priv.<decimal_number>/...
diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index fef721167869dd..862208abbbd1aa 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -14,6 +14,7 @@
 #include "bolt/Rewrite/MetadataRewriter.h"
 #include "bolt/Rewrite/MetadataRewriters.h"
 #include "bolt/Utils/CommandLineOpts.h"
+#include "bolt/Utils/Utils.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCPseudoProbe.h"
 #include "llvm/Support/CommandLine.h"
@@ -133,10 +134,16 @@ void PseudoProbeRewriter::parsePseudoProbe() {
 
   MCPseudoProbeDecoder::Uint64Set GuidFilter;
   MCPseudoProbeDecoder::Uint64Map FuncStartAddrs;
+  SmallVector<StringRef, 3> Suffixes({".destroy", ".resume", ".llvm."});
   for (const BinaryFunction *F : BC.getAllBinaryFunctions()) {
     for (const MCSymbol *Sym : F->getSymbols()) {
-      FuncStartAddrs[Function::getGUID(NameResolver::restore(Sym->getName()))] =
-          F->getAddress();
+      StringRef SymName = NameResolver::restore(Sym->getName());
+      if (std::optional<StringRef> CommonName =
+              getCommonName(SymName, false, Suffixes)) {
+        SymName = *CommonName;
+      }
+      uint64_t GUID = Function::getGUID(SymName);
+      FuncStartAddrs[GUID] = F->getAddress();
     }
   }
   Contents = PseudoProbeSection->getContents();
@@ -155,13 +162,25 @@ void PseudoProbeRewriter::parsePseudoProbe() {
     ProbeDecoder.printProbesForAllAddresses(outs());
   }
 
-  for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) {
-    uint64_t GUID = FuncDesc.FuncGUID;
-    if (!FuncStartAddrs.contains(GUID))
-      continue;
-    BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncStartAddrs[GUID]);
-    assert(BF);
-    BF->setGUID(GUID);
+  const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap();
+  // Checks GUID in GUID2Func and returns it if it's present or 0 otherwise.
+  auto checkGUID = [&](StringRef SymName) {
+    uint64_t GUID = Function::getGUID(SymName);
+    if (GUID2Func.find(GUID) == GUID2Func.end())
+      return 0ull;
+    return GUID;
+  };
+  for (BinaryFunction *F : BC.getAllBinaryFunctions()) {
+    for (const MCSymbol *Sym : F->getSymbols()) {
+      StringRef SymName = NameResolver::restore(Sym->getName());
+      uint64_t GUID = checkGUID(SymName);
+      std::optional<StringRef> CommonName =
+          getCommonName(SymName, false, Suffixes);
+      if (!GUID && CommonName)
+        GUID = checkGUID(*CommonName);
+      if (GUID)
+        F->setGUID(GUID);
+    }
   }
 }
 
diff --git a/bolt/lib/Utils/Utils.cpp b/bolt/lib/Utils/Utils.cpp
index 718e97535fd22a..ecc2f1010a9858 100644
--- a/bolt/lib/Utils/Utils.cpp
+++ b/bolt/lib/Utils/Utils.cpp
@@ -66,15 +66,21 @@ std::string getUnescapedName(const StringRef &Name) {
   return Output;
 }
 
-std::optional<StringRef> getLTOCommonName(const StringRef Name) {
-  for (StringRef Suffix : {".__uniq.", ".lto_priv.", ".constprop.", ".llvm."}) {
+std::optional<StringRef> getCommonName(const StringRef Name, bool KeepSuffix,
+                                       ArrayRef<StringRef> Suffixes) {
+  for (StringRef Suffix : Suffixes) {
     size_t LTOSuffixPos = Name.find(Suffix);
     if (LTOSuffixPos != StringRef::npos)
-      return Name.substr(0, LTOSuffixPos + Suffix.size());
+      return Name.substr(0, LTOSuffixPos + (KeepSuffix ? Suffix.size() : 0));
   }
   return std::nullopt;
 }
 
+std::optional<StringRef> getLTOCommonName(const StringRef Name) {
+  return getCommonName(Name, true,
+                       {".__uniq.", ".lto_priv.", ".constprop.", ".llvm."});
+}
+
 std::optional<uint8_t> readDWARFExpressionTargetReg(StringRef ExprBytes) {
   uint8_t Opcode = ExprBytes[0];
   if (Opcode == dwarf::DW_CFA_def_cfa_expression)

>From e0a705e3f79c40426af9e4decdcac4cab7129cb4 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Mon, 26 Aug 2024 13:53:31 -0700
Subject: [PATCH 37/39] [BOLT] Only parse probes for profiled functions in
 profile-write-pseudo-probes mode

---
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 25 +++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 862208abbbd1aa..4b3f9ab4cb64ae 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -72,7 +72,8 @@ class PseudoProbeRewriter final : public MetadataRewriter {
 
   /// Parse .pseudo_probe_desc section and .pseudo_probe section
   /// Setup Pseudo probe decoder
-  void parsePseudoProbe();
+  /// If \p ProfiledOnly is set, only parse records for functions with profile.
+  void parsePseudoProbe(bool ProfiledOnly = false);
 
   /// PseudoProbe decoder
   std::shared_ptr<MCPseudoProbeDecoder> ProbeDecoderPtr;
@@ -92,7 +93,7 @@ class PseudoProbeRewriter final : public MetadataRewriter {
 
 Error PseudoProbeRewriter::preCFGInitializer() {
   if (opts::ProfileWritePseudoProbes)
-    parsePseudoProbe();
+    parsePseudoProbe(true);
 
   return Error::success();
 }
@@ -105,7 +106,7 @@ Error PseudoProbeRewriter::postEmitFinalizer() {
   return Error::success();
 }
 
-void PseudoProbeRewriter::parsePseudoProbe() {
+void PseudoProbeRewriter::parsePseudoProbe(bool ProfiledOnly) {
   MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr);
   PseudoProbeDescSection = BC.getUniqueSectionByName(".pseudo_probe_desc");
   PseudoProbeSection = BC.getUniqueSectionByName(".pseudo_probe");
@@ -136,6 +137,7 @@ void PseudoProbeRewriter::parsePseudoProbe() {
   MCPseudoProbeDecoder::Uint64Map FuncStartAddrs;
   SmallVector<StringRef, 3> Suffixes({".destroy", ".resume", ".llvm."});
   for (const BinaryFunction *F : BC.getAllBinaryFunctions()) {
+    bool HasProfile = F->hasProfileAvailable();
     for (const MCSymbol *Sym : F->getSymbols()) {
       StringRef SymName = NameResolver::restore(Sym->getName());
       if (std::optional<StringRef> CommonName =
@@ -144,6 +146,23 @@ void PseudoProbeRewriter::parsePseudoProbe() {
       }
       uint64_t GUID = Function::getGUID(SymName);
       FuncStartAddrs[GUID] = F->getAddress();
+      if (ProfiledOnly && HasProfile)
+        GuidFilter.insert(GUID);
+      std::optional<StringRef> CommonName =
+          getCommonName(SymName, false, Suffixes);
+      if (!CommonName)
+        continue;
+      GUID = Function::getGUID(*CommonName);
+      FuncStartAddrs.try_emplace(GUID, F->getAddress());
+      if (ProfiledOnly && HasProfile)
+        GuidFilter.insert(GUID);
+    }
+  }
+  if (ProfiledOnly) {
+    for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) {
+      uint64_t GUID = FuncDesc.FuncGUID;
+      if (!FuncStartAddrs.contains(GUID))
+        GuidFilter.insert(GUID);
     }
   }
   Contents = PseudoProbeSection->getContents();

>From 66fe5d50d65cb6bdda52076c4073508be0a5bc60 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Fri, 30 Aug 2024 20:31:37 -0700
Subject: [PATCH 38/39] [BOLT] Add pseudo probe inline tree to YAML profile

To be used for pseudo probe function matching (#100446).

Test Plan: updated pseudoprobe-decoding-inline.test

Pull Request: https://github.com/llvm/llvm-project/pull/107137
---
 .../include/bolt/Profile/ProfileYAMLMapping.h | 49 +++++++++++----
 bolt/lib/Profile/DataAggregator.cpp           | 53 ++++++++++++++---
 bolt/lib/Profile/YAMLProfileWriter.cpp        | 59 +++++++++++++++----
 .../test/X86/pseudoprobe-decoding-inline.test | 31 ++++++----
 4 files changed, 153 insertions(+), 39 deletions(-)

diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
index 2a0514d7d9304b..f0cc116ebc6cb0 100644
--- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h
+++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
@@ -95,24 +95,28 @@ template <> struct MappingTraits<bolt::SuccessorInfo> {
 
 namespace bolt {
 struct PseudoProbeInfo {
-  llvm::yaml::Hex64 GUID;
   uint64_t Index;
+  uint32_t InlineTreeIndex;
+  llvm::yaml::Hex32 Offset{0};
   uint8_t Type;
 
   bool operator==(const PseudoProbeInfo &Other) const {
-    return GUID == Other.GUID && Index == Other.Index;
+    return InlineTreeIndex == Other.InlineTreeIndex && Index == Other.Index;
   }
-  bool operator!=(const PseudoProbeInfo &Other) const {
-    return !(*this == Other);
+  bool operator<(const PseudoProbeInfo &Other) const {
+    if (InlineTreeIndex == Other.InlineTreeIndex)
+      return Index < Other.Index;
+    return InlineTreeIndex < Other.InlineTreeIndex;
   }
 };
 } // end namespace bolt
 
 template <> struct MappingTraits<bolt::PseudoProbeInfo> {
   static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) {
-    YamlIO.mapRequired("guid", PI.GUID);
     YamlIO.mapRequired("id", PI.Index);
     YamlIO.mapRequired("type", PI.Type);
+    YamlIO.mapOptional("inline_tree_id", PI.InlineTreeIndex, (uint32_t)0);
+    YamlIO.mapOptional("offset", PI.Offset, (uint32_t)0);
   }
 
   static const bool flow = true;
@@ -122,7 +126,7 @@ template <> struct MappingTraits<bolt::PseudoProbeInfo> {
 
 LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::CallSiteInfo)
 LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::SuccessorInfo)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::PseudoProbeInfo)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::bolt::PseudoProbeInfo)
 
 namespace llvm {
 namespace yaml {
@@ -163,10 +167,35 @@ template <> struct MappingTraits<bolt::BinaryBasicBlockProfile> {
   }
 };
 
+namespace bolt {
+struct InlineTreeInfo {
+  uint32_t Index;
+  uint32_t ParentIndex;
+  uint32_t CallSiteProbe;
+  llvm::yaml::Hex64 GUID;
+  llvm::yaml::Hex64 Hash;
+  bool operator==(const InlineTreeInfo &Other) const {
+    return Index == Other.Index;
+  }
+};
+} // end namespace bolt
+
+template <> struct MappingTraits<bolt::InlineTreeInfo> {
+  static void mapping(IO &YamlIO, bolt::InlineTreeInfo &ITI) {
+    YamlIO.mapRequired("guid", ITI.GUID);
+    YamlIO.mapRequired("hash", ITI.Hash);
+    YamlIO.mapRequired("id", ITI.Index);
+    YamlIO.mapOptional("parent", ITI.ParentIndex, (uint32_t)0);
+    YamlIO.mapOptional("callsite", ITI.CallSiteProbe, 0);
+  }
+
+  static const bool flow = true;
+};
 } // end namespace yaml
 } // end namespace llvm
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::bolt::BinaryBasicBlockProfile)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::bolt::InlineTreeInfo)
 
 namespace llvm {
 namespace yaml {
@@ -179,8 +208,7 @@ struct BinaryFunctionProfile {
   llvm::yaml::Hex64 Hash{0};
   uint64_t ExecCount{0};
   std::vector<BinaryBasicBlockProfile> Blocks;
-  llvm::yaml::Hex64 GUID{0};
-  llvm::yaml::Hex64 PseudoProbeDescHash{0};
+  std::vector<InlineTreeInfo> InlineTree;
   bool Used{false};
 };
 } // end namespace bolt
@@ -194,9 +222,8 @@ template <> struct MappingTraits<bolt::BinaryFunctionProfile> {
     YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks);
     YamlIO.mapOptional("blocks", BFP.Blocks,
                        std::vector<bolt::BinaryBasicBlockProfile>());
-    YamlIO.mapOptional("guid", BFP.GUID, (uint64_t)0);
-    YamlIO.mapOptional("pseudo_probe_desc_hash", BFP.PseudoProbeDescHash,
-                       (uint64_t)0);
+    YamlIO.mapOptional("inline_tree", BFP.InlineTree,
+                       std::vector<bolt::InlineTreeInfo>());
   }
 };
 
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 10d745cc69824b..803cc4725b5702 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <map>
 #include <optional>
+#include <queue>
 #include <unordered_map>
 #include <utility>
 
@@ -2402,12 +2403,43 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
         const unsigned BlockIndex = BlockMap.getBBIndex(BI.To.Offset);
         YamlBF.Blocks[BlockIndex].ExecCount += BI.Branches;
       }
-      if (PseudoProbeDecoder) {
-        if ((YamlBF.GUID = BF->getGUID())) {
-          const MCPseudoProbeFuncDesc *FuncDesc =
-              PseudoProbeDecoder->getFuncDescForGUID(YamlBF.GUID);
-          YamlBF.PseudoProbeDescHash = FuncDesc->FuncHash;
+      DenseMap<const MCDecodedPseudoProbeInlineTree *, uint32_t>
+          InlineTreeNodeId;
+      if (PseudoProbeDecoder && BF->getGUID()) {
+        std::queue<const MCDecodedPseudoProbeInlineTree *> ITWorklist;
+        // FIXME: faster inline tree lookup by top-level GUID
+        if (const MCDecodedPseudoProbeInlineTree *InlineTree = llvm::find_if(
+                PseudoProbeDecoder->getDummyInlineRoot().getChildren(),
+                [&](const auto &InlineTree) {
+                  return InlineTree.Guid == BF->getGUID();
+                })) {
+          ITWorklist.push(InlineTree);
+          InlineTreeNodeId[InlineTree] = 0;
+          auto Hash =
+              PseudoProbeDecoder->getFuncDescForGUID(BF->getGUID())->FuncHash;
+          YamlBF.InlineTree.emplace_back(
+              yaml::bolt::InlineTreeInfo{0, 0, 0, BF->getGUID(), Hash});
+        }
+        uint32_t ParentId = 0;
+        uint32_t NodeId = 1;
+        while (!ITWorklist.empty()) {
+          const MCDecodedPseudoProbeInlineTree *Cur = ITWorklist.front();
+          for (const MCDecodedPseudoProbeInlineTree &Child :
+               Cur->getChildren()) {
+            InlineTreeNodeId[&Child] = NodeId;
+            auto Hash =
+                PseudoProbeDecoder->getFuncDescForGUID(Child.Guid)->FuncHash;
+            YamlBF.InlineTree.emplace_back(yaml::bolt::InlineTreeInfo{
+                NodeId++, ParentId, std::get<1>(Child.getInlineSite()),
+                Child.Guid, Hash});
+            ITWorklist.push(&Child);
+          }
+          ITWorklist.pop();
+          ++ParentId;
         }
+      }
+
+      if (PseudoProbeDecoder) {
         // Fetch probes belonging to all fragments
         const AddressProbesMap &ProbeMap =
             PseudoProbeDecoder->getAddress2ProbesMap();
@@ -2420,12 +2452,19 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
             const uint32_t OutputAddress = Probe.getAddress();
             const uint32_t InputOffset = BAT->translate(
                 FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true);
-            const unsigned BlockIndex = getBlock(InputOffset).second;
+            const auto [BlockOffset, BlockIndex] = getBlock(InputOffset);
+            uint32_t NodeId = InlineTreeNodeId[Probe.getInlineTreeNode()];
+            uint32_t Offset = InputOffset - BlockOffset;
             YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back(
-                yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(),
+                yaml::bolt::PseudoProbeInfo{Probe.getIndex(), NodeId, Offset,
                                             Probe.getType()});
           }
         }
+        for (yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
+          llvm::sort(YamlBB.PseudoProbes);
+          YamlBB.PseudoProbes.erase(llvm::unique(YamlBB.PseudoProbes),
+                                    YamlBB.PseudoProbes.end());
+        }
       }
       // Drop blocks without a hash, won't be useful for stale matching.
       llvm::erase_if(YamlBF.Blocks,
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index ffbf2388e912fb..817689230e2a70 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -12,11 +12,15 @@
 #include "bolt/Profile/BoltAddressTranslation.h"
 #include "bolt/Profile/DataAggregator.h"
 #include "bolt/Profile/ProfileReaderBase.h"
+#include "bolt/Profile/ProfileYAMLMapping.h"
 #include "bolt/Rewrite/RewriteInstance.h"
 #include "bolt/Utils/CommandLineOpts.h"
+#include "llvm/MC/MCPseudoProbe.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
+#include <deque>
+#include <queue>
 
 #undef  DEBUG_TYPE
 #define DEBUG_TYPE "bolt-prof"
@@ -77,13 +81,6 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
   YamlBF.Hash = BF.getHash();
   YamlBF.NumBasicBlocks = BF.size();
   YamlBF.ExecCount = BF.getKnownExecutionCount();
-  if (PseudoProbeDecoder) {
-    if ((YamlBF.GUID = BF.getGUID())) {
-      const MCPseudoProbeFuncDesc *FuncDesc =
-          PseudoProbeDecoder->getFuncDescForGUID(YamlBF.GUID);
-      YamlBF.PseudoProbeDescHash = FuncDesc->FuncHash;
-    }
-  }
 
   BinaryFunction::BasicBlockOrderType Order;
   llvm::copy(UseDFS ? BF.dfs() : BF.getLayout().blocks(),
@@ -92,6 +89,40 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
   const FunctionLayout Layout = BF.getLayout();
   Layout.updateLayoutIndices(Order);
 
+  DenseMap<const MCDecodedPseudoProbeInlineTree *, uint32_t> InlineTreeNodeId;
+  if (PseudoProbeDecoder && BF.getGUID()) {
+    std::queue<const MCDecodedPseudoProbeInlineTree *> ITWorklist;
+    // FIXME: faster inline tree lookup by top-level GUID
+    if (const MCDecodedPseudoProbeInlineTree *InlineTree = llvm::find_if(
+            PseudoProbeDecoder->getDummyInlineRoot().getChildren(),
+            [&](const auto &InlineTree) {
+              return InlineTree.Guid == BF.getGUID();
+            })) {
+      ITWorklist.push(InlineTree);
+      InlineTreeNodeId[InlineTree] = 0;
+      auto Hash =
+          PseudoProbeDecoder->getFuncDescForGUID(BF.getGUID())->FuncHash;
+      YamlBF.InlineTree.emplace_back(
+          yaml::bolt::InlineTreeInfo{0, 0, 0, BF.getGUID(), Hash});
+    }
+    uint32_t ParentId = 0;
+    uint32_t NodeId = 1;
+    while (!ITWorklist.empty()) {
+      const MCDecodedPseudoProbeInlineTree *Cur = ITWorklist.front();
+      for (const MCDecodedPseudoProbeInlineTree &Child : Cur->getChildren()) {
+        InlineTreeNodeId[&Child] = NodeId;
+        auto Hash =
+            PseudoProbeDecoder->getFuncDescForGUID(Child.Guid)->FuncHash;
+        YamlBF.InlineTree.emplace_back(yaml::bolt::InlineTreeInfo{
+            NodeId++, ParentId, std::get<1>(Child.getInlineSite()), Child.Guid,
+            Hash});
+        ITWorklist.push(&Child);
+      }
+      ITWorklist.pop();
+      ++ParentId;
+    }
+  }
+
   for (const BinaryBasicBlock *BB : Order) {
     yaml::bolt::BinaryBasicBlockProfile YamlBB;
     YamlBB.Index = BB->getLayoutIndex();
@@ -198,10 +229,18 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
       const uint64_t FuncAddr = BF.getAddress();
       const std::pair<uint64_t, uint64_t> &BlockRange =
           BB->getInputAddressRange();
-      for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(
-               FuncAddr + BlockRange.first, FuncAddr + BlockRange.second))
+      const std::pair<uint64_t, uint64_t> BlockAddrRange = {
+          FuncAddr + BlockRange.first, FuncAddr + BlockRange.second};
+      for (const MCDecodedPseudoProbe &Probe :
+           ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second)) {
+        uint32_t NodeId = InlineTreeNodeId[Probe.getInlineTreeNode()];
+        uint32_t Offset = Probe.getAddress() - BlockAddrRange.first;
         YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{
-            Probe.getGuid(), Probe.getIndex(), Probe.getType()});
+            Probe.getIndex(), NodeId, Offset, Probe.getType()});
+      }
+      llvm::sort(YamlBB.PseudoProbes);
+      YamlBB.PseudoProbes.erase(llvm::unique(YamlBB.PseudoProbes),
+                                YamlBB.PseudoProbes.end());
     }
 
     YamlBF.Blocks.emplace_back(YamlBB);
diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test
index 1fdd00c7ef6c4b..629dd84ab8e1dc 100644
--- a/bolt/test/X86/pseudoprobe-decoding-inline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-inline.test
@@ -14,29 +14,38 @@
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   pseudo_probes: [ { guid: 0xE413754A191DB537, id: 1, type: 0 }, { guid: 0xE413754A191DB537, id: 4, type: 0 } ]
-# CHECK-YAML: guid: 0xE413754A191DB537
-# CHECK-YAML: pseudo_probe_desc_hash: 0x10E852DA94
+# CHECK-YAML:      pseudo_probes:
+# CHECK-YAML-NEXT:   - { id: 1, type: 0
+# CHECK-YAML-NEXT:   - { id: 4, type: 0
+# CHECK-YAML:      inline_tree:
+# CHECK-YAML-NEXT:   - { guid: 0xE413754A191DB537, hash: 0x10E852DA94, id: 0 }
 #
 # CHECK-YAML: name: foo
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   pseudo_probes: [ { guid: 0x5CF8C24CDB18BDAC, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 2, type: 0 } ]
-# CHECK-YAML: guid: 0x5CF8C24CDB18BDAC
-# CHECK-YAML: pseudo_probe_desc_hash: 0x200205A19C5B4
+# CHECK-YAML:      pseudo_probes:
+# CHECK-YAML-NEXT: - { id: 1, type: 0 }
+# CHECK-YAML-NEXT: - { id: 2, type: 0 }
+# CHECK-YAML:      inline_tree:
+# CHECK-YAML-NEXT:   - { guid: 0x5CF8C24CDB18BDAC, hash: 0x200205A19C5B4, id: 0 }
+# CHECK-YAML-NEXT:   - { guid: 0xE413754A191DB537, hash: 0x10E852DA94, id: 1, callsite: 8 }
 #
 # CHECK-YAML: name: main
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 2, type: 0 } ]
-# CHECK-YAML: guid: 0xDB956436E78DD5FA
-# CHECK-YAML: pseudo_probe_desc_hash: 0x10000FFFFFFFF
+# CHECK-YAML:      pseudo_probes:
+# CHECK-YAML-NEXT: - { id: 1, type: 0 }
+# CHECK-YAML-NEXT: - { id: 1, type: 0, inline_tree_id: 1 }
+# CHECK-YAML-NEXT: - { id: 2, type: 0, inline_tree_id: 1 }
+# CHECK-YAML:      inline_tree:
+# CHECK-YAML-NEXT:   - { guid: 0xDB956436E78DD5FA, hash: 0x10000FFFFFFFF, id: 0 }
+# CHECK-YAML-NEXT:   - { guid: 0x5CF8C24CDB18BDAC, hash: 0x200205A19C5B4, id: 1, callsite: 2 }
+# CHECK-YAML-NEXT:   - { guid: 0xE413754A191DB537, hash: 0x10E852DA94, id: 2, parent: 1, callsite: 8 }
 #
 ## Check that without --profile-write-pseudo-probes option, no pseudo probes are
 ## generated
 # RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata
 # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-NO-OPT
 # CHECK-NO-OPT-NOT: pseudo_probes
-# CHECK-NO-OPT-NOT: guid
-# CHECK-NO-OPT-NOT: pseudo_probe_desc_hash
+# CHECK-NO-OPT-NOT: inline_tree
 
 CHECK: Report of decoding input pseudo probe binaries
 

>From 36197b175681d07b4704e576fb008cec3cc1e05e Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 28 Aug 2024 21:10:25 +0200
Subject: [PATCH 39/39] Reworked block probe matching

Use new probe ifaces
Get all function probes at once
Drop ProfileUsePseudoProbes
Unify matchWithBlockPseudoProbes
Distinguish exact and loose probe match
---
 bolt/include/bolt/Core/BinaryContext.h    |  20 +-
 bolt/lib/Passes/BinaryPasses.cpp          |  40 ++-
 bolt/lib/Profile/StaleProfileMatching.cpp | 404 ++++++++++------------
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp  |   8 +-
 4 files changed, 237 insertions(+), 235 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 3e20cb607e657b..3f7b2ac0bc6cf9 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -724,14 +724,26 @@ class BinaryContext {
     uint32_t NumStaleBlocks{0};
     ///   the number of exactly matched basic blocks
     uint32_t NumExactMatchedBlocks{0};
-    ///   the number of pseudo probe matched basic blocks
-    uint32_t NumPseudoProbeMatchedBlocks{0};
+    ///   the number of loosely matched basic blocks
+    uint32_t NumLooseMatchedBlocks{0};
+    ///   the number of exactly pseudo probe matched basic blocks
+    uint32_t NumPseudoProbeExactMatchedBlocks{0};
+    ///   the number of loosely pseudo probe matched basic blocks
+    uint32_t NumPseudoProbeLooseMatchedBlocks{0};
+    ///   the number of call matched basic blocks
+    uint32_t NumCallMatchedBlocks{0};
     ///   the total count of samples in the profile
     uint64_t StaleSampleCount{0};
     ///   the count of exactly matched samples
     uint64_t ExactMatchedSampleCount{0};
-    ///   the count of pseudo probe matched samples
-    uint64_t PseudoProbeMatchedSampleCount{0};
+    ///   the count of loosely matched samples
+    uint64_t LooseMatchedSampleCount{0};
+    ///   the count of exactly pseudo probe matched samples
+    uint64_t PseudoProbeExactMatchedSampleCount{0};
+    ///   the count of loosely pseudo probe matched samples
+    uint64_t PseudoProbeLooseMatchedSampleCount{0};
+    ///   the count of call matched samples
+    uint64_t CallMatchedSampleCount{0};
     ///   the number of stale functions that have matching number of blocks in
     ///   the profile
     uint64_t NumStaleFuncsWithEqualBlockCount{0};
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index b786f07a6a6651..8edbd58c3ed3de 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1524,15 +1524,43 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
         100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount,
         BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount);
     BC.outs() << format(
-        "BOLT-INFO: inference found a pseudo probe match for %.2f%% of basic "
+        "BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of "
+        "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
+        " (%zu out of %zu stale)\n",
+        100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks /
+            BC.Stats.NumStaleBlocks,
+        BC.Stats.NumPseudoProbeExactMatchedBlocks, BC.Stats.NumStaleBlocks,
+        100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount /
+            BC.Stats.StaleSampleCount,
+        BC.Stats.PseudoProbeExactMatchedSampleCount, BC.Stats.StaleSampleCount);
+    BC.outs() << format(
+        "BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of "
+        "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
+        " (%zu out of %zu stale)\n",
+        100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks /
+            BC.Stats.NumStaleBlocks,
+        BC.Stats.NumPseudoProbeLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
+        100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount /
+            BC.Stats.StaleSampleCount,
+        BC.Stats.PseudoProbeLooseMatchedSampleCount, BC.Stats.StaleSampleCount);
+    BC.outs() << format(
+        "BOLT-INFO: inference found a call match for %.2f%% of basic "
         "blocks"
         " (%zu out of %zu stale) responsible for %.2f%% samples"
         " (%zu out of %zu stale)\n",
-        100.0 * BC.Stats.NumPseudoProbeMatchedBlocks / BC.Stats.NumStaleBlocks,
-        BC.Stats.NumPseudoProbeMatchedBlocks, BC.Stats.NumStaleBlocks,
-        100.0 * BC.Stats.PseudoProbeMatchedSampleCount /
-            BC.Stats.StaleSampleCount,
-        BC.Stats.PseudoProbeMatchedSampleCount, BC.Stats.StaleSampleCount);
+        100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks,
+        BC.Stats.NumCallMatchedBlocks, BC.Stats.NumStaleBlocks,
+        100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount,
+        BC.Stats.CallMatchedSampleCount, BC.Stats.StaleSampleCount);
+    BC.outs() << format(
+        "BOLT-INFO: inference found a loose match for %.2f%% of basic "
+        "blocks"
+        " (%zu out of %zu stale) responsible for %.2f%% samples"
+        " (%zu out of %zu stale)\n",
+        100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks,
+        BC.Stats.NumLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
+        100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount,
+        BC.Stats.LooseMatchedSampleCount, BC.Stats.StaleSampleCount);
   }
 
   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index ef9320ae168fe7..2ec74ac7549f7c 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -29,6 +29,7 @@
 #include "bolt/Profile/YAMLProfileReader.h"
 #include "llvm/ADT/Bitfields.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/MC/MCPseudoProbe.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/xxhash.h"
@@ -46,7 +47,6 @@ namespace opts {
 extern cl::opt<bool> TimeRewrite;
 extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<unsigned> Verbosity;
-extern cl::opt<bool> ProfileUsePseudoProbes;
 
 cl::opt<bool>
     InferStaleProfile("infer-stale-profile",
@@ -198,8 +198,6 @@ struct BlendedBlockHash {
 /// release.
 class StaleMatcher {
 public:
-  StaleMatcher(const uint64_t YamlBFGUID) : YamlBFGUID(YamlBFGUID) {}
-
   /// Initialize stale matcher.
   void init(const std::vector<FlowBlock *> &Blocks,
             const std::vector<BlendedBlockHash> &Hashes,
@@ -217,39 +215,38 @@ class StaleMatcher {
     }
   }
 
-  /// Creates a mapping from a inlined pseudo probe's guid and index to probe.
-  void mapGUIDAndIndexToProbe(uint64_t Guid, uint64_t Index,
-                              const MCDecodedPseudoProbe *Probe) {
-    IndexAndGUIDToInlinedProbes[Guid][Index].push_back(Probe);
-  }
-
-  /// Creates a mapping from a pseudo probe index to pseudo probe.
-  void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) {
-    IndexToProbes[Index].push_back(Probe);
-  }
-
   /// Creates a mapping from a pseudo probe to a flow block.
   void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) {
     BBPseudoProbeToBlock[Probe] = Block;
   }
 
+  enum MatchMethod : char {
+    MATCH_EXACT = 0,
+    MATCH_PROBE_EXACT,
+    MATCH_PROBE_LOOSE,
+    MATCH_OPCODE,
+    MATCH_CALL,
+    NO_MATCH
+  };
+
   /// Find the most similar flow block for a profile block given its hashes and
   /// pseudo probe information.
-  const FlowBlock *
+  std::pair<const FlowBlock *, MatchMethod>
   matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash,
-             const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) {
-    const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash);
-    if (BestBlock) {
-      ++MatchedWithOpcodes;
-      return BestBlock;
-    }
-    BestBlock = matchWithCalls(BlendedHash, CallHash);
-    if (BestBlock)
-      return BestBlock;
-    BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes);
-    if (BestBlock)
-      MatchedWithPseudoProbes.insert(BlendedHash.combine());
-    return BestBlock;
+             const ArrayRef<yaml::bolt::PseudoProbeInfo> PseudoProbes,
+             const ArrayRef<yaml::bolt::InlineTreeInfo> InlineTree) {
+    const auto &[Block, Hash] = matchWithOpcodes(BlendedHash);
+    if (isHighConfidenceMatch(Hash, BlendedHash))
+      return {Block, MATCH_EXACT};
+    const auto &[ProbeBlock, Exact] =
+        matchWithPseudoProbes(PseudoProbes, InlineTree);
+    if (ProbeBlock)
+      return {ProbeBlock, Exact ? MATCH_PROBE_EXACT : MATCH_PROBE_LOOSE};
+    if (const FlowBlock *BestBlock = matchWithCalls(BlendedHash, CallHash))
+      return {BestBlock, MATCH_CALL};
+    if (Block)
+      return {Block, MATCH_OPCODE};
+    return {nullptr, NO_MATCH};
   }
 
   /// Returns true if the two basic blocks (in the binary and in the profile)
@@ -260,48 +257,49 @@ class StaleMatcher {
     return Hash1.InstrHash == Hash2.InstrHash;
   }
 
-  /// Returns true if a profiled block was matched with its pseudo probe.
-  bool isPseudoProbeMatch(BlendedBlockHash YamlBBHash) {
-    return MatchedWithPseudoProbes.find(YamlBBHash.combine()) !=
-           MatchedWithPseudoProbes.end();
+  /// Returns matched InlineTree * for a given profile inline_tree_id.
+  const MCDecodedPseudoProbeInlineTree *
+  getInlineTreeNode(uint32_t ProfileInlineTreeNodeId) const {
+    auto It = InlineTreeNodeMap.find(ProfileInlineTreeNodeId);
+    if (It == InlineTreeNodeMap.end())
+      return nullptr;
+    return It->second;
   }
 
-  /// Returns the number of blocks matched with opcodes.
-  size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; }
-
-  /// Returns the number of blocks matched with pseudo probes.
-  size_t getNumBlocksMatchedWithPseudoProbes() const {
-    return MatchedWithPseudoProbes.size();
+  void mapInlineTreeNode(uint32_t ProfileNode,
+                         const MCDecodedPseudoProbeInlineTree *BinaryNode) {
+    auto Res = InlineTreeNodeMap.try_emplace(ProfileNode, BinaryNode);
+    assert(Res.second &&
+           "Duplicate mapping from profile node index to binary inline tree");
+    (void)Res;
   }
 
 private:
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
   std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
-  DenseMap<uint64_t, std::vector<const MCDecodedPseudoProbe *>> IndexToProbes;
-  DenseMap<uint64_t,
-           DenseMap<uint64_t, std::vector<const MCDecodedPseudoProbe *>>>
-      IndexAndGUIDToInlinedProbes;
+  DenseMap<uint32_t, const MCDecodedPseudoProbeInlineTree *> InlineTreeNodeMap;
   DenseMap<const MCDecodedPseudoProbe *, FlowBlock *> BBPseudoProbeToBlock;
-  DenseSet<uint64_t> MatchedWithPseudoProbes;
-  const uint64_t YamlBFGUID{0};
-  uint64_t MatchedWithOpcodes{0};
 
-  // Uses OpcodeHash to find the most similar block for a given hash.
-  const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const {
+  // Uses OpcodeHash to find the most similar block (with blended hash) for a
+  // given hash.
+  std::pair<const FlowBlock *, BlendedBlockHash>
+  matchWithOpcodes(BlendedBlockHash BlendedHash) const {
     auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
     if (BlockIt == OpHashToBlocks.end())
-      return nullptr;
+      return {nullptr, BlendedBlockHash(0)};
     FlowBlock *BestBlock = nullptr;
     uint64_t BestDist = std::numeric_limits<uint64_t>::max();
+    BlendedBlockHash BestHash;
     for (const auto &[Hash, Block] : BlockIt->second) {
       uint64_t Dist = Hash.distance(BlendedHash);
       if (BestBlock == nullptr || Dist < BestDist) {
         BestDist = Dist;
         BestBlock = Block;
+        BestHash = Hash;
       }
     }
-    return BestBlock;
+    return {BestBlock, BestHash};
   }
 
   // Uses CallHash to find the most similar block for a given hash.
@@ -326,120 +324,71 @@ class StaleMatcher {
     return BestBlock;
   }
 
-  /// A helper function for logging.
-  static bool LogErrIfExpr(bool Expr, StringRef Message) {
-    if (Expr)
-      errs() << Message;
-    return Expr;
-  }
-
-  /// Matches an inlined profile block with an inlined binary block based on
-  /// pseudo probes.
-  const FlowBlock *matchWithInlinedBlockPseudoProbes(
-      SmallVector<const yaml::bolt::PseudoProbeInfo *>
-          &InlinedBlockPseudoProbes) const {
-    if (opts::Verbosity >= 3)
-      outs() << "BOLT-INFO: attempting to match block with inlined block "
-                "pseudo probes\n";
-
-    size_t NInlinedBlockPseudoProbes = InlinedBlockPseudoProbes.size();
-    if (LogErrIfExpr(NInlinedBlockPseudoProbes == 0,
-                     "BOLT-WARNING: no pseudo probes in profile block\n"))
-      return nullptr;
-    if (LogErrIfExpr(
-            NInlinedBlockPseudoProbes > 1,
-            "BOLT-WARNING: more than 1 pseudo probes in profile block\n"))
-      return nullptr;
-
-    const auto *InlinedPseudoProbe = InlinedBlockPseudoProbes[0];
-    uint64_t Guid = InlinedPseudoProbe->GUID;
-    uint64_t Index = InlinedPseudoProbe->Index;
-
-    auto GuidIt = IndexAndGUIDToInlinedProbes.find(Guid);
-    if (LogErrIfExpr(
-            GuidIt == IndexAndGUIDToInlinedProbes.end(),
-            "BOLT-WARNING: no pseudo probes found within BB at index\n"))
-      return nullptr;
-    auto IndexIt = GuidIt->second.find(Index);
-    if (LogErrIfExpr(
-            IndexIt == GuidIt->second.end(),
-            "BOLT-WARNING: no pseudo probes found within BB at index\n"))
-      return nullptr;
-
-    if (LogErrIfExpr(
-            IndexIt->second.size() > 1,
-            "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n"))
-      return nullptr;
-
-    const MCDecodedPseudoProbe *BinaryPseudoProbe = IndexIt->second[0];
-    auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe);
-    assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() &&
-           "All binary pseudo probes should belong a binary basic block");
-
-    return BinaryPseudoProbeIt->second;
-  }
-
   /// Matches a profile block with an binary block based on pseudo probes.
-  const FlowBlock *matchWithNonInlinedBlockPseudoProbes(
-      SmallVector<const yaml::bolt::PseudoProbeInfo *> &BlockPseudoProbes)
-      const {
-    if (opts::Verbosity >= 3)
-      outs() << "BOLT-INFO: attempting to match block with inlined block "
-                "pseudo probes\n";
-
-    size_t NBlockPseudoProbes = BlockPseudoProbes.size();
-    if (LogErrIfExpr(NBlockPseudoProbes == 0,
-                     "BOLT-WARNING: no pseudo probes in profile block\n"))
-      return nullptr;
-    if (LogErrIfExpr(
-            NBlockPseudoProbes > 1,
-            "BOLT-WARNING: more than 1 pseudo probes in profile block\n"))
-      return nullptr;
-    uint64_t Index = BlockPseudoProbes[0]->Index;
-    auto It = IndexToProbes.find(Index);
-    if (LogErrIfExpr(
-            It == IndexToProbes.end(),
-            "BOLT-WARNING: no block pseudo probes found within BB at index\n"))
-      return nullptr;
-    if (LogErrIfExpr(
-            It->second.size() > 1,
-            "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n"))
-      return nullptr;
-    const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0];
-    auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe);
-    assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() &&
-           "All binary pseudo probes should belong a binary basic block");
+  /// Returns the best matching block (or nullptr) and whether the match is
+  /// unambiguous.
+  std::pair<const FlowBlock *, bool> matchWithPseudoProbes(
+      const ArrayRef<yaml::bolt::PseudoProbeInfo> BlockPseudoProbes,
+      const ArrayRef<yaml::bolt::InlineTreeInfo> InlineTree) const {
+    if (!opts::StaleMatchingWithBlockPseudoProbes)
+      return {nullptr, false};
+
+    auto logIf = [](bool Expr, StringRef Message) {
+      LLVM_DEBUG(if (Expr) errs() << Message << '\n');
+      return Expr;
+    };
 
-    return BinaryPseudoProbeIt->second;
-  }
+    DenseMap<const FlowBlock *, uint32_t> FlowBlockMatchCount;
 
-  /// Uses pseudo probe information to attach the profile to the appropriate
-  /// block.
-  const FlowBlock *matchWithPseudoProbes(
-      BlendedBlockHash BlendedHash,
-      const std::vector<yaml::bolt::PseudoProbeInfo> &PseudoProbes) const {
-    if (!opts::StaleMatchingWithBlockPseudoProbes || !YamlBFGUID)
-      return nullptr;
-
-    // Searches for the pseudo probe attached to the matched function's block.
-    SmallVector<const yaml::bolt::PseudoProbeInfo *> BlockPseudoProbes;
-    SmallVector<const yaml::bolt::PseudoProbeInfo *> InlinedBlockPseudoProbes;
-    for (const auto &PseudoProbe : PseudoProbes) {
-      // Skips pseudo probes attached to function calls.
-      if (PseudoProbe.Type != static_cast<uint8_t>(PseudoProbeType::Block))
+    for (const yaml::bolt::PseudoProbeInfo &Probe : BlockPseudoProbes) {
+      const MCDecodedPseudoProbeInlineTree *InlineTreeNode =
+          getInlineTreeNode(Probe.InlineTreeIndex);
+      if (logIf(!InlineTreeNode,
+                formatv("no matching inline tree node for {0} {1}",
+                        Probe.InlineTreeIndex, Probe.Index).str())) {
+        ++FlowBlockMatchCount[nullptr];
         continue;
-      if (PseudoProbe.GUID != YamlBFGUID)
-        InlinedBlockPseudoProbes.push_back(&PseudoProbe);
-      else
-        BlockPseudoProbes.push_back(&PseudoProbe);
+      }
+      const MCDecodedPseudoProbe *BinaryProbe = nullptr;
+      for (const MCDecodedPseudoProbe &FuncProbe :
+           InlineTreeNode->getProbes()) {
+        if (FuncProbe.getIndex() != Probe.Index)
+          continue;
+        BinaryProbe = &FuncProbe;
+        break;
+      }
+      if (logIf(!BinaryProbe, formatv("no matching binary probe for {0} {1}",
+                                      Probe.InlineTreeIndex, Probe.Index)
+                                  .str())) {
+        ++FlowBlockMatchCount[nullptr];
+        continue;
+      }
+      auto It = BBPseudoProbeToBlock.find(BinaryProbe);
+      if (logIf(It == BBPseudoProbeToBlock.end(),
+                formatv("no probe->block for {0} {1}", Probe.InlineTreeIndex,
+                        Probe.Index)
+                    .str())) {
+        ++FlowBlockMatchCount[nullptr];
+        continue;
+      }
+      const FlowBlock *Block = It->second;
+      ++FlowBlockMatchCount[Block];
+    }
+    uint32_t BestMatchCount = 0;
+    uint32_t TotalMatchCount = 0;
+    const FlowBlock *BestMatchBlock = nullptr;
+    for (auto &[Block, Count] : FlowBlockMatchCount) {
+      logIf(true, formatv("block {0} count {1}",
+                          Block ? Block->Index : UINT64_MAX, Count)
+                      .str());
+      TotalMatchCount += Count;
+      if (Count > BestMatchCount ||
+          (Count == BestMatchCount && !BestMatchBlock)) {
+        BestMatchBlock = Block;
+        BestMatchCount = Count;
+      }
     }
-    // Returns nullptr if there is not a 1:1 mapping of the profile block pseudo
-    // probe and a binary block pseudo probe.
-    const FlowBlock *MatchedInlinedBlock =
-        matchWithInlinedBlockPseudoProbes(InlinedBlockPseudoProbes);
-    return MatchedInlinedBlock
-               ? MatchedInlinedBlock
-               : matchWithNonInlinedBlockPseudoProbes(BlockPseudoProbes);
+    return {BestMatchBlock, BestMatchCount == TotalMatchCount};
   }
 };
 
@@ -630,26 +579,7 @@ size_t matchWeightsByHashes(
 
   assert(Func.Blocks.size() == BlockOrder.size() + 2);
 
-  // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or
-  // binary function dne or they are not equal, to zero, as not to perform
-  // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for
-  // pseudo probe block matching.
-  const MCPseudoProbeDecoder *PseudoProbeDecoder =
-      opts::ProfileUsePseudoProbes && opts::StaleMatchingWithBlockPseudoProbes
-          ? BC.getPseudoProbeDecoder()
-          : nullptr;
-  uint64_t BFPseudoProbeDescHash = 0;
-  if (opts::ProfileUsePseudoProbes &&
-      opts::StaleMatchingWithBlockPseudoProbes && BF.getGUID() != 0) {
-    assert(PseudoProbeDecoder &&
-           "If BF has pseudo probe, BC should have a pseudo probe decoder");
-    auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap();
-    auto It = GUID2FuncDescMap.find(BF.getGUID());
-    if (It != GUID2FuncDescMap.end())
-      BFPseudoProbeDescHash = It->second.FuncHash;
-  }
-
-  StaleMatcher Matcher(YamlBF.GUID);
+  StaleMatcher Matcher;
   std::vector<uint64_t> CallHashes;
   std::vector<FlowBlock *> Blocks;
   std::vector<BlendedBlockHash> BlendedHashes;
@@ -672,38 +602,55 @@ size_t matchWeightsByHashes(
     Blocks.push_back(&Func.Blocks[I + 1]);
     BlendedBlockHash BlendedHash(BB->getHash());
     BlendedHashes.push_back(BlendedHash);
-    // Collects pseudo probes attached to the BB for use in the StaleMatcher.
-    if (opts::ProfileUsePseudoProbes &&
-        opts::StaleMatchingWithBlockPseudoProbes && BFPseudoProbeDescHash &&
-        YamlBF.PseudoProbeDescHash &&
-        BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash) {
-      assert(PseudoProbeDecoder &&
-             "If pseudo probes are in use, psuedo probe decoder should exist");
-      const AddressProbesMap &ProbeMap =
-          PseudoProbeDecoder->getAddress2ProbesMap();
-      const uint64_t FuncAddr = BF.getAddress();
-      const std::pair<uint64_t, uint64_t> &BlockRange =
-          BB->getInputAddressRange();
-      const auto &BlockProbes =
-          llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first),
-                           ProbeMap.lower_bound(FuncAddr + BlockRange.second));
-      for (const auto &[_, Probes] : BlockProbes) {
-        for (const MCDecodedPseudoProbe &Probe : Probes) {
-          if (Probe.getType() != static_cast<uint8_t>(PseudoProbeType::Block))
-            continue;
-          if (Probe.getInlineTreeNode()->hasInlineSite())
-            Matcher.mapGUIDAndIndexToProbe(Probe.getGuid(), Probe.getIndex(),
-                                           &Probe);
-          else
-            Matcher.mapIndexToProbe(Probe.getIndex(), &Probe);
-          Matcher.mapProbeToBB(&Probe, Blocks[I]);
-        }
-      }
-    }
 
     LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                       << Twine::utohexstr(BB->getHash()) << "\n");
   }
+  // Collects function pseudo probes for use in the StaleMatcher.
+  if (opts::StaleMatchingWithBlockPseudoProbes) {
+    const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
+    assert(PseudoProbeDecoder &&
+           "If pseudo probes are in use, pseudo probe decoder should exist");
+    const AddressProbesMap &ProbeMap =
+        PseudoProbeDecoder->getAddress2ProbesMap();
+    const uint64_t FuncAddr = BF.getAddress();
+    for (const MCDecodedPseudoProbe &Probe :
+         ProbeMap.find(FuncAddr, FuncAddr + BF.getSize()))
+      if (const BinaryBasicBlock *BB =
+              BF.getBasicBlockContainingOffset(Probe.getAddress() - FuncAddr))
+        Matcher.mapProbeToBB(&Probe, Blocks[BB->getIndex()]);
+    // Match inline tree nodes by GUID, checksum, parent, and call site.
+    unsigned MatchedNodes = 0;
+    const MCDecodedPseudoProbeInlineTree *DummyInlineRoot =
+        &PseudoProbeDecoder->getDummyInlineRoot();
+    for (const yaml::bolt::InlineTreeInfo &InlineTreeNode : YamlBF.InlineTree) {
+      uint64_t GUID = InlineTreeNode.GUID;
+      uint64_t Hash = InlineTreeNode.Hash;
+      uint32_t InlineTreeNodeId = InlineTreeNode.Index;
+      uint32_t ParentId = InlineTreeNode.ParentIndex;
+      uint32_t CallSiteProbe = InlineTreeNode.CallSiteProbe;
+      const MCDecodedPseudoProbeInlineTree *ParentNode =
+          InlineTreeNodeId ? Matcher.getInlineTreeNode(ParentId)
+                           : DummyInlineRoot;
+      if (!ParentNode)
+        continue;
+      for (const MCDecodedPseudoProbeInlineTree &Child :
+           ParentNode->getChildren()) {
+        if (Child.Guid != GUID ||
+            PseudoProbeDecoder->getFuncDescForGUID(GUID)->FuncHash != Hash)
+          continue;
+        // Check inline site for non-top-level inline tree nodes.
+        if (ParentNode != DummyInlineRoot &&
+            std::get<1>(Child.getInlineSite()) != CallSiteProbe)
+          continue;
+        Matcher.mapInlineTreeNode(InlineTreeNodeId, &Child);
+        ++MatchedNodes;
+        break;
+      }
+    }
+    LLVM_DEBUG(errs() << "matched " << MatchedNodes << "/"
+                      << YamlBF.InlineTree.size() << " inline tree nodes\n");
+  }
   Matcher.init(Blocks, BlendedHashes, CallHashes);
 
   // Index in yaml profile => corresponding (matched) block
@@ -724,7 +671,9 @@ size_t matchWeightsByHashes(
       else
         llvm_unreachable("Unhandled HashFunction");
     }
-    MatchedBlock = Matcher.matchBlock(YamlHash, CallHash, YamlBB.PseudoProbes);
+    StaleMatcher::MatchMethod Method;
+    std::tie(MatchedBlock, Method) = Matcher.matchBlock(
+        YamlHash, CallHash, YamlBB.PseudoProbes, YamlBF.InlineTree);
     if (MatchedBlock == nullptr && YamlBB.Index == 0)
       MatchedBlock = Blocks[0];
     if (MatchedBlock != nullptr) {
@@ -737,16 +686,34 @@ size_t matchWeightsByHashes(
                         << " with hash " << Twine::utohexstr(BinHash.combine())
                         << "\n");
       // Update matching stats accounting for the matched block.
-      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
+      switch (Method) {
+      case StaleMatcher::MATCH_EXACT:
         ++BC.Stats.NumExactMatchedBlocks;
         BC.Stats.ExactMatchedSampleCount += YamlBB.ExecCount;
-        LLVM_DEBUG(dbgs() << "  exact match\n");
-      } else if (Matcher.isPseudoProbeMatch(YamlHash)) {
-        ++BC.Stats.NumPseudoProbeMatchedBlocks;
-        BC.Stats.PseudoProbeMatchedSampleCount += YamlBB.ExecCount;
-        LLVM_DEBUG(dbgs() << "  pseudo probe match\n");
-      } else {
-        LLVM_DEBUG(dbgs() << "  loose match\n");
+        LLVM_DEBUG(dbgs() << "  exact hash match\n");
+        break;
+      case StaleMatcher::MATCH_PROBE_EXACT:
+        ++BC.Stats.NumPseudoProbeExactMatchedBlocks;
+        BC.Stats.PseudoProbeExactMatchedSampleCount += YamlBB.ExecCount;
+        LLVM_DEBUG(dbgs() << "  exact pseudo probe match\n");
+        break;
+      case StaleMatcher::MATCH_PROBE_LOOSE:
+        ++BC.Stats.NumPseudoProbeLooseMatchedBlocks;
+        BC.Stats.PseudoProbeLooseMatchedSampleCount += YamlBB.ExecCount;
+        LLVM_DEBUG(dbgs() << "  loose pseudo probe match\n");
+        break;
+      case StaleMatcher::MATCH_CALL:
+        ++BC.Stats.NumCallMatchedBlocks;
+        BC.Stats.CallMatchedSampleCount += YamlBB.ExecCount;
+        LLVM_DEBUG(dbgs() << "  call match\n");
+        break;
+      case StaleMatcher::MATCH_OPCODE:
+        ++BC.Stats.NumLooseMatchedBlocks;
+        BC.Stats.LooseMatchedSampleCount += YamlBB.ExecCount;
+        LLVM_DEBUG(dbgs() << "  loose hash match\n");
+        break;
+      case StaleMatcher::NO_MATCH:
+        LLVM_DEBUG(dbgs() << "  no match\n");
       }
       if (YamlBB.NumInstructions == BB->size())
         ++BC.Stats.NumStaleBlocksWithEqualIcount;
@@ -761,13 +728,6 @@ size_t matchWeightsByHashes(
     BC.Stats.StaleSampleCount += YamlBB.ExecCount;
   }
 
-  if (opts::Verbosity >= 2) {
-    outs() << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithPseudoProbes()
-           << " blocks matched with pseudo probes\n"
-           << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithOpcodes()
-           << " blocks matched with opcodes\n";
-  }
-
   // Match jumps from the profile to the jumps from CFG
   std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
   std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 4b3f9ab4cb64ae..43ab0d9fd63e51 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -51,6 +51,7 @@ static cl::opt<PrintPseudoProbesOptions> PrintPseudoProbes(
     cl::Hidden, cl::cat(BoltCategory));
 
 extern cl::opt<bool> ProfileWritePseudoProbes;
+extern cl::opt<bool> StaleMatchingWithBlockPseudoProbes;
 } // namespace opts
 
 namespace {
@@ -92,14 +93,15 @@ class PseudoProbeRewriter final : public MetadataRewriter {
 };
 
 Error PseudoProbeRewriter::preCFGInitializer() {
-  if (opts::ProfileWritePseudoProbes)
-    parsePseudoProbe(true);
+  if (opts::ProfileWritePseudoProbes ||
+      opts::StaleMatchingWithBlockPseudoProbes)
+    parsePseudoProbe(opts::ProfileWritePseudoProbes);
 
   return Error::success();
 }
 
 Error PseudoProbeRewriter::postEmitFinalizer() {
-  if (!opts::ProfileWritePseudoProbes)
+  if (!opts::StaleMatchingWithBlockPseudoProbes)
     parsePseudoProbe();
   updatePseudoProbes();
 



More information about the llvm-branch-commits mailing list