[llvm] [BOLT] Report flow conservation scores (PR #127954)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 27 20:12:25 PST 2025


https://github.com/ShatianWang updated https://github.com/llvm/llvm-project/pull/127954

>From b18bb596c9345eb85231490436498ab434539c4b Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Mon, 13 Jan 2025 11:17:55 -0800
Subject: [PATCH 1/4] [BOLT] Flow conservation scores

---
 bolt/include/bolt/Passes/ContinuityStats.h    |  61 --
 .../include/bolt/Passes/ProfileQualityStats.h |  98 +++
 bolt/lib/Passes/CMakeLists.txt                |   2 +-
 bolt/lib/Passes/ContinuityStats.cpp           | 250 --------
 bolt/lib/Passes/ProfileQualityStats.cpp       | 579 ++++++++++++++++++
 bolt/lib/Rewrite/BinaryPassManager.cpp        |  46 +-
 .../test/X86/cfg-discontinuity-reporting.test |   4 -
 bolt/test/X86/profile-quality-reporting.test  |   6 +
 8 files changed, 707 insertions(+), 339 deletions(-)
 delete mode 100644 bolt/include/bolt/Passes/ContinuityStats.h
 create mode 100644 bolt/include/bolt/Passes/ProfileQualityStats.h
 delete mode 100644 bolt/lib/Passes/ContinuityStats.cpp
 create mode 100644 bolt/lib/Passes/ProfileQualityStats.cpp
 delete mode 100644 bolt/test/X86/cfg-discontinuity-reporting.test
 create mode 100644 bolt/test/X86/profile-quality-reporting.test

diff --git a/bolt/include/bolt/Passes/ContinuityStats.h b/bolt/include/bolt/Passes/ContinuityStats.h
deleted file mode 100644
index bd4d491ad4a55..0000000000000
--- a/bolt/include/bolt/Passes/ContinuityStats.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===- bolt/Passes/ContinuityStats.h ----------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass checks how well the BOLT input profile satisfies the following
-// "CFG continuity" property of a perfect profile:
-//
-//        Each positive-execution-count block in the function’s CFG
-//        should be *reachable* from a positive-execution-count function
-//        entry block through a positive-execution-count path.
-//
-// More specifically, for each of the hottest 1000 functions, the pass
-// calculates the function’s fraction of basic block execution counts
-// that is *unreachable*. It then reports the 95th percentile of the
-// distribution of the 1000 unreachable fractions in a single BOLT-INFO line.
-// The smaller the reported value is, the better the BOLT profile
-// satisfies the CFG continuity property.
-
-// The default value of 1000 above can be changed via the hidden BOLT option
-// `-num-functions-for-continuity-check=[N]`.
-// If more detailed stats are needed, `-v=1` can be used: the hottest N
-// functions will be grouped into 5 equally-sized buckets, from the hottest
-// to the coldest; for each bucket, various summary statistics of the
-// distribution of the unreachable fractions and the raw unreachable execution
-// counts will be reported.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef BOLT_PASSES_CONTINUITYSTATS_H
-#define BOLT_PASSES_CONTINUITYSTATS_H
-
-#include "bolt/Passes/BinaryPasses.h"
-#include <vector>
-
-namespace llvm {
-
-class raw_ostream;
-
-namespace bolt {
-class BinaryContext;
-
-/// Compute and report to the user the function CFG continuity quality
-class PrintContinuityStats : public BinaryFunctionPass {
-public:
-  explicit PrintContinuityStats(const cl::opt<bool> &PrintPass)
-      : BinaryFunctionPass(PrintPass) {}
-
-  bool shouldOptimize(const BinaryFunction &BF) const override;
-  const char *getName() const override { return "continuity-stats"; }
-  bool shouldPrint(const BinaryFunction &) const override { return false; }
-  Error runOnFunctions(BinaryContext &BC) override;
-};
-
-} // namespace bolt
-} // namespace llvm
-
-#endif // BOLT_PASSES_CONTINUITYSTATS_H
diff --git a/bolt/include/bolt/Passes/ProfileQualityStats.h b/bolt/include/bolt/Passes/ProfileQualityStats.h
new file mode 100644
index 0000000000000..761fd33431a46
--- /dev/null
+++ b/bolt/include/bolt/Passes/ProfileQualityStats.h
@@ -0,0 +1,98 @@
+//===- bolt/Passes/ProfileQualityStats.h ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass checks the BOLT input profile quality.
+//
+// Check 1: how well the input profile satisfies the following
+// "CFG continuity" property of a perfect profile:
+//
+//        Each positive-execution-count block in the function’s CFG
+//        is *reachable* from a positive-execution-count function
+//        entry block through a positive-execution-count path.
+//
+// More specifically, for each of the hottest 1000 functions, the pass
+// calculates the function’s fraction of basic block execution counts
+// that is *unreachable*. It then reports the 95th percentile of the
+// distribution of the 1000 unreachable fractions in a single BOLT-INFO line.
+// The smaller the reported value is, the better the BOLT profile
+// satisfies the CFG continuity property.
+//
+// Check 2: how well the input profile satisfies the "call graph flow
+// conservation" property of a perfect profile:
+//
+//        For each function that is not a program entry, the number of times the
+//        function is called is equal to the net CFG outflow of the
+//        function's entry block(s).
+//
+// More specifically, for each of the hottest 1000 functions, the pass obtains
+// A = number of times the function is called, B = the function's entry blocks'
+// inflow, C = the function's entry blocks' outflow, where B and C are computed
+// using the function's weighted CFG. It then computes gap = 1 - MIN(A,C-B) /
+// MAX(A, C-B). The pass reports the 95th percentile of the distribution of the
+// 1000 gaps in a single BOLT-INFO line. The smaller the reported value is, the
+// better the BOLT profile satisfies the call graph flow conservation property.
+//
+// Check 3: how well the input profile satisfies the "function CFG flow
+// conservation property" of a perfect profile:
+//
+//       A non-entry non-exit basic block's inflow is equal to its outflow.
+//
+// More specifically, for each of the hottest 1000 functions, the pass loops
+// over its basic blocks that are non-entry and non-exit, and for each block
+// obtains a block gap = 1 - MIN(block inflow, block outflow, block call count
+// if any) / MAX(block inflow, block outflow, block call count if any). It then
+// aggregates the block gaps into 2 values for the function: "weighted" is the
+// weighted average of the block conservation gaps, where the weights depend on
+// each block's execution count and instruction count; "worst" is the worst
+// (biggest) block gap acorss all basic blocks in the function with an execution
+// count of > 500. The pass then reports the 95th percentile of the weighted and
+// worst values of the 1000 functions in a single BOLT-INFO line. The smaller
+// the reported values are, the better the BOLT profile satisfies the function
+// CFG flow conservation property.
+//
+// The default value of 1000 above can be changed via the hidden BOLT option
+// `-num-functions-for-profile-quality-check=[N]`.
+// The default reporting of the 95th percentile can be changed via the hidden
+// BOLT option `-percentile-for-profile-quality-check=[M]`.
+//
+// If more detailed stats are needed, `-v=1` can be used: the hottest N
+// functions will be grouped into 5 equally-sized buckets, from the hottest
+// to the coldest; for each bucket, various summary statistics of the
+// profile quality will be reported.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_PASSES_PROFILEQUALITYSTATS_H
+#define BOLT_PASSES_PROFILEQUALITYSTATS_H
+
+#include "bolt/Passes/BinaryPasses.h"
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace bolt {
+class BinaryContext;
+
+/// Compute and report to the user the profile quality
+class PrintProfileQualityStats : public BinaryFunctionPass {
+public:
+  explicit PrintProfileQualityStats(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  bool shouldOptimize(const BinaryFunction &BF) const override;
+  const char *getName() const override { return "profile-quality-stats"; }
+  bool shouldPrint(const BinaryFunction &) const override { return false; }
+  Error runOnFunctions(BinaryContext &BC) override;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif // BOLT_PASSES_PROFILEQUALITYSTATS_H
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index 1e3289484a5ba..692c82f2a5336 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -26,7 +26,7 @@ add_llvm_library(LLVMBOLTPasses
   PatchEntries.cpp
   PettisAndHansen.cpp
   PLTCall.cpp
-  ContinuityStats.cpp
+  ProfileQualityStats.cpp
   RegAnalysis.cpp
   RegReAssign.cpp
   ReorderAlgorithm.cpp
diff --git a/bolt/lib/Passes/ContinuityStats.cpp b/bolt/lib/Passes/ContinuityStats.cpp
deleted file mode 100644
index b32365b59065d..0000000000000
--- a/bolt/lib/Passes/ContinuityStats.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-//===- bolt/Passes/ContinuityStats.cpp --------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the continuity stats calculation pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "bolt/Passes/ContinuityStats.h"
-#include "bolt/Core/BinaryBasicBlock.h"
-#include "bolt/Core/BinaryFunction.h"
-#include "bolt/Utils/CommandLineOpts.h"
-#include "llvm/Support/CommandLine.h"
-#include <queue>
-#include <unordered_map>
-#include <unordered_set>
-
-#define DEBUG_TYPE "bolt-opts"
-
-using namespace llvm;
-using namespace bolt;
-
-namespace opts {
-extern cl::opt<unsigned> Verbosity;
-cl::opt<unsigned> NumFunctionsForContinuityCheck(
-    "num-functions-for-continuity-check",
-    cl::desc("number of hottest functions to print aggregated "
-             "CFG discontinuity stats of."),
-    cl::init(1000), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
-} // namespace opts
-
-namespace {
-using FunctionListType = std::vector<const BinaryFunction *>;
-using function_iterator = FunctionListType::iterator;
-
-template <typename T>
-void printDistribution(raw_ostream &OS, std::vector<T> &values,
-                       bool Fraction = false) {
-  if (values.empty())
-    return;
-  // Sort values from largest to smallest and print the MAX, TOP 1%, 5%, 10%,
-  // 20%, 50%, 80%, MIN. If Fraction is true, then values are printed as
-  // fractions instead of integers.
-  std::sort(values.begin(), values.end());
-
-  auto printLine = [&](std::string Text, double Percent) {
-    int Rank = int(values.size() * (1.0 - Percent / 100));
-    if (Percent == 0)
-      Rank = values.size() - 1;
-    if (Fraction)
-      OS << "  " << Text << std::string(9 - Text.length(), ' ') << ": "
-         << format("%.2lf%%", values[Rank] * 100) << "\n";
-    else
-      OS << "  " << Text << std::string(9 - Text.length(), ' ') << ": "
-         << values[Rank] << "\n";
-  };
-
-  printLine("MAX", 0);
-  const int percentages[] = {1, 5, 10, 20, 50, 80};
-  for (size_t i = 0; i < sizeof(percentages) / sizeof(percentages[0]); ++i) {
-    printLine("TOP " + std::to_string(percentages[i]) + "%", percentages[i]);
-  }
-  printLine("MIN", 100);
-}
-
-void printCFGContinuityStats(raw_ostream &OS,
-                             iterator_range<function_iterator> &Functions) {
-  // Given a perfect profile, every positive-execution-count BB should be
-  // connected to an entry of the function through a positive-execution-count
-  // directed path in the control flow graph.
-  std::vector<size_t> NumUnreachables;
-  std::vector<size_t> SumECUnreachables;
-  std::vector<double> FractionECUnreachables;
-
-  for (auto it = Functions.begin(); it != Functions.end(); ++it) {
-    const BinaryFunction *Function = *it;
-    if (Function->size() <= 1)
-      continue;
-
-    // Compute the sum of all BB execution counts (ECs).
-    size_t NumPosECBBs = 0;
-    size_t SumAllBBEC = 0;
-    for (const BinaryBasicBlock &BB : *Function) {
-      const size_t BBEC = BB.getKnownExecutionCount();
-      NumPosECBBs += BBEC > 0 ? 1 : 0;
-      SumAllBBEC += BBEC;
-    }
-
-    // Perform BFS on subgraph of CFG induced by positive weight edges.
-    // Compute the number of BBs reachable from the entry(s) of the function and
-    // the sum of their execution counts (ECs).
-    std::unordered_map<unsigned, const BinaryBasicBlock *> IndexToBB;
-    std::unordered_set<unsigned> Visited;
-    std::queue<unsigned> Queue;
-    for (const BinaryBasicBlock &BB : *Function) {
-      // Make sure BB.getIndex() is not already in IndexToBB.
-      assert(IndexToBB.find(BB.getIndex()) == IndexToBB.end());
-      IndexToBB[BB.getIndex()] = &BB;
-      if (BB.isEntryPoint() && BB.getKnownExecutionCount() > 0) {
-        Queue.push(BB.getIndex());
-        Visited.insert(BB.getIndex());
-      }
-    }
-    while (!Queue.empty()) {
-      const unsigned BBIndex = Queue.front();
-      const BinaryBasicBlock *BB = IndexToBB[BBIndex];
-      Queue.pop();
-      auto SuccBIIter = BB->branch_info_begin();
-      for (const BinaryBasicBlock *Succ : BB->successors()) {
-        const uint64_t Count = SuccBIIter->Count;
-        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
-          ++SuccBIIter;
-          continue;
-        }
-        if (!Visited.insert(Succ->getIndex()).second) {
-          ++SuccBIIter;
-          continue;
-        }
-        Queue.push(Succ->getIndex());
-        ++SuccBIIter;
-      }
-    }
-
-    const size_t NumReachableBBs = Visited.size();
-
-    // Loop through Visited, and sum the corresponding BBs' execution counts
-    // (ECs).
-    size_t SumReachableBBEC = 0;
-    for (const unsigned BBIndex : Visited) {
-      const BinaryBasicBlock *BB = IndexToBB[BBIndex];
-      SumReachableBBEC += BB->getKnownExecutionCount();
-    }
-
-    const size_t NumPosECBBsUnreachableFromEntry =
-        NumPosECBBs - NumReachableBBs;
-    const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC;
-    const double FractionECUnreachable =
-        (double)SumUnreachableBBEC / SumAllBBEC;
-
-    if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) {
-      OS << "Non-trivial CFG discontinuity observed in function "
-         << Function->getPrintName() << "\n";
-      LLVM_DEBUG(Function->dump());
-    }
-
-    NumUnreachables.push_back(NumPosECBBsUnreachableFromEntry);
-    SumECUnreachables.push_back(SumUnreachableBBEC);
-    FractionECUnreachables.push_back(FractionECUnreachable);
-  }
-
-  if (FractionECUnreachables.empty())
-    return;
-
-  std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end());
-  const int Rank = int(FractionECUnreachables.size() * 0.95);
-  OS << format("top 5%% function CFG discontinuity is %.2lf%%\n",
-               FractionECUnreachables[Rank] * 100);
-
-  if (opts::Verbosity >= 1) {
-    OS << "abbreviations: EC = execution count, POS BBs = positive EC BBs\n"
-       << "distribution of NUM(unreachable POS BBs) among all focal "
-          "functions\n";
-    printDistribution(OS, NumUnreachables);
-
-    OS << "distribution of SUM_EC(unreachable POS BBs) among all focal "
-          "functions\n";
-    printDistribution(OS, SumECUnreachables);
-
-    OS << "distribution of [(SUM_EC(unreachable POS BBs) / SUM_EC(all "
-          "POS BBs))] among all focal functions\n";
-    printDistribution(OS, FractionECUnreachables, /*Fraction=*/true);
-  }
-}
-
-void printAll(BinaryContext &BC, FunctionListType &ValidFunctions,
-              size_t NumTopFunctions) {
-  // Sort the list of functions by execution counts (reverse).
-  llvm::sort(ValidFunctions,
-             [&](const BinaryFunction *A, const BinaryFunction *B) {
-               return A->getKnownExecutionCount() > B->getKnownExecutionCount();
-             });
-
-  const size_t RealNumTopFunctions =
-      std::min(NumTopFunctions, ValidFunctions.size());
-
-  iterator_range<function_iterator> Functions(
-      ValidFunctions.begin(), ValidFunctions.begin() + RealNumTopFunctions);
-
-  BC.outs() << format("BOLT-INFO: among the hottest %zu functions ",
-                      RealNumTopFunctions);
-  printCFGContinuityStats(BC.outs(), Functions);
-
-  // Print more detailed bucketed stats if requested.
-  if (opts::Verbosity >= 1 && RealNumTopFunctions >= 5) {
-    const size_t PerBucketSize = RealNumTopFunctions / 5;
-    BC.outs() << format(
-        "Detailed stats for 5 buckets, each with  %zu functions:\n",
-        PerBucketSize);
-
-    // For each bucket, print the CFG continuity stats of the functions in the
-    // bucket.
-    for (size_t BucketIndex = 0; BucketIndex < 5; ++BucketIndex) {
-      const size_t StartIndex = BucketIndex * PerBucketSize;
-      const size_t EndIndex = StartIndex + PerBucketSize;
-      iterator_range<function_iterator> Functions(
-          ValidFunctions.begin() + StartIndex,
-          ValidFunctions.begin() + EndIndex);
-      const size_t MaxFunctionExecutionCount =
-          ValidFunctions[StartIndex]->getKnownExecutionCount();
-      const size_t MinFunctionExecutionCount =
-          ValidFunctions[EndIndex - 1]->getKnownExecutionCount();
-      BC.outs() << format("----------------\n|   Bucket %zu:  "
-                          "|\n----------------\n",
-                          BucketIndex + 1)
-                << format(
-                       "execution counts of the %zu functions in the bucket: "
-                       "%zu-%zu\n",
-                       EndIndex - StartIndex, MinFunctionExecutionCount,
-                       MaxFunctionExecutionCount);
-      printCFGContinuityStats(BC.outs(), Functions);
-    }
-  }
-}
-} // namespace
-
-bool PrintContinuityStats::shouldOptimize(const BinaryFunction &BF) const {
-  if (BF.empty() || !BF.hasValidProfile())
-    return false;
-
-  return BinaryFunctionPass::shouldOptimize(BF);
-}
-
-Error PrintContinuityStats::runOnFunctions(BinaryContext &BC) {
-  // Create a list of functions with valid profiles.
-  FunctionListType ValidFunctions;
-  for (const auto &BFI : BC.getBinaryFunctions()) {
-    const BinaryFunction *Function = &BFI.second;
-    if (PrintContinuityStats::shouldOptimize(*Function))
-      ValidFunctions.push_back(Function);
-  }
-  if (ValidFunctions.empty() || opts::NumFunctionsForContinuityCheck == 0)
-    return Error::success();
-
-  printAll(BC, ValidFunctions, opts::NumFunctionsForContinuityCheck);
-  return Error::success();
-}
diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp
new file mode 100644
index 0000000000000..a42b4dc27947a
--- /dev/null
+++ b/bolt/lib/Passes/ProfileQualityStats.cpp
@@ -0,0 +1,579 @@
+//===- bolt/Passes/ProfileQualityStats.cpp ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the profile quality stats calculation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/ProfileQualityStats.h"
+#include "bolt/Core/BinaryBasicBlock.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Utils/CommandLineOpts.h"
+#include "llvm/Support/CommandLine.h"
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+
+#define DEBUG_TYPE "bolt-opts"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+extern cl::opt<unsigned> Verbosity;
+cl::opt<unsigned> NumFunctionsForProfileQualityCheck(
+    "num-functions-for-profile-quality-check",
+    cl::desc("number of hottest functions to print aggregated "
+             "profile quality stats of."),
+    cl::init(1000), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
+cl::opt<unsigned> PercentileForProfileQualityCheck(
+    "percentile-for-profile-quality-check",
+    cl::desc("Percentile of profile quality distributions over hottest "
+             "functions to display."),
+    cl::init(95), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
+} // namespace opts
+
+namespace {
+using FunctionListType = std::vector<const BinaryFunction *>;
+using function_iterator = FunctionListType::iterator;
+
+// BB index -> flow count
+using FlowMapTy = std::unordered_map<unsigned, uint64_t>;
+// Function number -> FlowMapTy
+using TotalFlowMapTy = std::unordered_map<uint64_t, FlowMapTy>;
+// Function number -> flow count
+using FunctionFlowMapTy = std::unordered_map<uint64_t, uint64_t>;
+struct FlowInfo {
+  TotalFlowMapTy TotalIncomingMaps;
+  TotalFlowMapTy TotalOutgoingMaps;
+  TotalFlowMapTy TotalMaxCountMaps;
+  TotalFlowMapTy TotalMinCountMaps;
+  FunctionFlowMapTy CallGraphIncomingMap;
+};
+
+template <typename T>
+void printDistribution(raw_ostream &OS, std::vector<T> &values,
+                       bool Fraction = false) {
+  // Assume values are sorted.
+  if (values.empty())
+    return;
+
+  OS << "  Length     : " << values.size() << "\n";
+
+  auto printLine = [&](std::string Text, double Percent) {
+    int Rank = int(values.size() * (100 - Percent) / 100);
+    if (Percent == 0)
+      Rank = values.size() - 1;
+    if (Fraction)
+      OS << "  " << Text << std::string(11 - Text.length(), ' ') << ": "
+         << format("%.2lf%%", values[Rank] * 100) << "\n";
+    else
+      OS << "  " << Text << std::string(11 - Text.length(), ' ') << ": "
+         << values[Rank] << "\n";
+  };
+
+  printLine("MAX", 0);
+  const int percentages[] = {1, 5, 10, 20, 50, 80};
+  for (size_t i = 0; i < sizeof(percentages) / sizeof(percentages[0]); ++i) {
+    printLine("TOP " + std::to_string(percentages[i]) + "%", percentages[i]);
+  }
+  printLine("MIN", 100);
+}
+
+void printCFGContinuityStats(raw_ostream &OS,
+                             iterator_range<function_iterator> &Functions) {
+  // Given a perfect profile, every positive-execution-count BB should be
+  // connected to an entry of the function through a positive-execution-count
+  // directed path in the control flow graph.
+  std::vector<size_t> NumUnreachables;
+  std::vector<size_t> SumECUnreachables;
+  std::vector<double> FractionECUnreachables;
+
+  for (auto it = Functions.begin(); it != Functions.end(); ++it) {
+    const BinaryFunction *Function = *it;
+    if (Function->size() <= 1)
+      continue;
+
+    // Compute the sum of all BB execution counts (ECs).
+    size_t NumPosECBBs = 0;
+    size_t SumAllBBEC = 0;
+    for (const BinaryBasicBlock &BB : *Function) {
+      const size_t BBEC = BB.getKnownExecutionCount();
+      NumPosECBBs += BBEC > 0 ? 1 : 0;
+      SumAllBBEC += BBEC;
+    }
+
+    // Perform BFS on subgraph of CFG induced by positive weight edges.
+    // Compute the number of BBs reachable from the entry(s) of the function and
+    // the sum of their execution counts (ECs).
+    std::unordered_map<unsigned, const BinaryBasicBlock *> IndexToBB;
+    std::unordered_set<unsigned> Visited;
+    std::queue<unsigned> Queue;
+    for (const BinaryBasicBlock &BB : *Function) {
+      // Make sure BB.getIndex() is not already in IndexToBB.
+      assert(IndexToBB.find(BB.getIndex()) == IndexToBB.end());
+      IndexToBB[BB.getIndex()] = &BB;
+      if (BB.isEntryPoint() && BB.getKnownExecutionCount() > 0) {
+        Queue.push(BB.getIndex());
+        Visited.insert(BB.getIndex());
+      }
+    }
+    while (!Queue.empty()) {
+      const unsigned BBIndex = Queue.front();
+      const BinaryBasicBlock *BB = IndexToBB[BBIndex];
+      Queue.pop();
+      auto SuccBIIter = BB->branch_info_begin();
+      for (const BinaryBasicBlock *Succ : BB->successors()) {
+        const uint64_t Count = SuccBIIter->Count;
+        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
+          ++SuccBIIter;
+          continue;
+        }
+        if (!Visited.insert(Succ->getIndex()).second) {
+          ++SuccBIIter;
+          continue;
+        }
+        Queue.push(Succ->getIndex());
+        ++SuccBIIter;
+      }
+    }
+
+    const size_t NumReachableBBs = Visited.size();
+
+    // Loop through Visited, and sum the corresponding BBs' execution counts
+    // (ECs).
+    size_t SumReachableBBEC = 0;
+    for (const unsigned BBIndex : Visited) {
+      const BinaryBasicBlock *BB = IndexToBB[BBIndex];
+      SumReachableBBEC += BB->getKnownExecutionCount();
+    }
+
+    const size_t NumPosECBBsUnreachableFromEntry =
+        NumPosECBBs - NumReachableBBs;
+    const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC;
+    const double FractionECUnreachable =
+        (double)SumUnreachableBBEC / SumAllBBEC;
+
+    if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) {
+      OS << "Non-trivial CFG discontinuity observed in function "
+         << Function->getPrintName() << "\n";
+      LLVM_DEBUG(Function->dump());
+    }
+
+    NumUnreachables.push_back(NumPosECBBsUnreachableFromEntry);
+    SumECUnreachables.push_back(SumUnreachableBBEC);
+    FractionECUnreachables.push_back(FractionECUnreachable);
+  }
+
+  if (FractionECUnreachables.empty())
+    return;
+
+  std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end());
+  const int Rank = int(FractionECUnreachables.size() *
+                       opts::PercentileForProfileQualityCheck / 100);
+  OS << format("top %zu%% function CFG discontinuity is %.2lf%%\n",
+               100 - opts::PercentileForProfileQualityCheck,
+               FractionECUnreachables[Rank] * 100);
+  if (opts::Verbosity >= 1) {
+    OS << "abbreviations: EC = execution count, POS BBs = positive EC BBs\n"
+       << "distribution of NUM(unreachable POS BBs) per function\n";
+    std::sort(NumUnreachables.begin(), NumUnreachables.end());
+    printDistribution(OS, NumUnreachables);
+
+    OS << "distribution of SUM_EC(unreachable POS BBs) per function\n";
+    std::sort(SumECUnreachables.begin(), SumECUnreachables.end());
+    printDistribution(OS, SumECUnreachables);
+
+    OS << "distribution of [(SUM_EC(unreachable POS BBs) / SUM_EC(all "
+          "POS BBs))] per function\n";
+    printDistribution(OS, FractionECUnreachables, /*Fraction=*/true);
+  }
+}
+
+void printCallGraphFlowConservationStats(
+    raw_ostream &OS, iterator_range<function_iterator> &Functions,
+    FlowInfo &TotalFlowMap) {
+  std::vector<double> CallGraphGaps;
+
+  for (auto it = Functions.begin(); it != Functions.end(); ++it) {
+    const BinaryFunction *Function = *it;
+    if (Function->size() <= 1 || !Function->isSimple())
+      continue;
+
+    const uint64_t FunctionNum = Function->getFunctionNumber();
+    FlowMapTy &IncomingMap = TotalFlowMap.TotalIncomingMaps[FunctionNum];
+    FlowMapTy &OutgoingMap = TotalFlowMap.TotalOutgoingMaps[FunctionNum];
+    FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap;
+
+    // Only consider functions that are not a program entry.
+    if (CallGraphIncomingMap.find(FunctionNum) != CallGraphIncomingMap.end()) {
+      uint64_t EntryInflow = 0;
+      uint64_t EntryOutflow = 0;
+      uint32_t NumConsideredEntryBlocks = 0;
+      for (const BinaryBasicBlock &BB : *Function) {
+        if (BB.isEntryPoint()) {
+          // If entry is an exit, then we don't consider it for flow
+          // conservation
+          if (BB.succ_size() == 0)
+            continue;
+          NumConsideredEntryBlocks++;
+
+          EntryInflow += IncomingMap[BB.getIndex()];
+          EntryOutflow += OutgoingMap[BB.getIndex()];
+        }
+      }
+      uint64_t NetEntryOutflow = 0;
+      if (EntryOutflow < EntryInflow) {
+        if (opts::Verbosity >= 1) {
+          // We expect entry blocks' CFG outflow >= inflow, i.e., it has a
+          // non-negative net outflow. If this is not the case, then raise a
+          // warning if requested.
+          OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow in "
+                "function "
+             << Function->getPrintName() << "\n";
+          LLVM_DEBUG(Function->dump());
+        }
+      } else {
+        NetEntryOutflow = EntryOutflow - EntryInflow;
+      }
+      if (NumConsideredEntryBlocks > 0) {
+        const uint64_t CallGraphInflow =
+            TotalFlowMap.CallGraphIncomingMap[Function->getFunctionNumber()];
+        const uint64_t Min = std::min(NetEntryOutflow, CallGraphInflow);
+        const uint64_t Max = std::max(NetEntryOutflow, CallGraphInflow);
+        const double CallGraphGap = 1 - (double)Min / Max;
+
+        if (opts::Verbosity >= 2 && CallGraphGap >= 0.5) {
+          OS << "Nontrivial call graph gap of size "
+             << format("%.2lf%%", 100 * CallGraphGap)
+             << " observed in function " << Function->getPrintName() << "\n";
+          LLVM_DEBUG(Function->dump());
+        }
+
+        CallGraphGaps.push_back(CallGraphGap);
+      }
+    }
+  }
+
+  if (!CallGraphGaps.empty()) {
+    std::sort(CallGraphGaps.begin(), CallGraphGaps.end());
+    const int Rank = int(CallGraphGaps.size() *
+                         opts::PercentileForProfileQualityCheck / 100);
+    OS << format("top %zu%% call graph flow conservation gap is %.2lf%%\n",
+                 100 - opts::PercentileForProfileQualityCheck,
+                 CallGraphGaps[Rank] * 100);
+    if (opts::Verbosity >= 1) {
+      OS << "distribution of function entry flow conservation gaps\n";
+      printDistribution(OS, CallGraphGaps, /*Fraction=*/true);
+    }
+  }
+}
+
+void printCFGFlowConservationStats(raw_ostream &OS,
+                                   iterator_range<function_iterator> &Functions,
+                                   FlowInfo &TotalFlowMap) {
+  std::vector<double> CFGGapsWeightedAvg;
+  std::vector<double> CFGGapsWorst;
+  std::vector<uint64_t> CFGGapsWorstAbs;
+
+  for (auto it = Functions.begin(); it != Functions.end(); ++it) {
+    const BinaryFunction *Function = *it;
+    if (Function->size() <= 1 || !Function->isSimple())
+      continue;
+
+    const uint64_t FunctionNum = Function->getFunctionNumber();
+    FlowMapTy &MaxCountMaps = TotalFlowMap.TotalMaxCountMaps[FunctionNum];
+    FlowMapTy &MinCountMaps = TotalFlowMap.TotalMinCountMaps[FunctionNum];
+    double WeightedGapSum = 0.0;
+    double WeightSum = 0.0;
+    double WorstGap = 0.0;
+    uint64_t WorstGapAbs = 0;
+    BinaryBasicBlock *BBWorstGap = nullptr;
+    BinaryBasicBlock *BBWorstGapAbs = nullptr;
+    for (BinaryBasicBlock &BB : *Function) {
+      // We don't consider function entry or exit blocks for CFG flow
+      // conservation
+      if (BB.isEntryPoint() || BB.succ_size() == 0)
+        continue;
+
+      const uint64_t Max = MaxCountMaps[BB.getIndex()];
+      const uint64_t Min = MinCountMaps[BB.getIndex()];
+      const double Gap = 1 - (double)Min / Max;
+      double Weight = BB.getKnownExecutionCount() * BB.getNumNonPseudos();
+      if (Weight == 0)
+        continue;
+      // We use log to prevent the stats from being dominated by extremely hot
+      // blocks
+      Weight = log(Weight);
+      WeightedGapSum += Gap * Weight;
+      WeightSum += Weight;
+      if (BB.getKnownExecutionCount() > 500 && Gap > WorstGap) {
+        WorstGap = Gap;
+        BBWorstGap = &BB;
+      }
+      if (BB.getKnownExecutionCount() > 500 && Max - Min > WorstGapAbs) {
+        WorstGapAbs = Max - Min;
+        BBWorstGapAbs = &BB;
+      }
+    }
+    if (WeightSum > 0) {
+      const double WeightedGap = WeightedGapSum / WeightSum;
+      if (opts::Verbosity >= 2 && (WeightedGap >= 0.1 || WorstGap >= 0.9)) {
+        OS << "Nontrivial CFG gap observed in function "
+           << Function->getPrintName() << "\n"
+           << "Weighted gap: " << format("%.2lf%%", 100 * WeightedGap) << "\n";
+        if (BBWorstGap)
+          OS << "Worst gap: " << format("%.2lf%%", 100 * WorstGap)
+             << " at BB with input offset: 0x"
+             << Twine::utohexstr(BBWorstGap->getInputOffset()) << "\n";
+        if (BBWorstGapAbs)
+          OS << "Worst gap (absolute value): " << WorstGapAbs << " at BB with "
+             << "input offset 0x"
+             << Twine::utohexstr(BBWorstGapAbs->getInputOffset()) << "\n";
+        LLVM_DEBUG(Function->dump());
+      }
+
+      CFGGapsWeightedAvg.push_back(WeightedGap);
+      CFGGapsWorst.push_back(WorstGap);
+      CFGGapsWorstAbs.push_back(WorstGapAbs);
+    }
+  }
+
+  if (!CFGGapsWeightedAvg.empty()) {
+    std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end());
+    const int RankWA = int(CFGGapsWeightedAvg.size() *
+                           opts::PercentileForProfileQualityCheck / 100);
+    std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end());
+    const int RankW =
+        int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100);
+    OS << format(
+        "top %zu%% CFG flow conservation gap is %.2lf%% (weighted) and "
+        "%.2lf%% (worst)\n",
+        100 - opts::PercentileForProfileQualityCheck,
+        CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100);
+    if (opts::Verbosity >= 1) {
+      OS << "distribution of weighted CFG flow conservation gaps\n";
+      printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true);
+      OS << "Consider only blocks with execution counts > 500:\n"
+         << "distribution of worst block flow conservation gap per "
+            "function \n";
+      printDistribution(OS, CFGGapsWorst, /*Fraction=*/true);
+      OS << "distribution of worst block flow conservation gap (absolute "
+            "value) per function\n";
+      std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end());
+      printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false);
+    }
+  }
+}
+
+void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
+  // Increment block inflow and outflow with CFG jump counts.
+  TotalFlowMapTy &TotalIncomingMaps = TotalFlowMap.TotalIncomingMaps;
+  TotalFlowMapTy &TotalOutgoingMaps = TotalFlowMap.TotalOutgoingMaps;
+  for (const auto &BFI : BC.getBinaryFunctions()) {
+    const BinaryFunction *Function = &BFI.second;
+    if (Function->empty() || !Function->hasValidProfile())
+      continue;
+    FlowMapTy &IncomingMap = TotalIncomingMaps[Function->getFunctionNumber()];
+    FlowMapTy &OutgoingMap = TotalOutgoingMaps[Function->getFunctionNumber()];
+    for (const BinaryBasicBlock &BB : *Function) {
+      uint64_t TotalOutgoing = 0ULL;
+      auto SuccBIIter = BB.branch_info_begin();
+      for (BinaryBasicBlock *Succ : BB.successors()) {
+        const uint64_t Count = SuccBIIter->Count;
+        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
+          ++SuccBIIter;
+          continue;
+        }
+        TotalOutgoing += Count;
+        IncomingMap[Succ->getIndex()] += Count;
+        ++SuccBIIter;
+      }
+      OutgoingMap[BB.getIndex()] = TotalOutgoing;
+    }
+  }
+
+  // Initialize TotalMaxCountMaps and TotalMinCountMaps using
+  // TotalIncomingMaps and TotalOutgoingMaps
+  TotalFlowMapTy &TotalMaxCountMaps = TotalFlowMap.TotalMaxCountMaps;
+  TotalFlowMapTy &TotalMinCountMaps = TotalFlowMap.TotalMinCountMaps;
+  for (const auto &BFI : BC.getBinaryFunctions()) {
+    const BinaryFunction *Function = &BFI.second;
+    if (Function->empty() || !Function->hasValidProfile())
+      continue;
+    uint64_t FunctionNum = Function->getFunctionNumber();
+    FlowMapTy &IncomingMap = TotalIncomingMaps[FunctionNum];
+    FlowMapTy &OutgoingMap = TotalOutgoingMaps[FunctionNum];
+    FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum];
+    FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum];
+    for (const BinaryBasicBlock &BB : *Function) {
+      uint64_t BBNum = BB.getIndex();
+      MaxCountMap[BBNum] = std::max(IncomingMap[BBNum], OutgoingMap[BBNum]);
+      MinCountMap[BBNum] = std::min(IncomingMap[BBNum], OutgoingMap[BBNum]);
+    }
+  }
+
+  // Modify TotalMaxCountMaps and TotalMinCountMaps using call counts and
+  // fill out CallGraphIncomingMap
+  FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap;
+  for (const auto &BFI : BC.getBinaryFunctions()) {
+    const BinaryFunction *Function = &BFI.second;
+    uint64_t FunctionNum = Function->getFunctionNumber();
+    FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum];
+    FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum];
+
+    // Update MaxCountMap, MinCountMap, and CallGraphIncomingMap
+    auto recordCall = [&](const BinaryBasicBlock *SourceBB,
+                          const MCSymbol *DestSymbol, uint64_t Count) {
+      if (Count == BinaryBasicBlock::COUNT_NO_PROFILE)
+        Count = 0;
+      const BinaryFunction *DstFunc =
+          DestSymbol ? BC.getFunctionForSymbol(DestSymbol) : nullptr;
+      if (DstFunc)
+        CallGraphIncomingMap[DstFunc->getFunctionNumber()] += Count;
+      if (SourceBB) {
+        unsigned BlockIndex = SourceBB->getIndex();
+        MaxCountMap[BlockIndex] = std::max(MaxCountMap[BlockIndex], Count);
+        MinCountMap[BlockIndex] = std::min(MinCountMap[BlockIndex], Count);
+      }
+    };
+
+    // Get pairs of (symbol, count) for each target at this callsite.
+    // If the call is to an unknown function the symbol will be nullptr.
+    // If there is no profiling data the count will be COUNT_NO_PROFILE.
+    using TargetDesc = std::pair<const MCSymbol *, uint64_t>;
+    using CallInfoTy = std::vector<TargetDesc>;
+    auto getCallInfo = [&](const BinaryBasicBlock *BB, const MCInst &Inst) {
+      CallInfoTy Counts;
+      const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
+
+      // If this is an indirect call use perf data directly.
+      if (!DstSym && BC.MIB->hasAnnotation(Inst, "CallProfile")) {
+        const auto &ICSP = BC.MIB->getAnnotationAs<IndirectCallSiteProfile>(
+            Inst, "CallProfile");
+        for (const IndirectCallProfile &CSI : ICSP)
+          if (CSI.Symbol)
+            Counts.emplace_back(CSI.Symbol, CSI.Count);
+      } else {
+        const uint64_t Count = BB->getExecutionCount();
+        Counts.emplace_back(DstSym, Count);
+      }
+
+      return Counts;
+    };
+
+    // If the function has an invalid profile, try to use the perf data
+    // directly. The call EC is only used to update CallGraphIncomingMap.
+    if (!Function->hasValidProfile() && !Function->getAllCallSites().empty()) {
+      for (const IndirectCallProfile &CSI : Function->getAllCallSites()) {
+        if (!CSI.Symbol)
+          continue;
+        recordCall(nullptr, CSI.Symbol, CSI.Count);
+      }
+      continue;
+    } else {
+      // If the function has a valid profile
+      for (BinaryBasicBlock &BB : *Function) {
+        for (MCInst &Inst : BB) {
+          if (!BC.MIB->isCall(Inst))
+            continue;
+          // Find call instructions and extract target symbols from each
+          // one.
+          const CallInfoTy CallInfo = getCallInfo(&BB, Inst);
+          for (const TargetDesc &CI : CallInfo) {
+            recordCall(&BB, CI.first, CI.second);
+          }
+        }
+      }
+    }
+  }
+}
+
+void printAll(BinaryContext &BC, FunctionListType &ValidFunctions,
+              size_t NumTopFunctions) {
+  // Sort the list of functions by execution counts (reverse).
+  llvm::sort(ValidFunctions,
+             [&](const BinaryFunction *A, const BinaryFunction *B) {
+               return A->getKnownExecutionCount() > B->getKnownExecutionCount();
+             });
+
+  const size_t RealNumTopFunctions =
+      std::min(NumTopFunctions, ValidFunctions.size());
+
+  iterator_range<function_iterator> Functions(
+      ValidFunctions.begin(), ValidFunctions.begin() + RealNumTopFunctions);
+
+  FlowInfo TotalFlowMap;
+  computeFlowMappings(BC, TotalFlowMap);
+
+  BC.outs() << format("BOLT-INFO: among the hottest %zu functions ",
+                      RealNumTopFunctions);
+  printCFGContinuityStats(BC.outs(), Functions);
+  BC.outs() << format("BOLT-INFO: among the hottest %zu functions ",
+                      RealNumTopFunctions);
+  printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap);
+  BC.outs() << format("BOLT-INFO: among the hottest %zu functions ",
+                      RealNumTopFunctions);
+  printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap);
+
+  // Print more detailed bucketed stats if requested.
+  if (opts::Verbosity >= 1 && RealNumTopFunctions >= 5) {
+    const size_t PerBucketSize = RealNumTopFunctions / 5;
+    BC.outs() << format(
+        "Detailed stats for 5 buckets, each with  %zu functions:\n",
+        PerBucketSize);
+
+    // For each bucket, print the CFG continuity stats of the functions in
+    // the bucket.
+    for (size_t BucketIndex = 0; BucketIndex < 5; ++BucketIndex) {
+      const size_t StartIndex = BucketIndex * PerBucketSize;
+      const size_t EndIndex = StartIndex + PerBucketSize;
+      iterator_range<function_iterator> Functions(
+          ValidFunctions.begin() + StartIndex,
+          ValidFunctions.begin() + EndIndex);
+      const size_t MaxFunctionExecutionCount =
+          ValidFunctions[StartIndex]->getKnownExecutionCount();
+      const size_t MinFunctionExecutionCount =
+          ValidFunctions[EndIndex - 1]->getKnownExecutionCount();
+      BC.outs() << format("----------------\n|   Bucket %zu:  "
+                          "|\n----------------\n",
+                          BucketIndex + 1)
+                << format(
+                       "execution counts of the %zu functions in the bucket: "
+                       "%zu-%zu\n",
+                       EndIndex - StartIndex, MinFunctionExecutionCount,
+                       MaxFunctionExecutionCount);
+      printCFGContinuityStats(BC.outs(), Functions);
+      printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap);
+      printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap);
+    }
+  }
+}
+} // namespace
+
+bool PrintProfileQualityStats::shouldOptimize(const BinaryFunction &BF) const {
+  if (BF.empty() || !BF.hasValidProfile())
+    return false;
+
+  return BinaryFunctionPass::shouldOptimize(BF);
+}
+
+Error PrintProfileQualityStats::runOnFunctions(BinaryContext &BC) {
+  // Create a list of functions with valid profiles.
+  FunctionListType ValidFunctions;
+  for (const auto &BFI : BC.getBinaryFunctions()) {
+    const BinaryFunction *Function = &BFI.second;
+    if (PrintProfileQualityStats::shouldOptimize(*Function))
+      ValidFunctions.push_back(Function);
+  }
+  if (ValidFunctions.empty() || opts::NumFunctionsForProfileQualityCheck == 0)
+    return Error::success();
+
+  printAll(BC, ValidFunctions, opts::NumFunctionsForProfileQualityCheck);
+  return Error::success();
+}
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 2d851c751ae10..6f750b95adf9a 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -12,7 +12,6 @@
 #include "bolt/Passes/AllocCombiner.h"
 #include "bolt/Passes/AsmDump.h"
 #include "bolt/Passes/CMOVConversion.h"
-#include "bolt/Passes/ContinuityStats.h"
 #include "bolt/Passes/FixRISCVCallsPass.h"
 #include "bolt/Passes/FixRelaxationPass.h"
 #include "bolt/Passes/FrameOptimizer.h"
@@ -27,6 +26,7 @@
 #include "bolt/Passes/MCF.h"
 #include "bolt/Passes/PLTCall.h"
 #include "bolt/Passes/PatchEntries.h"
+#include "bolt/Passes/ProfileQualityStats.h"
 #include "bolt/Passes/RegReAssign.h"
 #include "bolt/Passes/ReorderData.h"
 #include "bolt/Passes/ReorderFunctions.h"
@@ -58,10 +58,10 @@ extern cl::opt<bolt::IdenticalCodeFolding::ICFLevel, false,
                llvm::bolt::DeprecatedICFNumericOptionParser>
     ICF;
 
-static cl::opt<bool>
-DynoStatsAll("dyno-stats-all",
-  cl::desc("print dyno stats after each stage"),
-  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltCategory));
+static cl::opt<bool> DynoStatsAll("dyno-stats-all",
+                                  cl::desc("print dyno stats after each stage"),
+                                  cl::ZeroOrMore, cl::Hidden,
+                                  cl::cat(BoltCategory));
 
 static cl::opt<bool>
     EliminateUnreachable("eliminate-unreachable",
@@ -82,25 +82,24 @@ cl::opt<bool>
 cl::opt<bool> NeverPrint("never-print", cl::desc("never print"),
                          cl::ReallyHidden, cl::cat(BoltOptCategory));
 
-cl::opt<bool>
-PrintAfterBranchFixup("print-after-branch-fixup",
-  cl::desc("print function after fixing local branches"),
-  cl::Hidden, cl::cat(BoltOptCategory));
+cl::opt<bool> PrintAfterBranchFixup(
+    "print-after-branch-fixup",
+    cl::desc("print function after fixing local branches"), cl::Hidden,
+    cl::cat(BoltOptCategory));
 
 static cl::opt<bool>
-PrintAfterLowering("print-after-lowering",
-  cl::desc("print function after instruction lowering"),
-  cl::Hidden, cl::cat(BoltOptCategory));
+    PrintAfterLowering("print-after-lowering",
+                       cl::desc("print function after instruction lowering"),
+                       cl::Hidden, cl::cat(BoltOptCategory));
 
 static cl::opt<bool> PrintEstimateEdgeCounts(
     "print-estimate-edge-counts",
     cl::desc("print function after edge counts are set for no-LBR profile"),
     cl::Hidden, cl::cat(BoltOptCategory));
 
-cl::opt<bool>
-PrintFinalized("print-finalized",
-  cl::desc("print function after CFG is finalized"),
-  cl::Hidden, cl::cat(BoltOptCategory));
+cl::opt<bool> PrintFinalized("print-finalized",
+                             cl::desc("print function after CFG is finalized"),
+                             cl::Hidden, cl::cat(BoltOptCategory));
 
 static cl::opt<bool>
     PrintFOP("print-fop",
@@ -235,12 +234,13 @@ static cl::opt<bool> SimplifyRODataLoads(
              "operand with the constant found in the corresponding section"),
     cl::cat(BoltOptCategory));
 
-static cl::list<std::string>
-SpecializeMemcpy1("memcpy1-spec",
-  cl::desc("list of functions with call sites for which to specialize memcpy() "
-           "for size 1"),
-  cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
-  cl::ZeroOrMore, cl::cat(BoltOptCategory));
+static cl::list<std::string> SpecializeMemcpy1(
+    "memcpy1-spec",
+    cl::desc(
+        "list of functions with call sites for which to specialize memcpy() "
+        "for size 1"),
+    cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), cl::ZeroOrMore,
+    cl::cat(BoltOptCategory));
 
 static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
                            cl::cat(BoltOptCategory));
@@ -379,7 +379,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   if (opts::PrintProfileStats)
     Manager.registerPass(std::make_unique<PrintProfileStats>(NeverPrint));
 
-  Manager.registerPass(std::make_unique<PrintContinuityStats>(NeverPrint));
+  Manager.registerPass(std::make_unique<PrintProfileQualityStats>(NeverPrint));
 
   Manager.registerPass(std::make_unique<ValidateInternalCalls>(NeverPrint));
 
diff --git a/bolt/test/X86/cfg-discontinuity-reporting.test b/bolt/test/X86/cfg-discontinuity-reporting.test
deleted file mode 100644
index 4d7d3305cdb75..0000000000000
--- a/bolt/test/X86/cfg-discontinuity-reporting.test
+++ /dev/null
@@ -1,4 +0,0 @@
-## Check profile discontinuity reporting
-RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe
-RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s
-CHECK: among the hottest 5 functions top 5% function CFG discontinuity is 100.00%
diff --git a/bolt/test/X86/profile-quality-reporting.test b/bolt/test/X86/profile-quality-reporting.test
new file mode 100644
index 0000000000000..ee4ecb9853dff
--- /dev/null
+++ b/bolt/test/X86/profile-quality-reporting.test
@@ -0,0 +1,6 @@
+## Check profile quality stats reporting
+RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe
+RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s
+CHECK: among the hottest 5 functions top 5% function CFG discontinuity is 100.00%
+CHECK: among the hottest 5 functions top 5% call graph flow conservation gap is 60.00%
+CHECK: among the hottest 5 functions top 5% CFG flow conservation gap is 45.53% (weighted) and 96.87% (worst)

>From 41a1c3867bcae29c41f92050eb13075e9cc21869 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Wed, 26 Feb 2025 09:25:26 -0800
Subject: [PATCH 2/4] fixup! [BOLT] Flow conservation scores

---
 bolt/lib/Passes/ProfileQualityStats.cpp      | 204 +++++++++----------
 bolt/test/X86/profile-quality-reporting.test |   4 +-
 2 files changed, 101 insertions(+), 107 deletions(-)

diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp
index a42b4dc27947a..2f827847913ae 100644
--- a/bolt/lib/Passes/ProfileQualityStats.cpp
+++ b/bolt/lib/Passes/ProfileQualityStats.cpp
@@ -19,8 +19,6 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#define DEBUG_TYPE "bolt-opts"
-
 using namespace llvm;
 using namespace bolt;
 
@@ -34,7 +32,7 @@ cl::opt<unsigned> NumFunctionsForProfileQualityCheck(
 cl::opt<unsigned> PercentileForProfileQualityCheck(
     "percentile-for-profile-quality-check",
     cl::desc("Percentile of profile quality distributions over hottest "
-             "functions to display."),
+             "functions to report."),
     cl::init(95), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
 } // namespace opts
 
@@ -94,8 +92,7 @@ void printCFGContinuityStats(raw_ostream &OS,
   std::vector<size_t> SumECUnreachables;
   std::vector<double> FractionECUnreachables;
 
-  for (auto it = Functions.begin(); it != Functions.end(); ++it) {
-    const BinaryFunction *Function = *it;
+  for (const BinaryFunction *Function : Functions) {
     if (Function->size() <= 1)
       continue;
 
@@ -104,28 +101,32 @@ void printCFGContinuityStats(raw_ostream &OS,
     size_t SumAllBBEC = 0;
     for (const BinaryBasicBlock &BB : *Function) {
       const size_t BBEC = BB.getKnownExecutionCount();
-      NumPosECBBs += BBEC > 0 ? 1 : 0;
+      NumPosECBBs += !!BBEC;
       SumAllBBEC += BBEC;
     }
 
     // Perform BFS on subgraph of CFG induced by positive weight edges.
     // Compute the number of BBs reachable from the entry(s) of the function and
     // the sum of their execution counts (ECs).
-    std::unordered_map<unsigned, const BinaryBasicBlock *> IndexToBB;
     std::unordered_set<unsigned> Visited;
     std::queue<unsigned> Queue;
-    for (const BinaryBasicBlock &BB : *Function) {
-      // Make sure BB.getIndex() is not already in IndexToBB.
-      assert(IndexToBB.find(BB.getIndex()) == IndexToBB.end());
-      IndexToBB[BB.getIndex()] = &BB;
-      if (BB.isEntryPoint() && BB.getKnownExecutionCount() > 0) {
-        Queue.push(BB.getIndex());
-        Visited.insert(BB.getIndex());
+    size_t SumReachableBBEC = 0;
+
+    Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) {
+      const BinaryBasicBlock *EntryBB = Function->getBasicBlockAtOffset(Offset);
+      if (EntryBB && EntryBB->getKnownExecutionCount() > 0) {
+        Queue.push(EntryBB->getLayoutIndex());
+        Visited.insert(EntryBB->getLayoutIndex());
+        SumReachableBBEC += EntryBB->getKnownExecutionCount();
       }
-    }
+      return true;
+    });
+
+    const FunctionLayout &Layout = Function->getLayout();
+
     while (!Queue.empty()) {
       const unsigned BBIndex = Queue.front();
-      const BinaryBasicBlock *BB = IndexToBB[BBIndex];
+      const BinaryBasicBlock *BB = Layout.getBlock(BBIndex);
       Queue.pop();
       auto SuccBIIter = BB->branch_info_begin();
       for (const BinaryBasicBlock *Succ : BB->successors()) {
@@ -134,25 +135,18 @@ void printCFGContinuityStats(raw_ostream &OS,
           ++SuccBIIter;
           continue;
         }
-        if (!Visited.insert(Succ->getIndex()).second) {
+        if (!Visited.insert(Succ->getLayoutIndex()).second) {
           ++SuccBIIter;
           continue;
         }
-        Queue.push(Succ->getIndex());
+        SumReachableBBEC += Succ->getKnownExecutionCount();
+        Queue.push(Succ->getLayoutIndex());
         ++SuccBIIter;
       }
     }
 
     const size_t NumReachableBBs = Visited.size();
 
-    // Loop through Visited, and sum the corresponding BBs' execution counts
-    // (ECs).
-    size_t SumReachableBBEC = 0;
-    for (const unsigned BBIndex : Visited) {
-      const BinaryBasicBlock *BB = IndexToBB[BBIndex];
-      SumReachableBBEC += BB->getKnownExecutionCount();
-    }
-
     const size_t NumPosECBBsUnreachableFromEntry =
         NumPosECBBs - NumReachableBBs;
     const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC;
@@ -162,7 +156,8 @@ void printCFGContinuityStats(raw_ostream &OS,
     if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) {
       OS << "Non-trivial CFG discontinuity observed in function "
          << Function->getPrintName() << "\n";
-      LLVM_DEBUG(Function->dump());
+      if (opts::Verbosity >= 3)
+        Function->dump();
     }
 
     NumUnreachables.push_back(NumPosECBBsUnreachableFromEntry);
@@ -176,11 +171,10 @@ void printCFGContinuityStats(raw_ostream &OS,
   std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end());
   const int Rank = int(FractionECUnreachables.size() *
                        opts::PercentileForProfileQualityCheck / 100);
-  OS << format("top %zu%% function CFG discontinuity is %.2lf%%\n",
-               100 - opts::PercentileForProfileQualityCheck,
+  OS << format("function CFG discontinuity %.2lf%%; ",
                FractionECUnreachables[Rank] * 100);
   if (opts::Verbosity >= 1) {
-    OS << "abbreviations: EC = execution count, POS BBs = positive EC BBs\n"
+    OS << "\nabbreviations: EC = execution count, POS BBs = positive EC BBs\n"
        << "distribution of NUM(unreachable POS BBs) per function\n";
     std::sort(NumUnreachables.begin(), NumUnreachables.end());
     printDistribution(OS, NumUnreachables);
@@ -200,8 +194,7 @@ void printCallGraphFlowConservationStats(
     FlowInfo &TotalFlowMap) {
   std::vector<double> CallGraphGaps;
 
-  for (auto it = Functions.begin(); it != Functions.end(); ++it) {
-    const BinaryFunction *Function = *it;
+  for (const BinaryFunction *Function : Functions) {
     if (Function->size() <= 1 || !Function->isSimple())
       continue;
 
@@ -223,20 +216,22 @@ void printCallGraphFlowConservationStats(
             continue;
           NumConsideredEntryBlocks++;
 
-          EntryInflow += IncomingMap[BB.getIndex()];
-          EntryOutflow += OutgoingMap[BB.getIndex()];
+          EntryInflow += IncomingMap[BB.getLayoutIndex()];
+          EntryOutflow += OutgoingMap[BB.getLayoutIndex()];
         }
       }
       uint64_t NetEntryOutflow = 0;
       if (EntryOutflow < EntryInflow) {
-        if (opts::Verbosity >= 1) {
+        if (opts::Verbosity >= 2) {
           // We expect entry blocks' CFG outflow >= inflow, i.e., it has a
           // non-negative net outflow. If this is not the case, then raise a
           // warning if requested.
-          OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow in "
+          OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow "
+                "in "
                 "function "
              << Function->getPrintName() << "\n";
-          LLVM_DEBUG(Function->dump());
+          if (opts::Verbosity >= 3)
+            Function->dump();
         }
       } else {
         NetEntryOutflow = EntryOutflow - EntryInflow;
@@ -252,7 +247,8 @@ void printCallGraphFlowConservationStats(
           OS << "Nontrivial call graph gap of size "
              << format("%.2lf%%", 100 * CallGraphGap)
              << " observed in function " << Function->getPrintName() << "\n";
-          LLVM_DEBUG(Function->dump());
+          if (opts::Verbosity >= 3)
+            Function->dump();
         }
 
         CallGraphGaps.push_back(CallGraphGap);
@@ -260,17 +256,17 @@ void printCallGraphFlowConservationStats(
     }
   }
 
-  if (!CallGraphGaps.empty()) {
-    std::sort(CallGraphGaps.begin(), CallGraphGaps.end());
-    const int Rank = int(CallGraphGaps.size() *
-                         opts::PercentileForProfileQualityCheck / 100);
-    OS << format("top %zu%% call graph flow conservation gap is %.2lf%%\n",
-                 100 - opts::PercentileForProfileQualityCheck,
-                 CallGraphGaps[Rank] * 100);
-    if (opts::Verbosity >= 1) {
-      OS << "distribution of function entry flow conservation gaps\n";
-      printDistribution(OS, CallGraphGaps, /*Fraction=*/true);
-    }
+  if (CallGraphGaps.empty())
+    return;
+
+  std::sort(CallGraphGaps.begin(), CallGraphGaps.end());
+  const int Rank =
+      int(CallGraphGaps.size() * opts::PercentileForProfileQualityCheck / 100);
+  OS << format("call graph flow conservation gap %.2lf%%; ",
+               CallGraphGaps[Rank] * 100);
+  if (opts::Verbosity >= 1) {
+    OS << "\ndistribution of function entry flow conservation gaps\n";
+    printDistribution(OS, CallGraphGaps, /*Fraction=*/true);
   }
 }
 
@@ -281,8 +277,7 @@ void printCFGFlowConservationStats(raw_ostream &OS,
   std::vector<double> CFGGapsWorst;
   std::vector<uint64_t> CFGGapsWorstAbs;
 
-  for (auto it = Functions.begin(); it != Functions.end(); ++it) {
-    const BinaryFunction *Function = *it;
+  for (const BinaryFunction *Function : Functions) {
     if (Function->size() <= 1 || !Function->isSimple())
       continue;
 
@@ -301,8 +296,8 @@ void printCFGFlowConservationStats(raw_ostream &OS,
       if (BB.isEntryPoint() || BB.succ_size() == 0)
         continue;
 
-      const uint64_t Max = MaxCountMaps[BB.getIndex()];
-      const uint64_t Min = MinCountMaps[BB.getIndex()];
+      const uint64_t Max = MaxCountMaps[BB.getLayoutIndex()];
+      const uint64_t Min = MinCountMaps[BB.getLayoutIndex()];
       const double Gap = 1 - (double)Min / Max;
       double Weight = BB.getKnownExecutionCount() * BB.getNumNonPseudos();
       if (Weight == 0)
@@ -335,7 +330,8 @@ void printCFGFlowConservationStats(raw_ostream &OS,
           OS << "Worst gap (absolute value): " << WorstGapAbs << " at BB with "
              << "input offset 0x"
              << Twine::utohexstr(BBWorstGapAbs->getInputOffset()) << "\n";
-        LLVM_DEBUG(Function->dump());
+        if (opts::Verbosity >= 3)
+          Function->dump();
       }
 
       CFGGapsWeightedAvg.push_back(WeightedGap);
@@ -344,30 +340,27 @@ void printCFGFlowConservationStats(raw_ostream &OS,
     }
   }
 
-  if (!CFGGapsWeightedAvg.empty()) {
-    std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end());
-    const int RankWA = int(CFGGapsWeightedAvg.size() *
-                           opts::PercentileForProfileQualityCheck / 100);
-    std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end());
-    const int RankW =
-        int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100);
-    OS << format(
-        "top %zu%% CFG flow conservation gap is %.2lf%% (weighted) and "
-        "%.2lf%% (worst)\n",
-        100 - opts::PercentileForProfileQualityCheck,
-        CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100);
-    if (opts::Verbosity >= 1) {
-      OS << "distribution of weighted CFG flow conservation gaps\n";
-      printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true);
-      OS << "Consider only blocks with execution counts > 500:\n"
-         << "distribution of worst block flow conservation gap per "
-            "function \n";
-      printDistribution(OS, CFGGapsWorst, /*Fraction=*/true);
-      OS << "distribution of worst block flow conservation gap (absolute "
-            "value) per function\n";
-      std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end());
-      printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false);
-    }
+  if (CFGGapsWeightedAvg.empty())
+    return;
+  std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end());
+  const int RankWA = int(CFGGapsWeightedAvg.size() *
+                         opts::PercentileForProfileQualityCheck / 100);
+  std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end());
+  const int RankW =
+      int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100);
+  OS << format("CFG flow conservation gap %.2lf%% (weighted) %.2lf%% (worst)\n",
+               CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100);
+  if (opts::Verbosity >= 1) {
+    OS << "distribution of weighted CFG flow conservation gaps\n";
+    printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true);
+    OS << "Consider only blocks with execution counts > 500:\n"
+       << "distribution of worst block flow conservation gap per "
+          "function \n";
+    printDistribution(OS, CFGGapsWorst, /*Fraction=*/true);
+    OS << "distribution of worst block flow conservation gap (absolute "
+          "value) per function\n";
+    std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end());
+    printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false);
   }
 }
 
@@ -391,10 +384,10 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
           continue;
         }
         TotalOutgoing += Count;
-        IncomingMap[Succ->getIndex()] += Count;
+        IncomingMap[Succ->getLayoutIndex()] += Count;
         ++SuccBIIter;
       }
-      OutgoingMap[BB.getIndex()] = TotalOutgoing;
+      OutgoingMap[BB.getLayoutIndex()] = TotalOutgoing;
     }
   }
 
@@ -412,7 +405,7 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
     FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum];
     FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum];
     for (const BinaryBasicBlock &BB : *Function) {
-      uint64_t BBNum = BB.getIndex();
+      uint64_t BBNum = BB.getLayoutIndex();
       MaxCountMap[BBNum] = std::max(IncomingMap[BBNum], OutgoingMap[BBNum]);
       MinCountMap[BBNum] = std::min(IncomingMap[BBNum], OutgoingMap[BBNum]);
     }
@@ -429,7 +422,8 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
 
     // Update MaxCountMap, MinCountMap, and CallGraphIncomingMap
     auto recordCall = [&](const BinaryBasicBlock *SourceBB,
-                          const MCSymbol *DestSymbol, uint64_t Count) {
+                          const MCSymbol *DestSymbol, uint64_t Count,
+                          uint64_t TotalCallCount) {
       if (Count == BinaryBasicBlock::COUNT_NO_PROFILE)
         Count = 0;
       const BinaryFunction *DstFunc =
@@ -437,9 +431,11 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
       if (DstFunc)
         CallGraphIncomingMap[DstFunc->getFunctionNumber()] += Count;
       if (SourceBB) {
-        unsigned BlockIndex = SourceBB->getIndex();
-        MaxCountMap[BlockIndex] = std::max(MaxCountMap[BlockIndex], Count);
-        MinCountMap[BlockIndex] = std::min(MinCountMap[BlockIndex], Count);
+        unsigned BlockIndex = SourceBB->getLayoutIndex();
+        MaxCountMap[BlockIndex] =
+            std::max(MaxCountMap[BlockIndex], TotalCallCount);
+        MinCountMap[BlockIndex] =
+            std::min(MinCountMap[BlockIndex], TotalCallCount);
       }
     };
 
@@ -452,7 +448,6 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
       CallInfoTy Counts;
       const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
 
-      // If this is an indirect call use perf data directly.
       if (!DstSym && BC.MIB->hasAnnotation(Inst, "CallProfile")) {
         const auto &ICSP = BC.MIB->getAnnotationAs<IndirectCallSiteProfile>(
             Inst, "CallProfile");
@@ -471,22 +466,25 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
     // directly. The call EC is only used to update CallGraphIncomingMap.
     if (!Function->hasValidProfile() && !Function->getAllCallSites().empty()) {
       for (const IndirectCallProfile &CSI : Function->getAllCallSites()) {
-        if (!CSI.Symbol)
-          continue;
-        recordCall(nullptr, CSI.Symbol, CSI.Count);
+        if (CSI.Symbol)
+          recordCall(nullptr, CSI.Symbol, CSI.Count, CSI.Count);
       }
       continue;
     } else {
       // If the function has a valid profile
-      for (BinaryBasicBlock &BB : *Function) {
-        for (MCInst &Inst : BB) {
-          if (!BC.MIB->isCall(Inst))
-            continue;
-          // Find call instructions and extract target symbols from each
-          // one.
-          const CallInfoTy CallInfo = getCallInfo(&BB, Inst);
-          for (const TargetDesc &CI : CallInfo) {
-            recordCall(&BB, CI.first, CI.second);
+      for (const BinaryBasicBlock &BB : *Function) {
+        for (const MCInst &Inst : BB) {
+          if (BC.MIB->isCall(Inst)) {
+            // Find call instructions and extract target symbols from each
+            // one.
+            const CallInfoTy CallInfo = getCallInfo(&BB, Inst);
+            // We need the total call count to update MaxCountMap and
+            // MinCountMap in recordCall for indirect calls
+            uint64_t TotalCallCount = 0;
+            for (const TargetDesc &CI : CallInfo)
+              TotalCallCount += CI.second;
+            for (const TargetDesc &CI : CallInfo)
+              recordCall(&BB, CI.first, CI.second, TotalCallCount);
           }
         }
       }
@@ -511,14 +509,12 @@ void printAll(BinaryContext &BC, FunctionListType &ValidFunctions,
   FlowInfo TotalFlowMap;
   computeFlowMappings(BC, TotalFlowMap);
 
-  BC.outs() << format("BOLT-INFO: among the hottest %zu functions ",
-                      RealNumTopFunctions);
+  BC.outs() << format("BOLT-INFO: profile quality metrics for the hottest %zu "
+                      "functions (reporting top %zu%% values): ",
+                      RealNumTopFunctions,
+                      100 - opts::PercentileForProfileQualityCheck);
   printCFGContinuityStats(BC.outs(), Functions);
-  BC.outs() << format("BOLT-INFO: among the hottest %zu functions ",
-                      RealNumTopFunctions);
   printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap);
-  BC.outs() << format("BOLT-INFO: among the hottest %zu functions ",
-                      RealNumTopFunctions);
   printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap);
 
   // Print more detailed bucketed stats if requested.
diff --git a/bolt/test/X86/profile-quality-reporting.test b/bolt/test/X86/profile-quality-reporting.test
index ee4ecb9853dff..2e15a6b245afa 100644
--- a/bolt/test/X86/profile-quality-reporting.test
+++ b/bolt/test/X86/profile-quality-reporting.test
@@ -1,6 +1,4 @@
 ## Check profile quality stats reporting
 RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe
 RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s
-CHECK: among the hottest 5 functions top 5% function CFG discontinuity is 100.00%
-CHECK: among the hottest 5 functions top 5% call graph flow conservation gap is 60.00%
-CHECK: among the hottest 5 functions top 5% CFG flow conservation gap is 45.53% (weighted) and 96.87% (worst)
+CHECK: profile quality metrics for the hottest 5 functions (reporting top 5% values): function CFG discontinuity 100.00%; call graph flow conservation gap 60.00%; CFG flow conservation gap 45.53% (weighted) 96.87% (worst)

>From 94eee67048447b79c3ad8d5ee9387a7bb8b3a96d Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Thu, 27 Feb 2025 11:53:02 -0800
Subject: [PATCH 3/4] fixup! fixup! [BOLT] Flow conservation scores

---
 .../include/bolt/Passes/ProfileQualityStats.h |   2 +-
 bolt/lib/Passes/ProfileQualityStats.cpp       | 224 +++++++++---------
 bolt/lib/Rewrite/BinaryPassManager.cpp        |  42 ++--
 3 files changed, 135 insertions(+), 133 deletions(-)

diff --git a/bolt/include/bolt/Passes/ProfileQualityStats.h b/bolt/include/bolt/Passes/ProfileQualityStats.h
index 761fd33431a46..86fc88cefc10e 100644
--- a/bolt/include/bolt/Passes/ProfileQualityStats.h
+++ b/bolt/include/bolt/Passes/ProfileQualityStats.h
@@ -56,7 +56,7 @@
 // CFG flow conservation property.
 //
 // The default value of 1000 above can be changed via the hidden BOLT option
-// `-num-functions-for-profile-quality-check=[N]`.
+// `-top-functions-for-profile-quality-check=[N]`.
 // The default reporting of the 95th percentile can be changed via the hidden
 // BOLT option `-percentile-for-profile-quality-check=[M]`.
 //
diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp
index 2f827847913ae..a5d0a621c37ce 100644
--- a/bolt/lib/Passes/ProfileQualityStats.cpp
+++ b/bolt/lib/Passes/ProfileQualityStats.cpp
@@ -24,8 +24,8 @@ using namespace bolt;
 
 namespace opts {
 extern cl::opt<unsigned> Verbosity;
-cl::opt<unsigned> NumFunctionsForProfileQualityCheck(
-    "num-functions-for-profile-quality-check",
+cl::opt<unsigned> TopFunctionsForProfileQualityCheck(
+    "top-functions-for-profile-quality-check",
     cl::desc("number of hottest functions to print aggregated "
              "profile quality stats of."),
     cl::init(1000), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
@@ -40,18 +40,16 @@ namespace {
 using FunctionListType = std::vector<const BinaryFunction *>;
 using function_iterator = FunctionListType::iterator;
 
-// BB index -> flow count
-using FlowMapTy = std::unordered_map<unsigned, uint64_t>;
-// Function number -> FlowMapTy
-using TotalFlowMapTy = std::unordered_map<uint64_t, FlowMapTy>;
+// Function number -> vector of flows for BBs in the function
+using TotalFlowMapTy = std::unordered_map<uint64_t, std::vector<uint64_t>>;
 // Function number -> flow count
 using FunctionFlowMapTy = std::unordered_map<uint64_t, uint64_t>;
 struct FlowInfo {
-  TotalFlowMapTy TotalIncomingMaps;
-  TotalFlowMapTy TotalOutgoingMaps;
+  TotalFlowMapTy TotalIncomingFlows;
+  TotalFlowMapTy TotalOutgoingFlows;
   TotalFlowMapTy TotalMaxCountMaps;
   TotalFlowMapTy TotalMinCountMaps;
-  FunctionFlowMapTy CallGraphIncomingMap;
+  FunctionFlowMapTy CallGraphIncomingFlows;
 };
 
 template <typename T>
@@ -69,7 +67,7 @@ void printDistribution(raw_ostream &OS, std::vector<T> &values,
       Rank = values.size() - 1;
     if (Fraction)
       OS << "  " << Text << std::string(11 - Text.length(), ' ') << ": "
-         << format("%.2lf%%", values[Rank] * 100) << "\n";
+         << formatv("{0:P}", values[Rank]) << "\n";
     else
       OS << "  " << Text << std::string(11 - Text.length(), ' ') << ": "
          << values[Rank] << "\n";
@@ -114,11 +112,11 @@ void printCFGContinuityStats(raw_ostream &OS,
 
     Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) {
       const BinaryBasicBlock *EntryBB = Function->getBasicBlockAtOffset(Offset);
-      if (EntryBB && EntryBB->getKnownExecutionCount() > 0) {
-        Queue.push(EntryBB->getLayoutIndex());
-        Visited.insert(EntryBB->getLayoutIndex());
-        SumReachableBBEC += EntryBB->getKnownExecutionCount();
-      }
+      if (!EntryBB || EntryBB->getKnownExecutionCount() == 0)
+        return true;
+      Queue.push(EntryBB->getLayoutIndex());
+      Visited.insert(EntryBB->getLayoutIndex());
+      SumReachableBBEC += EntryBB->getKnownExecutionCount();
       return true;
     });
 
@@ -128,20 +126,15 @@ void printCFGContinuityStats(raw_ostream &OS,
       const unsigned BBIndex = Queue.front();
       const BinaryBasicBlock *BB = Layout.getBlock(BBIndex);
       Queue.pop();
-      auto SuccBIIter = BB->branch_info_begin();
-      for (const BinaryBasicBlock *Succ : BB->successors()) {
-        const uint64_t Count = SuccBIIter->Count;
-        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
-          ++SuccBIIter;
-          continue;
-        }
-        if (!Visited.insert(Succ->getLayoutIndex()).second) {
-          ++SuccBIIter;
+      for (const auto &Tuple : llvm::zip(BB->successors(), BB->branch_info())) {
+        const auto *Succ = std::get<0>(Tuple);
+        const auto &BI = std::get<1>(Tuple);
+        const uint64_t Count = BI.Count;
+        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0 ||
+            !Visited.insert(Succ->getLayoutIndex()).second)
           continue;
-        }
         SumReachableBBEC += Succ->getKnownExecutionCount();
         Queue.push(Succ->getLayoutIndex());
-        ++SuccBIIter;
       }
     }
 
@@ -168,19 +161,19 @@ void printCFGContinuityStats(raw_ostream &OS,
   if (FractionECUnreachables.empty())
     return;
 
-  std::sort(FractionECUnreachables.begin(), FractionECUnreachables.end());
+  llvm::sort(FractionECUnreachables);
   const int Rank = int(FractionECUnreachables.size() *
                        opts::PercentileForProfileQualityCheck / 100);
-  OS << format("function CFG discontinuity %.2lf%%; ",
-               FractionECUnreachables[Rank] * 100);
+  OS << formatv("function CFG discontinuity {0:P}; ",
+                FractionECUnreachables[Rank]);
   if (opts::Verbosity >= 1) {
     OS << "\nabbreviations: EC = execution count, POS BBs = positive EC BBs\n"
        << "distribution of NUM(unreachable POS BBs) per function\n";
-    std::sort(NumUnreachables.begin(), NumUnreachables.end());
+    llvm::sort(NumUnreachables);
     printDistribution(OS, NumUnreachables);
 
     OS << "distribution of SUM_EC(unreachable POS BBs) per function\n";
-    std::sort(SumECUnreachables.begin(), SumECUnreachables.end());
+    llvm::sort(SumECUnreachables);
     printDistribution(OS, SumECUnreachables);
 
     OS << "distribution of [(SUM_EC(unreachable POS BBs) / SUM_EC(all "
@@ -199,27 +192,31 @@ void printCallGraphFlowConservationStats(
       continue;
 
     const uint64_t FunctionNum = Function->getFunctionNumber();
-    FlowMapTy &IncomingMap = TotalFlowMap.TotalIncomingMaps[FunctionNum];
-    FlowMapTy &OutgoingMap = TotalFlowMap.TotalOutgoingMaps[FunctionNum];
-    FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap;
+    std::vector<uint64_t> &IncomingFlows =
+        TotalFlowMap.TotalIncomingFlows[FunctionNum];
+    std::vector<uint64_t> &OutgoingFlows =
+        TotalFlowMap.TotalOutgoingFlows[FunctionNum];
+    FunctionFlowMapTy &CallGraphIncomingFlows =
+        TotalFlowMap.CallGraphIncomingFlows;
 
     // Only consider functions that are not a program entry.
-    if (CallGraphIncomingMap.find(FunctionNum) != CallGraphIncomingMap.end()) {
+    if (CallGraphIncomingFlows.find(FunctionNum) !=
+        CallGraphIncomingFlows.end()) {
       uint64_t EntryInflow = 0;
       uint64_t EntryOutflow = 0;
       uint32_t NumConsideredEntryBlocks = 0;
-      for (const BinaryBasicBlock &BB : *Function) {
-        if (BB.isEntryPoint()) {
-          // If entry is an exit, then we don't consider it for flow
-          // conservation
-          if (BB.succ_size() == 0)
-            continue;
-          NumConsideredEntryBlocks++;
 
-          EntryInflow += IncomingMap[BB.getLayoutIndex()];
-          EntryOutflow += OutgoingMap[BB.getLayoutIndex()];
-        }
-      }
+      Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) {
+        const BinaryBasicBlock *EntryBB =
+            Function->getBasicBlockAtOffset(Offset);
+        if (!EntryBB || EntryBB->succ_size() == 0)
+          return true;
+        NumConsideredEntryBlocks++;
+        EntryInflow += IncomingFlows[EntryBB->getLayoutIndex()];
+        EntryOutflow += OutgoingFlows[EntryBB->getLayoutIndex()];
+        return true;
+      });
+
       uint64_t NetEntryOutflow = 0;
       if (EntryOutflow < EntryInflow) {
         if (opts::Verbosity >= 2) {
@@ -227,8 +224,7 @@ void printCallGraphFlowConservationStats(
           // non-negative net outflow. If this is not the case, then raise a
           // warning if requested.
           OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow "
-                "in "
-                "function "
+                "in function "
              << Function->getPrintName() << "\n";
           if (opts::Verbosity >= 3)
             Function->dump();
@@ -238,15 +234,15 @@ void printCallGraphFlowConservationStats(
       }
       if (NumConsideredEntryBlocks > 0) {
         const uint64_t CallGraphInflow =
-            TotalFlowMap.CallGraphIncomingMap[Function->getFunctionNumber()];
+            TotalFlowMap.CallGraphIncomingFlows[Function->getFunctionNumber()];
         const uint64_t Min = std::min(NetEntryOutflow, CallGraphInflow);
         const uint64_t Max = std::max(NetEntryOutflow, CallGraphInflow);
         const double CallGraphGap = 1 - (double)Min / Max;
 
         if (opts::Verbosity >= 2 && CallGraphGap >= 0.5) {
           OS << "Nontrivial call graph gap of size "
-             << format("%.2lf%%", 100 * CallGraphGap)
-             << " observed in function " << Function->getPrintName() << "\n";
+             << formatv("{0:P}", CallGraphGap) << " observed in function "
+             << Function->getPrintName() << "\n";
           if (opts::Verbosity >= 3)
             Function->dump();
         }
@@ -259,11 +255,11 @@ void printCallGraphFlowConservationStats(
   if (CallGraphGaps.empty())
     return;
 
-  std::sort(CallGraphGaps.begin(), CallGraphGaps.end());
+  llvm::sort(CallGraphGaps);
   const int Rank =
       int(CallGraphGaps.size() * opts::PercentileForProfileQualityCheck / 100);
-  OS << format("call graph flow conservation gap %.2lf%%; ",
-               CallGraphGaps[Rank] * 100);
+  OS << formatv("call graph flow conservation gap {0:P}; ",
+                CallGraphGaps[Rank]);
   if (opts::Verbosity >= 1) {
     OS << "\ndistribution of function entry flow conservation gaps\n";
     printDistribution(OS, CallGraphGaps, /*Fraction=*/true);
@@ -282,8 +278,10 @@ void printCFGFlowConservationStats(raw_ostream &OS,
       continue;
 
     const uint64_t FunctionNum = Function->getFunctionNumber();
-    FlowMapTy &MaxCountMaps = TotalFlowMap.TotalMaxCountMaps[FunctionNum];
-    FlowMapTy &MinCountMaps = TotalFlowMap.TotalMinCountMaps[FunctionNum];
+    std::vector<uint64_t> &MaxCountMaps =
+        TotalFlowMap.TotalMaxCountMaps[FunctionNum];
+    std::vector<uint64_t> &MinCountMaps =
+        TotalFlowMap.TotalMinCountMaps[FunctionNum];
     double WeightedGapSum = 0.0;
     double WeightSum = 0.0;
     double WorstGap = 0.0;
@@ -321,9 +319,9 @@ void printCFGFlowConservationStats(raw_ostream &OS,
       if (opts::Verbosity >= 2 && (WeightedGap >= 0.1 || WorstGap >= 0.9)) {
         OS << "Nontrivial CFG gap observed in function "
            << Function->getPrintName() << "\n"
-           << "Weighted gap: " << format("%.2lf%%", 100 * WeightedGap) << "\n";
+           << "Weighted gap: " << formatv("{0:P}", WeightedGap) << "\n";
         if (BBWorstGap)
-          OS << "Worst gap: " << format("%.2lf%%", 100 * WorstGap)
+          OS << "Worst gap: " << formatv("{0:P}", WorstGap)
              << " at BB with input offset: 0x"
              << Twine::utohexstr(BBWorstGap->getInputOffset()) << "\n";
         if (BBWorstGapAbs)
@@ -342,14 +340,14 @@ void printCFGFlowConservationStats(raw_ostream &OS,
 
   if (CFGGapsWeightedAvg.empty())
     return;
-  std::sort(CFGGapsWeightedAvg.begin(), CFGGapsWeightedAvg.end());
+  llvm::sort(CFGGapsWeightedAvg);
   const int RankWA = int(CFGGapsWeightedAvg.size() *
                          opts::PercentileForProfileQualityCheck / 100);
-  std::sort(CFGGapsWorst.begin(), CFGGapsWorst.end());
+  llvm::sort(CFGGapsWorst);
   const int RankW =
       int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100);
-  OS << format("CFG flow conservation gap %.2lf%% (weighted) %.2lf%% (worst)\n",
-               CFGGapsWeightedAvg[RankWA] * 100, CFGGapsWorst[RankW] * 100);
+  OS << formatv("CFG flow conservation gap {0:P} (weighted) {1:P} (worst)\n",
+                CFGGapsWeightedAvg[RankWA], CFGGapsWorst[RankW]);
   if (opts::Verbosity >= 1) {
     OS << "distribution of weighted CFG flow conservation gaps\n";
     printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true);
@@ -359,68 +357,74 @@ void printCFGFlowConservationStats(raw_ostream &OS,
     printDistribution(OS, CFGGapsWorst, /*Fraction=*/true);
     OS << "distribution of worst block flow conservation gap (absolute "
           "value) per function\n";
-    std::sort(CFGGapsWorstAbs.begin(), CFGGapsWorstAbs.end());
+    llvm::sort(CFGGapsWorstAbs);
     printDistribution(OS, CFGGapsWorstAbs, /*Fraction=*/false);
   }
 }
 
 void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
   // Increment block inflow and outflow with CFG jump counts.
-  TotalFlowMapTy &TotalIncomingMaps = TotalFlowMap.TotalIncomingMaps;
-  TotalFlowMapTy &TotalOutgoingMaps = TotalFlowMap.TotalOutgoingMaps;
+  TotalFlowMapTy &TotalIncomingFlows = TotalFlowMap.TotalIncomingFlows;
+  TotalFlowMapTy &TotalOutgoingFlows = TotalFlowMap.TotalOutgoingFlows;
   for (const auto &BFI : BC.getBinaryFunctions()) {
     const BinaryFunction *Function = &BFI.second;
+    std::vector<uint64_t> &IncomingFlows =
+        TotalIncomingFlows[Function->getFunctionNumber()];
+    std::vector<uint64_t> &OutgoingFlows =
+        TotalOutgoingFlows[Function->getFunctionNumber()];
+    const uint64_t NumBlocks = Function->size();
+    IncomingFlows.resize(NumBlocks, 0);
+    OutgoingFlows.resize(NumBlocks, 0);
     if (Function->empty() || !Function->hasValidProfile())
       continue;
-    FlowMapTy &IncomingMap = TotalIncomingMaps[Function->getFunctionNumber()];
-    FlowMapTy &OutgoingMap = TotalOutgoingMaps[Function->getFunctionNumber()];
     for (const BinaryBasicBlock &BB : *Function) {
       uint64_t TotalOutgoing = 0ULL;
-      auto SuccBIIter = BB.branch_info_begin();
-      for (BinaryBasicBlock *Succ : BB.successors()) {
-        const uint64_t Count = SuccBIIter->Count;
-        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
-          ++SuccBIIter;
+      for (const auto &Tuple : llvm::zip(BB.successors(), BB.branch_info())) {
+        const auto *Succ = std::get<0>(Tuple);
+        const auto &BI = std::get<1>(Tuple);
+        const uint64_t Count = BI.Count;
+        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0)
           continue;
-        }
         TotalOutgoing += Count;
-        IncomingMap[Succ->getLayoutIndex()] += Count;
-        ++SuccBIIter;
+        IncomingFlows[Succ->getLayoutIndex()] += Count;
       }
-      OutgoingMap[BB.getLayoutIndex()] = TotalOutgoing;
+      OutgoingFlows[BB.getLayoutIndex()] = TotalOutgoing;
     }
   }
-
   // Initialize TotalMaxCountMaps and TotalMinCountMaps using
-  // TotalIncomingMaps and TotalOutgoingMaps
+  // TotalIncomingFlows and TotalOutgoingFlows
   TotalFlowMapTy &TotalMaxCountMaps = TotalFlowMap.TotalMaxCountMaps;
   TotalFlowMapTy &TotalMinCountMaps = TotalFlowMap.TotalMinCountMaps;
   for (const auto &BFI : BC.getBinaryFunctions()) {
     const BinaryFunction *Function = &BFI.second;
+    uint64_t FunctionNum = Function->getFunctionNumber();
+    std::vector<uint64_t> &IncomingFlows = TotalIncomingFlows[FunctionNum];
+    std::vector<uint64_t> &OutgoingFlows = TotalOutgoingFlows[FunctionNum];
+    std::vector<uint64_t> &MaxCountMap = TotalMaxCountMaps[FunctionNum];
+    std::vector<uint64_t> &MinCountMap = TotalMinCountMaps[FunctionNum];
+    const uint64_t NumBlocks = Function->size();
+    MaxCountMap.resize(NumBlocks, 0);
+    MinCountMap.resize(NumBlocks, 0);
     if (Function->empty() || !Function->hasValidProfile())
       continue;
-    uint64_t FunctionNum = Function->getFunctionNumber();
-    FlowMapTy &IncomingMap = TotalIncomingMaps[FunctionNum];
-    FlowMapTy &OutgoingMap = TotalOutgoingMaps[FunctionNum];
-    FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum];
-    FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum];
     for (const BinaryBasicBlock &BB : *Function) {
       uint64_t BBNum = BB.getLayoutIndex();
-      MaxCountMap[BBNum] = std::max(IncomingMap[BBNum], OutgoingMap[BBNum]);
-      MinCountMap[BBNum] = std::min(IncomingMap[BBNum], OutgoingMap[BBNum]);
+      MaxCountMap[BBNum] = std::max(IncomingFlows[BBNum], OutgoingFlows[BBNum]);
+      MinCountMap[BBNum] = std::min(IncomingFlows[BBNum], OutgoingFlows[BBNum]);
     }
   }
 
   // Modify TotalMaxCountMaps and TotalMinCountMaps using call counts and
-  // fill out CallGraphIncomingMap
-  FunctionFlowMapTy &CallGraphIncomingMap = TotalFlowMap.CallGraphIncomingMap;
+  // fill out CallGraphIncomingFlows
+  FunctionFlowMapTy &CallGraphIncomingFlows =
+      TotalFlowMap.CallGraphIncomingFlows;
   for (const auto &BFI : BC.getBinaryFunctions()) {
     const BinaryFunction *Function = &BFI.second;
     uint64_t FunctionNum = Function->getFunctionNumber();
-    FlowMapTy &MaxCountMap = TotalMaxCountMaps[FunctionNum];
-    FlowMapTy &MinCountMap = TotalMinCountMaps[FunctionNum];
+    std::vector<uint64_t> &MaxCountMap = TotalMaxCountMaps[FunctionNum];
+    std::vector<uint64_t> &MinCountMap = TotalMinCountMaps[FunctionNum];
 
-    // Update MaxCountMap, MinCountMap, and CallGraphIncomingMap
+    // Update MaxCountMap, MinCountMap, and CallGraphIncomingFlows
     auto recordCall = [&](const BinaryBasicBlock *SourceBB,
                           const MCSymbol *DestSymbol, uint64_t Count,
                           uint64_t TotalCallCount) {
@@ -429,7 +433,7 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
       const BinaryFunction *DstFunc =
           DestSymbol ? BC.getFunctionForSymbol(DestSymbol) : nullptr;
       if (DstFunc)
-        CallGraphIncomingMap[DstFunc->getFunctionNumber()] += Count;
+        CallGraphIncomingFlows[DstFunc->getFunctionNumber()] += Count;
       if (SourceBB) {
         unsigned BlockIndex = SourceBB->getLayoutIndex();
         MaxCountMap[BlockIndex] =
@@ -449,9 +453,8 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
       const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
 
       if (!DstSym && BC.MIB->hasAnnotation(Inst, "CallProfile")) {
-        const auto &ICSP = BC.MIB->getAnnotationAs<IndirectCallSiteProfile>(
-            Inst, "CallProfile");
-        for (const IndirectCallProfile &CSI : ICSP)
+        for (const auto &CSI : BC.MIB->getAnnotationAs<IndirectCallSiteProfile>(
+                 Inst, "CallProfile"))
           if (CSI.Symbol)
             Counts.emplace_back(CSI.Symbol, CSI.Count);
       } else {
@@ -463,29 +466,28 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
     };
 
     // If the function has an invalid profile, try to use the perf data
-    // directly. The call EC is only used to update CallGraphIncomingMap.
+    // directly. The call EC is only used to update CallGraphIncomingFlows.
     if (!Function->hasValidProfile() && !Function->getAllCallSites().empty()) {
-      for (const IndirectCallProfile &CSI : Function->getAllCallSites()) {
+      for (const IndirectCallProfile &CSI : Function->getAllCallSites())
         if (CSI.Symbol)
           recordCall(nullptr, CSI.Symbol, CSI.Count, CSI.Count);
-      }
       continue;
     } else {
       // If the function has a valid profile
       for (const BinaryBasicBlock &BB : *Function) {
         for (const MCInst &Inst : BB) {
-          if (BC.MIB->isCall(Inst)) {
-            // Find call instructions and extract target symbols from each
-            // one.
-            const CallInfoTy CallInfo = getCallInfo(&BB, Inst);
-            // We need the total call count to update MaxCountMap and
-            // MinCountMap in recordCall for indirect calls
-            uint64_t TotalCallCount = 0;
-            for (const TargetDesc &CI : CallInfo)
-              TotalCallCount += CI.second;
-            for (const TargetDesc &CI : CallInfo)
-              recordCall(&BB, CI.first, CI.second, TotalCallCount);
-          }
+          if (!BC.MIB->isCall(Inst))
+            continue;
+          // Find call instructions and extract target symbols from each
+          // one.
+          const CallInfoTy CallInfo = getCallInfo(&BB, Inst);
+          // We need the total call count to update MaxCountMap and
+          // MinCountMap in recordCall for indirect calls
+          uint64_t TotalCallCount = 0;
+          for (const TargetDesc &CI : CallInfo)
+            TotalCallCount += CI.second;
+          for (const TargetDesc &CI : CallInfo)
+            recordCall(&BB, CI.first, CI.second, TotalCallCount);
         }
       }
     }
@@ -567,9 +569,9 @@ Error PrintProfileQualityStats::runOnFunctions(BinaryContext &BC) {
     if (PrintProfileQualityStats::shouldOptimize(*Function))
       ValidFunctions.push_back(Function);
   }
-  if (ValidFunctions.empty() || opts::NumFunctionsForProfileQualityCheck == 0)
+  if (ValidFunctions.empty() || opts::TopFunctionsForProfileQualityCheck == 0)
     return Error::success();
 
-  printAll(BC, ValidFunctions, opts::NumFunctionsForProfileQualityCheck);
+  printAll(BC, ValidFunctions, opts::TopFunctionsForProfileQualityCheck);
   return Error::success();
 }
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 6f750b95adf9a..dd48653931eb9 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -58,10 +58,10 @@ extern cl::opt<bolt::IdenticalCodeFolding::ICFLevel, false,
                llvm::bolt::DeprecatedICFNumericOptionParser>
     ICF;
 
-static cl::opt<bool> DynoStatsAll("dyno-stats-all",
-                                  cl::desc("print dyno stats after each stage"),
-                                  cl::ZeroOrMore, cl::Hidden,
-                                  cl::cat(BoltCategory));
+static cl::opt<bool>
+DynoStatsAll("dyno-stats-all",
+  cl::desc("print dyno stats after each stage"),
+  cl::ZeroOrMore, cl::Hidden, cl::cat(BoltCategory));
 
 static cl::opt<bool>
     EliminateUnreachable("eliminate-unreachable",
@@ -82,24 +82,25 @@ cl::opt<bool>
 cl::opt<bool> NeverPrint("never-print", cl::desc("never print"),
                          cl::ReallyHidden, cl::cat(BoltOptCategory));
 
-cl::opt<bool> PrintAfterBranchFixup(
-    "print-after-branch-fixup",
-    cl::desc("print function after fixing local branches"), cl::Hidden,
-    cl::cat(BoltOptCategory));
+cl::opt<bool>
+PrintAfterBranchFixup("print-after-branch-fixup",
+  cl::desc("print function after fixing local branches"),
+  cl::Hidden, cl::cat(BoltOptCategory));
 
 static cl::opt<bool>
-    PrintAfterLowering("print-after-lowering",
-                       cl::desc("print function after instruction lowering"),
-                       cl::Hidden, cl::cat(BoltOptCategory));
+PrintAfterLowering("print-after-lowering",
+  cl::desc("print function after instruction lowering"),
+  cl::Hidden, cl::cat(BoltOptCategory));
 
 static cl::opt<bool> PrintEstimateEdgeCounts(
     "print-estimate-edge-counts",
     cl::desc("print function after edge counts are set for no-LBR profile"),
     cl::Hidden, cl::cat(BoltOptCategory));
 
-cl::opt<bool> PrintFinalized("print-finalized",
-                             cl::desc("print function after CFG is finalized"),
-                             cl::Hidden, cl::cat(BoltOptCategory));
+cl::opt<bool>
+PrintFinalized("print-finalized",
+  cl::desc("print function after CFG is finalized"),
+  cl::Hidden, cl::cat(BoltOptCategory));
 
 static cl::opt<bool>
     PrintFOP("print-fop",
@@ -234,13 +235,12 @@ static cl::opt<bool> SimplifyRODataLoads(
              "operand with the constant found in the corresponding section"),
     cl::cat(BoltOptCategory));
 
-static cl::list<std::string> SpecializeMemcpy1(
-    "memcpy1-spec",
-    cl::desc(
-        "list of functions with call sites for which to specialize memcpy() "
-        "for size 1"),
-    cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."), cl::ZeroOrMore,
-    cl::cat(BoltOptCategory));
+static cl::list<std::string>
+SpecializeMemcpy1("memcpy1-spec",
+  cl::desc("list of functions with call sites for which to specialize memcpy() "
+           "for size 1"),
+  cl::value_desc("func1,func2:cs1:cs2,func3:cs1,..."),
+  cl::ZeroOrMore, cl::cat(BoltOptCategory));
 
 static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
                            cl::cat(BoltOptCategory));

>From c2962d92676d07270240470a3791056083a09d70 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Thu, 27 Feb 2025 19:58:27 -0800
Subject: [PATCH 4/4] fixup! fixup! fixup! [BOLT] Flow conservation scores

---
 bolt/lib/Passes/ProfileQualityStats.cpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp
index a5d0a621c37ce..78e6412f56ba1 100644
--- a/bolt/lib/Passes/ProfileQualityStats.cpp
+++ b/bolt/lib/Passes/ProfileQualityStats.cpp
@@ -126,9 +126,8 @@ void printCFGContinuityStats(raw_ostream &OS,
       const unsigned BBIndex = Queue.front();
       const BinaryBasicBlock *BB = Layout.getBlock(BBIndex);
       Queue.pop();
-      for (const auto &Tuple : llvm::zip(BB->successors(), BB->branch_info())) {
-        const auto *Succ = std::get<0>(Tuple);
-        const auto &BI = std::get<1>(Tuple);
+      for (const auto &[Succ, BI] :
+           llvm::zip(BB->successors(), BB->branch_info())) {
         const uint64_t Count = BI.Count;
         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0 ||
             !Visited.insert(Succ->getLayoutIndex()).second)
@@ -272,7 +271,9 @@ void printCFGFlowConservationStats(raw_ostream &OS,
   std::vector<double> CFGGapsWeightedAvg;
   std::vector<double> CFGGapsWorst;
   std::vector<uint64_t> CFGGapsWorstAbs;
-
+  // We only consider blocks with execution counts > MinBlockCount when
+  // reporting the distribution of worst gaps.
+  const uint16_t MinBlockCount = 500;
   for (const BinaryFunction *Function : Functions) {
     if (Function->size() <= 1 || !Function->isSimple())
       continue;
@@ -305,11 +306,12 @@ void printCFGFlowConservationStats(raw_ostream &OS,
       Weight = log(Weight);
       WeightedGapSum += Gap * Weight;
       WeightSum += Weight;
-      if (BB.getKnownExecutionCount() > 500 && Gap > WorstGap) {
+      if (BB.getKnownExecutionCount() > MinBlockCount && Gap > WorstGap) {
         WorstGap = Gap;
         BBWorstGap = &BB;
       }
-      if (BB.getKnownExecutionCount() > 500 && Max - Min > WorstGapAbs) {
+      if (BB.getKnownExecutionCount() > MinBlockCount &&
+          Max - Min > WorstGapAbs) {
         WorstGapAbs = Max - Min;
         BBWorstGapAbs = &BB;
       }
@@ -351,7 +353,8 @@ void printCFGFlowConservationStats(raw_ostream &OS,
   if (opts::Verbosity >= 1) {
     OS << "distribution of weighted CFG flow conservation gaps\n";
     printDistribution(OS, CFGGapsWeightedAvg, /*Fraction=*/true);
-    OS << "Consider only blocks with execution counts > 500:\n"
+    OS << format("Consider only blocks with execution counts > %zu:\n",
+                 MinBlockCount)
        << "distribution of worst block flow conservation gap per "
           "function \n";
     printDistribution(OS, CFGGapsWorst, /*Fraction=*/true);
@@ -379,9 +382,8 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
       continue;
     for (const BinaryBasicBlock &BB : *Function) {
       uint64_t TotalOutgoing = 0ULL;
-      for (const auto &Tuple : llvm::zip(BB.successors(), BB.branch_info())) {
-        const auto *Succ = std::get<0>(Tuple);
-        const auto &BI = std::get<1>(Tuple);
+      for (const auto &[Succ, BI] :
+           llvm::zip(BB.successors(), BB.branch_info())) {
         const uint64_t Count = BI.Count;
         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0)
           continue;



More information about the llvm-commits mailing list