[llvm] 0a0800c - A post-processing for BFI inference

Fri Jun 11 21:52:24 PDT 2021

Author: spupyrev
Date: 2021-06-11T21:46:04-07:00
New Revision: 0a0800c4d10c250ffb152b5f059d6f9a19ed8efe

URL: https://github.com/llvm/llvm-project/commit/0a0800c4d10c250ffb152b5f059d6f9a19ed8efe
DIFF: https://github.com/llvm/llvm-project/commit/0a0800c4d10c250ffb152b5f059d6f9a19ed8efe.diff

LOG: A post-processing for BFI inference

The current implementation for computing relative block frequencies does
not handle correctly control-flow graphs containing irreducible loops. This
results in suboptimally generated binaries, whose perf can be up to 5%
worse than optimal.

To resolve the problem, we apply a post-processing step, which iteratively
updates block frequencies based on the frequencies of their predesessors.
This corresponds to finding the stationary point of the Markov chain by
an iterative method aka "PageRank computation". The algorithm takes at
most O(|E| * IterativeBFIMaxIterations) steps but typically converges faster.

It is turned on by passing option `use-iterative-bfi-inference`
and applied only for functions containing profile data and irreducible loops.

Tested on SPEC06/17, where it is helping to get correct profile counts for one of
the binaries (403.gcc). In prod binaries, we've seen a speedup of up to 2%-5%
for binaries containing functions with hot irreducible loops.

Reviewed By: hoy, wenlei, davidxl

Differential Revision: https://reviews.llvm.org/D103289

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/profile-correlation-irreducible-loops.prof
    llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll

Modified: 
    llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
    llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 4e5cd952b353a..268377db9507a 100644

--- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -42,7 +42,9 @@
 #include <iterator>
 #include <limits>
 #include <list>
+#include <queue>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -51,6 +53,10 @@
 namespace llvm {
 extern llvm::cl::opt<bool> CheckBFIUnknownBlockQueries;
 
+extern llvm::cl::opt<bool> UseIterativeBFIInference;
+extern llvm::cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock;
+extern llvm::cl::opt<double> IterativeBFIPrecision;
+
 class BranchProbabilityInfo;
 class Function;
 class Loop;
@@ -967,6 +973,45 @@ template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
     return bfi_detail::getBlockName(getBlock(Node));
   }
 
+  /// The current implementation for computing relative block frequencies does
+  /// not handle correctly control-flow graphs containing irreducible loops. To
+  /// resolve the problem, we apply a post-processing step, which iteratively
+  /// updates block frequencies based on the frequencies of their predesessors.
+  /// This corresponds to finding the stationary point of the Markov chain by
+  /// an iterative method aka "PageRank computation".
+  /// The algorithm takes at most O(|E| * IterativeBFIMaxIterations) steps but
+  /// typically converges faster.
+  ///
+  /// Decide whether we want to apply iterative inference for a given function.
+  bool needIterativeInference() const;
+
+  /// Apply an iterative post-processing to infer correct counts for irr loops.
+  void applyIterativeInference();
+
+  using ProbMatrixType = std::vector<std::vector<std::pair<size_t, Scaled64>>>;
+
+  /// Run iterative inference for a probability matrix and initial frequencies.
+  void iterativeInference(const ProbMatrixType &ProbMatrix,
+                          std::vector<Scaled64> &Freq) const;
+
+  /// Find all blocks to apply inference on, that is, reachable from the entry
+  /// and backward reachable from exists along edges with positive probability.
+  void findReachableBlocks(std::vector<const BlockT *> &Blocks) const;
+
+  /// Build a matrix of probabilities with transitions (edges) between the
+  /// blocks: ProbMatrix[I] holds pairs (J, P), where Pr[J -> I | J] = P
+  void initTransitionProbabilities(
+      const std::vector<const BlockT *> &Blocks,
+      const DenseMap<const BlockT *, size_t> &BlockIndex,
+      ProbMatrixType &ProbMatrix) const;
+
+#ifndef NDEBUG
+  /// Compute the discrepancy between current block frequencies and the
+  /// probability matrix.
+  Scaled64 discrepancy(const ProbMatrixType &ProbMatrix,
+                       const std::vector<Scaled64> &Freq) const;
+#endif
+
 public:
   BlockFrequencyInfoImpl() = default;
 
@@ -1093,6 +1138,10 @@ void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
   computeMassInLoops();
   computeMassInFunction();
   unwrapLoops();
+  // Apply a post-processing step improving computed frequencies for functions
+  // with irreducible loops.
+  if (needIterativeInference())
+    applyIterativeInference();
   finalizeMetrics();
 
   if (CheckBFIUnknownBlockQueries) {
@@ -1313,6 +1362,294 @@ template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
   llvm_unreachable("unhandled irreducible control flow");
 }
 
+template <class BT>
+bool BlockFrequencyInfoImpl<BT>::needIterativeInference() const {
+  if (!UseIterativeBFIInference)
+    return false;
+  if (!F->getFunction().hasProfileData())
+    return false;
+  // Apply iterative inference only if the function contains irreducible loops;
+  // otherwise, computed block frequencies are reasonably correct.
+  for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
+    if (L->isIrreducible())
+      return true;
+  }
+  return false;
+}
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::applyIterativeInference() {
+  // Extract blocks for processing: a block is considered for inference iff it
+  // can be reached from the entry by edges with a positive probability.
+  // Non-processed blocks are assigned with the zero frequency and are ignored
+  // in the computation
+  std::vector<const BlockT *> ReachableBlocks;
+  findReachableBlocks(ReachableBlocks);
+  if (ReachableBlocks.empty())
+    return;
+
+  // The map is used to to index successors/predecessors of reachable blocks in
+  // the ReachableBlocks vector
+  DenseMap<const BlockT *, size_t> BlockIndex;
+  // Extract initial frequencies for the reachable blocks
+  auto Freq = std::vector<Scaled64>(ReachableBlocks.size());
+  Scaled64 SumFreq;
+  for (size_t I = 0; I < ReachableBlocks.size(); I++) {
+    const BlockT *BB = ReachableBlocks[I];
+    BlockIndex[BB] = I;
+    Freq[I] = getFloatingBlockFreq(BB);
+    SumFreq += Freq[I];
+  }
+  assert(!SumFreq.isZero() && "empty initial block frequencies");
+
+  LLVM_DEBUG(dbgs() << "Applying iterative inference for " << F->getName()
+                    << " with " << ReachableBlocks.size() << " blocks\n");
+
+  // Normalizing frequencies so they sum up to 1.0
+  for (auto &Value : Freq) {
+    Value /= SumFreq;
+  }
+
+  // Setting up edge probabilities using sparse matrix representation:
+  // ProbMatrix[I] holds a vector of pairs (J, P) where Pr[J -> I | J] = P
+  ProbMatrixType ProbMatrix;
+  initTransitionProbabilities(ReachableBlocks, BlockIndex, ProbMatrix);
+
+  // Run the propagation
+  iterativeInference(ProbMatrix, Freq);
+
+  // Assign computed frequency values
+  for (const BlockT &BB : *F) {
+    auto Node = getNode(&BB);
+    if (!Node.isValid())
+      continue;
+    if (BlockIndex.count(&BB)) {
+      Freqs[Node.Index].Scaled = Freq[BlockIndex[&BB]];
+    } else {
+      Freqs[Node.Index].Scaled = Scaled64::getZero();
+    }
+  }
+}
+
+template <class BT>
+void BlockFrequencyInfoImpl<BT>::iterativeInference(
+    const ProbMatrixType &ProbMatrix, std::vector<Scaled64> &Freq) const {
+  assert(0.0 < IterativeBFIPrecision && IterativeBFIPrecision < 1.0 &&
+         "incorrectly specified precision");
+  // Convert double precision to Scaled64
+  const auto Precision =
+      Scaled64::getInverse(static_cast<uint64_t>(1.0 / IterativeBFIPrecision));
+  const size_t MaxIterations = IterativeBFIMaxIterationsPerBlock * Freq.size();
+
+#ifndef NDEBUG
+  LLVM_DEBUG(dbgs() << "  Initial discrepancy = "
+                    << discrepancy(ProbMatrix, Freq).toString() << "\n");
+#endif
+
+  // Successors[I] holds unique sucessors of the I-th block
+  auto Successors = std::vector<std::vector<size_t>>(Freq.size());
+  for (size_t I = 0; I < Freq.size(); I++) {
+    for (auto &Jump : ProbMatrix[I]) {
+      Successors[Jump.first].push_back(I);
+    }
+  }
+
+  // To speedup computation, we maintain a set of "active" blocks whose
+  // frequencies need to be updated based on the incoming edges.
+  // The set is dynamic and changes after every update. Initially all blocks
+  // with a positive frequency are active
+  auto IsActive = std::vector<bool>(Freq.size(), false);
+  std::queue<size_t> ActiveSet;
+  for (size_t I = 0; I < Freq.size(); I++) {
+    if (Freq[I] > 0) {
+      ActiveSet.push(I);
+      IsActive[I] = true;
+    }
+  }
+
+  // Iterate over the blocks propagating frequencies
+  size_t It = 0;
+  while (It++ < MaxIterations && !ActiveSet.empty()) {
+    size_t I = ActiveSet.front();
+    ActiveSet.pop();
+    IsActive[I] = false;
+
+    // Compute a new frequency for the block: NewFreq := Freq \times ProbMatrix.
+    // A special care is taken for self-edges that needs to be scaled by
+    // (1.0 - SelfProb), where SelfProb is the sum of probabilities on the edges
+    Scaled64 NewFreq;
+    Scaled64 OneMinusSelfProb = Scaled64::getOne();
+    for (auto &Jump : ProbMatrix[I]) {
+      if (Jump.first == I) {
+        OneMinusSelfProb -= Jump.second;
+      } else {
+        NewFreq += Freq[Jump.first] * Jump.second;
+      }
+    }
+    if (OneMinusSelfProb != Scaled64::getOne())
+      NewFreq /= OneMinusSelfProb;
+
+    // If the block's frequency has changed enough, then
+    // make sure the block and its successors are in the active set
+    auto Change = Freq[I] >= NewFreq ? Freq[I] - NewFreq : NewFreq - Freq[I];
+    if (Change > Precision) {
+      ActiveSet.push(I);
+      IsActive[I] = true;
+      for (size_t Succ : Successors[I]) {
+        if (!IsActive[Succ]) {
+          ActiveSet.push(Succ);
+          IsActive[Succ] = true;
+        }
+      }
+    }
+
+    // Update the frequency for the block
+    Freq[I] = NewFreq;
+  }
+
+  LLVM_DEBUG(dbgs() << "  Completed " << It << " inference iterations"
+                    << format(" (%0.0f per block)", double(It) / Freq.size())
+                    << "\n");
+#ifndef NDEBUG
+  LLVM_DEBUG(dbgs() << "  Final   discrepancy = "
+                    << discrepancy(ProbMatrix, Freq).toString() << "\n");
+#endif
+}
+
+template <class BT>
+void BlockFrequencyInfoImpl<BT>::findReachableBlocks(
+    std::vector<const BlockT *> &Blocks) const {
+  // Find all blocks to apply inference on, that is, reachable from the entry
+  // along edges with non-zero probablities
+  std::queue<const BlockT *> Queue;
+  std::unordered_set<const BlockT *> Reachable;
+  const BlockT *Entry = &F->front();
+  Queue.push(Entry);
+  Reachable.insert(Entry);
+  while (!Queue.empty()) {
+    const BlockT *SrcBB = Queue.front();
+    Queue.pop();
+    for (const BlockT *DstBB : children<const BlockT *>(SrcBB)) {
+      auto EP = BPI->getEdgeProbability(SrcBB, DstBB);
+      if (EP.isZero())
+        continue;
+      if (Reachable.find(DstBB) == Reachable.end()) {
+        Queue.push(DstBB);
+        Reachable.insert(DstBB);
+      }
+    }
+  }
+
+  // Find all blocks to apply inference on, that is, backward reachable from
+  // the entry along (backward) edges with non-zero probablities
+  std::unordered_set<const BlockT *> InverseReachable;
+  for (const BlockT &BB : *F) {
+    // An exit block is a block without any successors
+    bool HasSucc = GraphTraits<const BlockT *>::child_begin(&BB) !=
+                   GraphTraits<const BlockT *>::child_end(&BB);
+    if (!HasSucc && Reachable.count(&BB)) {
+      Queue.push(&BB);
+      InverseReachable.insert(&BB);
+    }
+  }
+  while (!Queue.empty()) {
+    const BlockT *SrcBB = Queue.front();
+    Queue.pop();
+    for (const BlockT *DstBB : children<Inverse<const BlockT *>>(SrcBB)) {
+      auto EP = BPI->getEdgeProbability(DstBB, SrcBB);
+      if (EP.isZero())
+        continue;
+      if (InverseReachable.find(DstBB) == InverseReachable.end()) {
+        Queue.push(DstBB);
+        InverseReachable.insert(DstBB);
+      }
+    }
+  }
+
+  // Collect the result
+  Blocks.reserve(F->size());
+  for (const BlockT &BB : *F) {
+    if (Reachable.count(&BB) && InverseReachable.count(&BB)) {
+      Blocks.push_back(&BB);
+    }
+  }
+}
+
+template <class BT>
+void BlockFrequencyInfoImpl<BT>::initTransitionProbabilities(
+    const std::vector<const BlockT *> &Blocks,
+    const DenseMap<const BlockT *, size_t> &BlockIndex,
+    ProbMatrixType &ProbMatrix) const {
+  const size_t NumBlocks = Blocks.size();
+  auto Succs = std::vector<std::vector<std::pair<size_t, Scaled64>>>(NumBlocks);
+  auto SumProb = std::vector<Scaled64>(NumBlocks);
+
+  // Find unique successors and corresponding probabilities for every block
+  for (size_t Src = 0; Src < NumBlocks; Src++) {
+    const BlockT *BB = Blocks[Src];
+    std::unordered_set<const BlockT *> UniqueSuccs;
+    for (const auto SI : children<const BlockT *>(BB)) {
+      // Ignore cold blocks
+      if (BlockIndex.find(SI) == BlockIndex.end())
+        continue;
+      // Ignore parallel edges between BB and SI blocks
+      if (UniqueSuccs.find(SI) != UniqueSuccs.end())
+        continue;
+      UniqueSuccs.insert(SI);
+      // Ignore jumps with zero probability
+      auto EP = BPI->getEdgeProbability(BB, SI);
+      if (EP.isZero())
+        continue;
+
+      auto EdgeProb =
+          Scaled64::getFraction(EP.getNumerator(), EP.getDenominator());
+      size_t Dst = BlockIndex.find(SI)->second;
+      Succs[Src].push_back(std::make_pair(Dst, EdgeProb));
+      SumProb[Src] += EdgeProb;
+    }
+  }
+
+  // Add transitions for every jump with positive branch probability
+  ProbMatrix = ProbMatrixType(NumBlocks);
+  for (size_t Src = 0; Src < NumBlocks; Src++) {
+    // Ignore blocks w/o successors
+    if (Succs[Src].empty())
+      continue;
+
+    assert(!SumProb[Src].isZero() && "Zero sum probability of non-exit block");
+    for (auto &Jump : Succs[Src]) {
+      size_t Dst = Jump.first;
+      Scaled64 Prob = Jump.second;
+      ProbMatrix[Dst].push_back(std::make_pair(Src, Prob / SumProb[Src]));
+    }
+  }
+
+  // Add transitions from sinks to the source
+  size_t EntryIdx = BlockIndex.find(&F->front())->second;
+  for (size_t Src = 0; Src < NumBlocks; Src++) {
+    if (Succs[Src].empty()) {
+      ProbMatrix[EntryIdx].push_back(std::make_pair(Src, Scaled64::getOne()));
+    }
+  }
+}
+
+#ifndef NDEBUG
+template <class BT>
+BlockFrequencyInfoImplBase::Scaled64 BlockFrequencyInfoImpl<BT>::discrepancy(
+    const ProbMatrixType &ProbMatrix, const std::vector<Scaled64> &Freq) const {
+  assert(Freq[0] > 0 && "Incorrectly computed frequency of the entry block");
+  Scaled64 Discrepancy;
+  for (size_t I = 0; I < ProbMatrix.size(); I++) {
+    Scaled64 Sum;
+    for (const auto &Jump : ProbMatrix[I]) {
+      Sum += Freq[Jump.first] * Jump.second;
+    }
+    Discrepancy += Freq[I] >= Sum ? Freq[I] - Sum : Sum - Freq[I];
+  }
+  // Normalizing by the frequency of the entry block
+  return Discrepancy / Freq[0];
+}
+#endif
+
 /// \note This should be a lambda, but that crashes GCC 4.7.
 namespace bfi_detail {
 

diff  --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 15503c5117e58..037f0091f0848 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -46,6 +46,20 @@ cl::opt<bool> CheckBFIUnknownBlockQueries(
     cl::init(false), cl::Hidden,
     cl::desc("Check if block frequency is queried for an unknown block "
              "for debugging missed BFI updates"));
+
+cl::opt<bool> UseIterativeBFIInference(
+    "use-iterative-bfi-inference", cl::init(false), cl::Hidden,
+    cl::desc("Apply an iterative post-processing to infer correct BFI counts"));
+
+cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock(
+    "iterative-bfi-max-iterations-per-block", cl::init(1000), cl::Hidden,
+    cl::desc("Iterative inference: maximum number of update iterations "
+             "per block"));
+
+cl::opt<double> IterativeBFIPrecision(
+    "iterative-bfi-precision", cl::init(1e-12), cl::Hidden,
+    cl::desc("Iterative inference: delta convergence precision; smaller values "
+             "typically lead to better results at the cost of worsen runtime"));
 }
 
 ScaledNumber<uint64_t> BlockMass::toScaled() const {

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-correlation-irreducible-loops.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-correlation-irreducible-loops.prof
new file mode 100644
index 0000000000000..cd0eb4193f1a1
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-correlation-irreducible-loops.prof
@@ -0,0 +1,19 @@
+yyparse_1:10822:0
+ 1: 1
+ 2: 1003
+ 3: 1002
+ 4: 1002
+ 5: 1
+ 6: 0
+ 7: 1
+ 8: 0
+ 9: 1
+ !CFGChecksum: 158496288380146391
+
+foo1:2297361:0
+ 1: 1
+ 2: 86
+ 3: 8212
+ 4: 1
+ 5: 17747
+ !CFGChecksum: 404850113186107133

diff  --git a/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
new file mode 100644
index 0000000000000..3db1804e5b0cc
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
@@ -0,0 +1,187 @@
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-correlation-irreducible-loops.prof | opt -analyze -block-freq -enable-new-pm=0  -use-iterative-bfi-inference | FileCheck %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-correlation-irreducible-loops.prof -S | FileCheck %s --check-prefix=CHECK2
+; RUN: opt < %s -analyze -block-freq -enable-new-pm=0  -use-iterative-bfi-inference | FileCheck %s --check-prefix=CHECK3
+
+; The C++ code for this test case is from c-parse.c in 403.gcc (SPEC2006)
+; The problem with BFI for the test is solved by applying iterative inference.
+; The corresponding CFG graph is shown below, with intended counts for every
+; basic block. The hot loop, b3->b4->b2, is not getting proper (large) counts
+; unless the -use-iterative-bfi-inference option is specified.
+;
+;   +-------------------------------------------+
+;   |                                           |
+;   |                   +----------+            |
+;   |                   |  b1 [1]  |            |
+;   |                   +----------+            |
+;   |                     |                     |
+;   |                     |                     |
+;   |                     v                     |
+;   |                   +----------+            |
+;   |    +------------> | b2 [625] | -+         |
+;   |    |              +----------+  |         |
+;   |    |                |           |         |
+;   |    |                |           |         |
+;   |    |                v           |         |
+;   |  +----------+     +----------+  |         |
+;   |  | b4 [624] | <-- | b3 [625] | <+---------+
+;   |  +----------+     +----------+  |
+;   |                     |           |
+;   +----+                |           |
+;        |                v           v
+;      +----------+     +--------------------+
+;      |  b8 [1]  | <-- |       b7 [2]       |
+;      +----------+     +--------------------+
+;                         |           ^
+;                         |           |
+;                         v           |
+;      +----------+     +----------+  |
+;      |  b9 [1]  | <-- |  b5 [2]  |  |
+;      +----------+     +----------+  |
+;                         |           |
+;                         |           |
+;                         v           |
+;                       +----------+  |
+;                       |  b6 [1]  | -+
+;                       +----------+
+
+ at yydebug = dso_local global i32 0, align 4
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @yyparse_1() #0 {
+b1:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 1, i32 0, i64 -1)
+  %0 = load i32, i32* @yydebug, align 4
+  %cmp = icmp ne i32 %0, 0
+  br label %b2
+; CHECK: - b1: float = {{.*}}, int = {{.*}}, count = 1
+
+b2:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 2, i32 0, i64 -1)
+  br i1 %cmp, label %b7, label %b3
+; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 625
+
+b3:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b7, label %b4
+; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 625
+; CHECK2: br i1 %cmp, label %b7, label %b4,
+; CHECK2-SAME: !prof ![[END172_PROF:[0-9]+]]
+
+b4:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 4, i32 0, i64 -1)
+  br label %b2
+; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 624
+
+b5:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 5, i32 0, i64 -1)
+  br i1 %cmp, label %b9, label %b6
+; CHECK: - b5: float = {{.*}}, int = {{.*}}, count = 2
+
+b6:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 6, i32 0, i64 -1)
+  br label %b7
+; CHECK: - b6: float = {{.*}}, int = {{.*}}, count = 1
+
+b7:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 7, i32 0, i64 -1)
+  br i1 %cmp, label %b5, label %b8
+; CHECK: - b7: float = {{.*}}, int = {{.*}}, count = 2
+; CHECK2: br i1 %cmp, label %b5, label %b8,
+; CHECK2-SAME: !prof ![[FALSE4858_PROF:[0-9]+]]
+
+b8:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 8, i32 0, i64 -1)
+  br label %b3
+; CHECK: - b8: float = {{.*}}, int = {{.*}}, count = 1
+
+b9:
+  call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 9, i32 0, i64 -1)
+  %1 = load i32, i32* @yydebug, align 4
+  ret i32 %1
+; CHECK: - b9: float = {{.*}}, int = {{.*}}, count = 1
+
+}
+
+; Another 
diff icult (for BFI) instance with irreducible loops,
+; containing 'indirectbr'. The corresponding CFG graph is shown below, with
+; intended counts for every basic block.
+;
+;      +-----------+
+;      |  b1 [1]   |
+;      +-----------+
+;        |
+;        |
+;        v
+;      +------------------------+
+;   +- |        b2 [86]         | <+
+;   |  +------------------------+  |
+;   |    |            |            |
+;   |    |            |            |
+;   |    v            |            |
+;   |  +-----------+  |            |
+;   |  | b3 [8212] | <+-------+    |
+;   |  +-----------+  |       |    |
+;   |    |            |       |    |
+;   |    |            |       |    |
+;   |    v            v       |    |
+;   |  +------------------------+  |
+;   |  |  indirectgoto [17747]  | -+
+;   |  +------------------------+
+;   |    |            ^  |
+;   |    |            +--+
+;   |    v
+;   |  +-----------+
+;   +> |  b4 [1]   |
+;      +-----------+
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @foo1() #0 !prof !132 {
+b1:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 1, i32 0, i64 -1)
+  %0 = load i32, i32* @yydebug, align 4
+  %cmp = icmp ne i32 %0, 0
+  br label %b2
+; CHECK3: - b1: float = {{.*}}, int = {{.*}}, count = 1
+
+b2:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 2, i32 0, i64 -1)
+  %1 = load i32, i32* @yydebug, align 4
+  switch i32 %1, label %b4 [
+    i32 1, label %indirectgoto
+    i32 2, label %b3
+  ], !prof !133
+; CHECK3: - b2: float = {{.*}}, int = {{.*}}, count = 86
+
+b3:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 3, i32 0, i64 -1)
+  br label %indirectgoto
+; CHECK3: - b3: float = {{.*}}, int = {{.*}}, count = 8212
+
+b4:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 4, i32 0, i64 -1)
+  %2 = load i32, i32* @yydebug, align 4
+  ret i32 %2
+; CHECK3: - b4: float = {{.*}}, int = {{.*}}, count = 1
+
+indirectgoto:
+  %indirect.goto.dest = alloca i8, align 4
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 5, i32 0, i64 -1)
+  indirectbr i8* %indirect.goto.dest, [label %b2, label %indirectgoto, label %b4, label %b3], !prof !134
+; CHECK3: - indirectgoto: float = {{.*}}, int = {{.*}}, count = 17747
+
+}
+
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
+attributes #1 = { nounwind }
+
+!llvm.pseudo_probe_desc = !{!1079, !4496}
+!1079 = !{i64 -7702751003264189226, i64 158496288380146391, !"yyparse_1", null}
+!4496 = !{i64 7682762345278052905, i64 404850113186107133, !"foo1", null}
+!132 = !{!"function_entry_count", i64 1}
+!133 = !{!"branch_weights", i32 0, i32 86, i32 0}
+!134 = !{!"branch_weights", i32 85, i32 9449, i32 1, i32 8212}
+
+; CHECK2: ![[END172_PROF]] = !{!"branch_weights", i32 1, i32 1003}
+; CHECK2: ![[FALSE4858_PROF]] = !{!"branch_weights", i32 2, i32 1}