[llvm] 13d1364 - A better profi rebalancer

Tue Jan 18 12:20:39 PST 2022

Author: spupyrev
Date: 2022-01-18T12:14:24-08:00
New Revision: 13d1364a344328145b4bc9449fa6aae34f300886

URL: https://github.com/llvm/llvm-project/commit/13d1364a344328145b4bc9449fa6aae34f300886
DIFF: https://github.com/llvm/llvm-project/commit/13d1364a344328145b4bc9449fa6aae34f300886.diff

LOG: A better profi rebalancer

This is an extension of **profi** post-processing step that rebalances counts
in CFGs that have basic blocks w/o probes (aka "unknown" blocks). Specifically,
the new version finds many more "unknown" subgraphs and marks more "unknown"
basic blocks as hot (which prevents unwanted optimization passes).

I see up to 0.5% perf on some (large) binaries, e.g., clang-10 and gcc-8.

The algorithm is still linear and yields no build time overhead.

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance-large.prof
    llvm/test/Transforms/SampleProfile/profile-inference-rebalance-large.ll

Modified: 
    llvm/lib/Transforms/Utils/SampleProfileInference.cpp
    llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
index 1d5582980af50..961adf2570a73 100644

--- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -145,7 +145,7 @@ class MinCostMaxFlow {
   /// A cost of decreasing the entry block's count by one.
   static constexpr int64_t AuxCostDecEntry = 10;
   /// A cost of taking an unlikely jump.
-  static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;
+  static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30;
 
 private:
   /// Check for existence of an augmenting path with a positive capacity.
@@ -237,7 +237,7 @@ class MinCostMaxFlow {
     }
   }
 
-  /// An node in a flow network.
+  /// A node in a flow network.
   struct Node {
     /// The cost of the cheapest path from the source to the current node.
     int64_t Distance;
@@ -304,9 +304,6 @@ class FlowAdjuster {
     rebalanceUnknownSubgraphs();
   }
 
-  /// The probability for the first successor of a unknown subgraph
-  static constexpr double UnknownFirstSuccProbability = 0.5;
-
 private:
   void joinIsolatedComponents() {
     // Find blocks that are reachable from the source
@@ -453,45 +450,70 @@ class FlowAdjuster {
 
   uint64_t NumBlocks() const { return Func.Blocks.size(); }
 
-  /// Rebalance unknown subgraphs so as each branch splits with probabilities
-  /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability
+  /// Rebalance unknown subgraphs so that the flow is split evenly across the
+  /// outgoing branches of every block of the subgraph. The method iterates over
+  /// blocks with known weight and identifies unknown subgraphs rooted at the
+  /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
   void rebalanceUnknownSubgraphs() {
-    static_assert(
-        UnknownFirstSuccProbability >= 0.0 &&
-            UnknownFirstSuccProbability <= 1.0,
-        "the share of the unknown successor should be between 0 and 1");
-    // Try to find unknown subgraphs from each non-unknown block
+    // Try to find unknown subgraphs from each block
     for (uint64_t I = 0; I < Func.Blocks.size(); I++) {
       auto SrcBlock = &Func.Blocks[I];
-      // Do not attempt to find unknown successors from a unknown or a
-      // zero-flow block
-      if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+      // Verify if rebalancing rooted at SrcBlock is feasible
+      if (!canRebalanceAtRoot(SrcBlock))
         continue;
 
-      std::vector<FlowBlock *> UnknownSuccs;
+      // Find an unknown subgraphs starting at SrcBlock. Along the way,
+      // fill in known destinations and intermediate unknown blocks.
+      std::vector<FlowBlock *> UnknownBlocks;
+      std::vector<FlowBlock *> KnownDstBlocks;
+      findUnknownSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks);
+
+      // Verify if rebalancing of the subgraph is feasible. If the search is
+      // successful, find the unique destination block (which can be null)
       FlowBlock *DstBlock = nullptr;
-      // Find a unknown subgraphs starting at block SrcBlock
-      if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+      if (!canRebalanceSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks,
+                                DstBlock))
         continue;
-      // At the moment, we do not rebalance subgraphs containing cycles among
-      // unknown blocks
-      if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+
+      // We cannot rebalance subgraphs containing cycles among unknown blocks
+      if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownBlocks))
         continue;
 
       // Rebalance the flow
-      rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs);
+      rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownBlocks);
     }
   }
 
-  /// Find a unknown subgraph starting at block SrcBlock.
-  /// If the search is successful, the method sets DstBlock and UnknownSuccs.
-  bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock,
-                           std::vector<FlowBlock *> &UnknownSuccs) {
+  /// Verify if rebalancing rooted at a given block is possible.
+  bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
+    // Do not attempt to find unknown subgraphs from an unknown or a
+    // zero-flow block
+    if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+      return false;
+
+    // Do not attempt to process subgraphs from a block w/o unknown sucessors
+    bool HasUnknownSuccs = false;
+    for (auto Jump : SrcBlock->SuccJumps) {
+      if (Func.Blocks[Jump->Target].UnknownWeight) {
+        HasUnknownSuccs = true;
+        break;
+      }
+    }
+    if (!HasUnknownSuccs)
+      return false;
+
+    return true;
+  }
+
+  /// Find an unknown subgraph starting at block SrcBlock. The method sets
+  /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
+  void findUnknownSubgraph(const FlowBlock *SrcBlock,
+                           std::vector<FlowBlock *> &KnownDstBlocks,
+                           std::vector<FlowBlock *> &UnknownBlocks) {
     // Run BFS from SrcBlock and make sure all paths are going through unknown
     // blocks and end at a non-unknown DstBlock
     auto Visited = BitVector(NumBlocks(), false);
     std::queue<uint64_t> Queue;
-    DstBlock = nullptr;
 
     Queue.push(SrcBlock->Index);
     Visited[SrcBlock->Index] = true;
@@ -500,52 +522,105 @@ class FlowAdjuster {
       Queue.pop();
       // Process blocks reachable from Block
       for (auto Jump : Block.SuccJumps) {
+        // If Jump can be ignored, skip it
+        if (ignoreJump(SrcBlock, nullptr, Jump))
+          continue;
+
         uint64_t Dst = Jump->Target;
+        // If Dst has been visited, skip Jump
         if (Visited[Dst])
           continue;
+        // Process block Dst
         Visited[Dst] = true;
         if (!Func.Blocks[Dst].UnknownWeight) {
-          // If we see non-unique non-unknown block reachable from SrcBlock,
-          // stop processing and skip rebalancing
-          FlowBlock *CandidateDstBlock = &Func.Blocks[Dst];
-          if (DstBlock != nullptr && DstBlock != CandidateDstBlock)
-            return false;
-          DstBlock = CandidateDstBlock;
+          KnownDstBlocks.push_back(&Func.Blocks[Dst]);
         } else {
           Queue.push(Dst);
-          UnknownSuccs.push_back(&Func.Blocks[Dst]);
+          UnknownBlocks.push_back(&Func.Blocks[Dst]);
         }
       }
     }
+  }
 
+  /// Verify if rebalancing of the subgraph is feasible. If the checks are
+  /// successful, set the unique destination block, DstBlock (can be null).
+  bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
+                            const std::vector<FlowBlock *> &KnownDstBlocks,
+                            const std::vector<FlowBlock *> &UnknownBlocks,
+                            FlowBlock *&DstBlock) {
     // If the list of unknown blocks is empty, we don't need rebalancing
-    if (UnknownSuccs.empty())
+    if (UnknownBlocks.empty())
       return false;
-    // If all reachable nodes from SrcBlock are unknown, skip rebalancing
-    if (DstBlock == nullptr)
+
+    // If there are multiple known sinks, we can't rebalance
+    if (KnownDstBlocks.size() > 1)
       return false;
-    // If any of the unknown blocks is an exit block, skip rebalancing
-    for (auto Block : UnknownSuccs) {
-      if (Block->isExit())
+    DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();
+
+    // Verify sinks of the subgraph
+    for (auto Block : UnknownBlocks) {
+      if (Block->SuccJumps.empty()) {
+        // If there are multiple (known and unknown) sinks, we can't rebalance
+        if (DstBlock != nullptr)
+          return false;
+        continue;
+      }
+      size_t NumIgnoredJumps = 0;
+      for (auto Jump : Block->SuccJumps) {
+        if (ignoreJump(SrcBlock, DstBlock, Jump))
+          NumIgnoredJumps++;
+      }
+      // If there is a non-sink block in UnknownBlocks with all jumps ignored,
+      // then we can't rebalance
+      if (NumIgnoredJumps == Block->SuccJumps.size())
         return false;
     }
 
     return true;
   }
 
+  /// Decide whether the Jump is ignored while processing an unknown subgraphs
+  /// rooted at basic block SrcBlock with the destination block, DstBlock.
+  bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+                  const FlowJump *Jump) {
+    // Ignore unlikely jumps with zero flow
+    if (Jump->IsUnlikely && Jump->Flow == 0)
+      return true;
+
+    auto JumpSource = &Func.Blocks[Jump->Source];
+    auto JumpTarget = &Func.Blocks[Jump->Target];
+
+    // Do not ignore jumps coming into DstBlock
+    if (DstBlock != nullptr && JumpTarget == DstBlock)
+      return false;
+
+    // Ignore jumps out of SrcBlock to known blocks
+    if (!JumpTarget->UnknownWeight && JumpSource == SrcBlock)
+      return true;
+
+    // Ignore jumps to known blocks with zero flow
+    if (!JumpTarget->UnknownWeight && JumpTarget->Flow == 0)
+      return true;
+
+    return false;
+  }
+
   /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
-  /// UnknownSuccs in the topological order (so that all jumps are "forward").
-  bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
-                         std::vector<FlowBlock *> &UnknownSuccs) {
+  /// UnknownBlocks in the topological order (so that all jumps are "forward").
+  bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+                         std::vector<FlowBlock *> &UnknownBlocks) {
     // Extract local in-degrees in the considered subgraph
     auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
-    for (auto Jump : SrcBlock->SuccJumps) {
-      LocalInDegree[Jump->Target]++;
-    }
-    for (uint64_t I = 0; I < UnknownSuccs.size(); I++) {
-      for (auto Jump : UnknownSuccs[I]->SuccJumps) {
+    auto fillInDegree = [&](const FlowBlock *Block) {
+      for (auto Jump : Block->SuccJumps) {
+        if (ignoreJump(SrcBlock, DstBlock, Jump))
+          continue;
         LocalInDegree[Jump->Target]++;
       }
+    };
+    fillInDegree(SrcBlock);
+    for (auto Block : UnknownBlocks) {
+      fillInDegree(Block);
     }
     // A loop containing SrcBlock
     if (LocalInDegree[SrcBlock->Index] > 0)
@@ -555,15 +630,20 @@ class FlowAdjuster {
     std::queue<uint64_t> Queue;
     Queue.push(SrcBlock->Index);
     while (!Queue.empty()) {
-      auto &Block = Func.Blocks[Queue.front()];
+      FlowBlock *Block = &Func.Blocks[Queue.front()];
       Queue.pop();
-      // Stop propagation once we reach DstBlock
-      if (Block.Index == DstBlock->Index)
+      // Stop propagation once we reach DstBlock, if any
+      if (DstBlock != nullptr && Block == DstBlock)
         break;
 
-      AcyclicOrder.push_back(&Block);
+      // Keep an acyclic order of unknown blocks
+      if (Block->UnknownWeight && Block != SrcBlock)
+        AcyclicOrder.push_back(Block);
+
       // Add to the queue all successors with zero local in-degree
-      for (auto Jump : Block.SuccJumps) {
+      for (auto Jump : Block->SuccJumps) {
+        if (ignoreJump(SrcBlock, DstBlock, Jump))
+          continue;
         uint64_t Dst = Jump->Target;
         LocalInDegree[Dst]--;
         if (LocalInDegree[Dst] == 0) {
@@ -574,42 +654,69 @@ class FlowAdjuster {
 
     // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
     // of all blocks
-    if (UnknownSuccs.size() + 1 != AcyclicOrder.size())
+    if (UnknownBlocks.size() != AcyclicOrder.size())
       return false;
-    UnknownSuccs = AcyclicOrder;
+    UnknownBlocks = AcyclicOrder;
     return true;
   }
 
-  /// Rebalance a given subgraph.
-  void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
-                                std::vector<FlowBlock *> &UnknownSuccs) {
+  /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
+  /// having UnknownBlocks intermediate blocks.
+  void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
+                                const FlowBlock *DstBlock,
+                                const std::vector<FlowBlock *> &UnknownBlocks) {
     assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
-    assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
 
-    for (auto Block : UnknownSuccs) {
+    // Ditribute flow from the source block
+    uint64_t BlockFlow = 0;
+    // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
+    for (auto Jump : SrcBlock->SuccJumps) {
+      if (ignoreJump(SrcBlock, DstBlock, Jump))
+        continue;
+      BlockFlow += Jump->Flow;
+    }
+    rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
+
+    // Ditribute flow from the remaining blocks
+    for (auto Block : UnknownBlocks) {
+      assert(Block->UnknownWeight && "incorrect unknown subgraph");
+      uint64_t BlockFlow = 0;
       // Block's flow is the sum of incoming flows
-      uint64_t TotalFlow = 0;
-      if (Block == SrcBlock) {
-        TotalFlow = Block->Flow;
-      } else {
-        for (auto Jump : Block->PredJumps) {
-          TotalFlow += Jump->Flow;
-        }
-        Block->Flow = TotalFlow;
+      for (auto Jump : Block->PredJumps) {
+        BlockFlow += Jump->Flow;
       }
+      Block->Flow = BlockFlow;
+      rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
+    }
+  }
 
-      // Process all successor jumps and update corresponding flow values
-      for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
-        auto Jump = Block->SuccJumps[I];
-        if (I + 1 == Block->SuccJumps.size()) {
-          Jump->Flow = TotalFlow;
-          continue;
-        }
-        uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
-        Jump->Flow = Flow;
-        TotalFlow -= Flow;
-      }
+  /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
+  /// and ending at DstBlock.
+  void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+                      const FlowBlock *Block, uint64_t BlockFlow) {
+    // Process all successor jumps and update corresponding flow values
+    size_t BlockDegree = 0;
+    for (auto Jump : Block->SuccJumps) {
+      if (ignoreJump(SrcBlock, DstBlock, Jump))
+        continue;
+      BlockDegree++;
+    }
+    // If all successor jumps of the block are ignored, skip it
+    if (DstBlock == nullptr && BlockDegree == 0)
+      return;
+    assert(BlockDegree > 0 && "all outgoing jumps are ignored");
+
+    // Each of the Block's successors gets the following amount of flow.
+    // Rounding the value up so that all flow is propagated
+    uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
+    for (auto Jump : Block->SuccJumps) {
+      if (ignoreJump(SrcBlock, DstBlock, Jump))
+        continue;
+      uint64_t Flow = std::min(SuccFlow, BlockFlow);
+      Jump->Flow = Flow;
+      BlockFlow -= Flow;
     }
+    assert(BlockFlow == 0 && "not all flow is propagated");
   }
 
   /// A constant indicating an arbitrary exit block of a function.

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance-large.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance-large.prof
new file mode 100644
index 0000000000000..6dcd3fe241b77
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance-large.prof
@@ -0,0 +1,26 @@
+foo1:37078302:0
+ 1: 3300
+ 2: 0
+ 6: 3300
+ !CFGChecksum: 157181141624
+
+foo2:37078302:0
+ 1: 128
+ 2: 128
+ 3: 128
+ 4: 128
+ !CFGChecksum: 208782362068
+
+foo3:37078302:0
+ 1: 500
+ 2: 1500
+ 4: 1200
+ 6: 900
+ 9: 500
+ !CFGChecksum: 189901498683
+
+foo4:37078302:0
+ 1: 400
+ 3: 400
+ 10: 400
+ !CFGChecksum: 241030178952

diff  --git a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance-large.ll b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance-large.ll
new file mode 100644
index 0000000000000..cf633abefdef8
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance-large.ll
@@ -0,0 +1,387 @@
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance-large.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance-large.prof | opt -analyze -block-freq  -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
+
+; The test verifies that counts can rebalanced in switch statements that contain
+; both 'known' and 'unknown' basic blocks.
+;
+;                      +---------+
+;   +----------------- | b15 [?] |
+;   |                  +---------+
+;   |                    ^
+;   |                    |
+;   |                    |
+;   |  +---------+     +--------------+     +---------+
+;   |  | b13 [?] | <-- |  b11 [3300]  | --> | b14 [?] |
+;   |  +---------+     +--------------+     +---------+
+;   |    |               |          |         |
+;   |    |               |          |         |
+;   |    |               v          |         |
+;   |    |             +---------+  |         |
+;   |    |             | b12 [0] |  |         |
+;   |    |             +---------+  |         |
+;   |    |               |          |         |
+;   |    |               |          |         |
+;   |    |               v          v         |
+;   |    |             +--------------+       |
+;   |    +-----------> |              | <-----+
+;   |                  |  b16 [3300]  |
+;   +----------------> |              |
+;                      +--------------+
+
+ at yydebug = dso_local global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @foo1(i32 %0, i32 %1) #0 {
+b11:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  switch i32 %1, label %b12 [
+    i32 1, label %b13
+    i32 2, label %b14
+    i32 3, label %b15
+    i32 4, label %b16
+  ]
+; CHECK:  edge b11 -> b12 probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK:  edge b11 -> b13 probability is 0x20000000 / 0x80000000 = 25.00%
+; CHECK:  edge b11 -> b14 probability is 0x20000000 / 0x80000000 = 25.00%
+; CHECK:  edge b11 -> b15 probability is 0x20000000 / 0x80000000 = 25.00%
+; CHECK:  edge b11 -> b16 probability is 0x20000000 / 0x80000000 = 25.00%
+; CHECK2: - b11: float = {{.*}}, int = {{.*}}, count = 3300
+
+b12:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 2, i32 0, i64 -1)
+  br label %b16
+; CHECK2: - b12: float = {{.*}}, int = {{.*}}, count = 0
+
+b13:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 3, i32 0, i64 -1)
+  br label %b16
+; CHECK2: - b13: float = {{.*}}, int = {{.*}}, count = 825
+
+b14:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 4, i32 0, i64 -1)
+  br label %b16
+; CHECK2: - b14: float = {{.*}}, int = {{.*}}, count = 825
+
+b15:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 5, i32 0, i64 -1)
+  br label %b16
+; CHECK2: - b15: float = {{.*}}, int = {{.*}}, count = 825
+
+b16:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 6, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b16: float = {{.*}}, int = {{.*}}, count = 3300
+}
+
+
+; The test verifies that counts can rebalanced even when control-flow ends at
+; a basic block with an unknown count.
+;
+;                 +-----------+
+;                 | b21 [128] | -+
+;                 +-----------+  |
+;                   |            |
+;                   v            |
+;                 +-----------+  |
+;                 | b22 [128] |  |
+;                 +-----------+  |
+;                   |            |
+;                   v            |
+;                 +-----------+  |
+;   +------------ | b23 [128] | <+
+;   |             +-----------+
+;   |               |
+;   v               v
+; +---------+     +-----------+
+; | b26 [?] | <-- | b24 [128] |
+; +---------+     +-----------+
+;   |               |
+;   |               v
+;   |             +-----------+
+;   |             |  b25 [?]  |
+;   |             +-----------+
+;   |               |
+;   |               v
+;   |             +-----------+
+;   +-----------> |  b27 [?]  | -+
+;                 +-----------+  |
+;                   |            |
+;                   v            |
+;                 +-----------+  |
+;                 |  b28 [?]  |  |
+;                 +-----------+  |
+;                   |            |
+;                   v            |
+;                 +-----------+  |
+;                 |  b29 [?]  | <+
+;                 +-----------+
+
+define dso_local i32 @foo2(i32 %0, i32 %1) #0 {
+b21:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b22, label %b23
+; CHECK:  edge b21 -> b22 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+; CHECK:  edge b21 -> b23 probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK2: - b21: float = {{.*}}, int = {{.*}}, count = 128
+
+b22:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 2, i32 0, i64 -1)
+  br label %b23
+
+b23:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b24, label %b26
+; CHECK:  edge b23 -> b24 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+; CHECK:  edge b23 -> b26 probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK2: - b23: float = {{.*}}, int = {{.*}}, count = 128
+
+b24:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 4, i32 0, i64 -1)
+  br i1 %cmp, label %b25, label %b26
+; CHECK:  edge b24 -> b25 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b24 -> b26 probability is 0x40000000 / 0x80000000 = 50.00%
+
+b25:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 5, i32 0, i64 -1)
+  br label %b27
+; CHECK2: - b25: float = {{.*}}, int = {{.*}}, count = 64
+
+b26:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 6, i32 0, i64 -1)
+  br label %b27
+; CHECK2: - b26: float = {{.*}}, int = {{.*}}, count = 64
+
+b27:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 7, i32 0, i64 -1)
+  br i1 %cmp, label %b28, label %b29
+; CHECK:  edge b27 -> b28 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b27 -> b29 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b27: float = {{.*}}, int = {{.*}}, count = 128
+
+b28:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 8, i32 0, i64 -1)
+  br label %b29
+; CHECK2: - b28: float = {{.*}}, int = {{.*}}, count = 64
+
+b29:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 9, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b29: float = {{.*}}, int = {{.*}}, count = 128
+}
+
+
+; The test verifies a flexible mode of rebalancing in which some jumps to known
+; basic blocks are ignored.
+;
+;                 +------------+
+;                 | b31 [500]  |
+;                 +------------+
+;                   |
+;                   v
+; +---------+     +------------+
+; | b33 [?] | <-- | b32 [1500] | <-----+
+; +---------+     +------------+       |
+;   |               |                  |
+;   |               v                  |
+;   |             +------------+     +-----------+
+;   |             | b34 [1200] | --> | b36 [900] |
+;   |             +------------+     +-----------+
+;   |               |
+;   |               v
+;   |             +------------+
+;   |             |  b35 [?]   |
+;   |             +------------+
+;   |               |
+;   |               v
+;   |             +------------+
+;   +-----------> |  b37 [?]   | -+
+;                 +------------+  |
+;                   |             |
+;                   v             |
+;                 +------------+  |
+;                 |  b38 [?]   |  |
+;                 +------------+  |
+;                   |             |
+;                   v             |
+;                 +------------+  |
+;                 | b39 [500]  | <+
+;                 +------------+
+;
+
+define dso_local i32 @foo3(i32 %0, i32 %1) #0 {
+b31:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br label %b32
+; CHECK2: - b31: float = {{.*}}, int = {{.*}}, count = 500
+
+b32:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 2, i32 0, i64 -1)
+  br i1 %cmp, label %b33, label %b34
+; CHECK:  edge b32 -> b33 probability is 0x1999999a / 0x80000000 = 20.00%
+; CHECK:  edge b32 -> b34 probability is 0x66666666 / 0x80000000 = 80.00%
+; CHECK2: - b32: float = {{.*}}, int = {{.*}}, count = 1500
+
+b33:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 3, i32 0, i64 -1)
+  br label %b37
+; CHECK2: - b33: float = {{.*}}, int = {{.*}}, count = 300
+
+b34:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 4, i32 0, i64 -1)
+  br i1 %cmp, label %b35, label %b36
+; CHECK:  edge b34 -> b35 probability is 0x15555555 / 0x80000000 = 16.67%
+; CHECK:  edge b34 -> b36 probability is 0x6aaaaaab / 0x80000000 = 83.33% [HOT edge]
+; CHECK2: - b34: float = {{.*}}, int = {{.*}}, count = 1200
+
+b35:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 5, i32 0, i64 -1)
+  br label %b37
+; CHECK2: - b35: float = {{.*}}, int = {{.*}}, count = 200
+
+b36:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 6, i32 0, i64 -1)
+  br label %b32
+; CHECK2: - b36: float = {{.*}}, int = {{.*}}, count = 1000
+
+b37:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 7, i32 0, i64 -1)
+  br i1 %cmp, label %b38, label %b39
+; CHECK:  edge b37 -> b38 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b37 -> b39 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b37: float = {{.*}}, int = {{.*}}, count = 500
+
+b38:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 8, i32 0, i64 -1)
+  br label %b39
+; CHECK2: - b38: float = {{.*}}, int = {{.*}}, count = 250
+
+b39:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 9, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b39: float = {{.*}}, int = {{.*}}, count = 500
+}
+
+
+; The test verifies that flow rebalancer can ignore 'unlikely' jumps.
+;
+;                            +-----------+
+;                            | b41 [400] | -+
+;                            +-----------+  |
+;                              |            |
+;                              |            |
+;                              v            |
+;                            +-----------+  |
+;                            |  b42 [?]  |  |
+;                            +-----------+  |
+;                              |            |
+;                              |            |
+;                              v            v
+; +---------++---------+     +---------------------------+     +---------++---------+
+; | b48 [?] || b46 [?] | <-- |                           | --> | b47 [?] || b49 [?] |
+; +---------++---------+     |                           |     +---------++---------+
+;   |  ^       |             |                           |       |          ^
+;   |  |       |             |         b43 [400]         |       |          |
+;   |  +-------+-------------|                           |       |          |
+;   |          |             |                           |       |          |
+;   |          |             |                           | ------+----------+
+;   |          |             +---------------------------+       |
+;   |          |               |                 |               |
+;   |          |               |                 |               |
+;   |          |               v                 v               |
+;   |          |             +-----------+     +---------+       |
+;   |          |             |  b44 [?]  |     | b45 [?] |       |
+;   |          |             +-----------+     +---------+       |
+;   |          |               |                 |               |
+;   |          |               |                 |               |
+;   |          |               v                 v               |
+;   |          |             +---------------------------+       |
+;   |          +-----------> |                           | <-----+
+;   |                        |        b410 [400]         |
+;   |                        |                           |
+;   +----------------------> |                           |
+;                            +---------------------------+
+
+
+define dso_local void @foo4(i32 %0, i32 %1) #0 {
+b41:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b42, label %b43
+; CHECK:  edge b41 -> b42 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b41 -> b43 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b41: float = {{.*}}, int = {{.*}}, count = 400
+
+b42:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 2, i32 0, i64 -1)
+  br label %b43
+; CHECK2: - b42: float = {{.*}}, int = {{.*}}, count = 200
+
+b43:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 3, i32 0, i64 -1)
+  switch i32 %1, label %b49 [
+    i32 1, label %b44
+    i32 2, label %b45
+    i32 3, label %b46
+    i32 4, label %b47
+    i32 5, label %b48
+  ]
+; CHECK:  edge b43 -> b49 probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK:  edge b43 -> b44 probability is 0x1999999a / 0x80000000 = 20.00%
+; CHECK:  edge b43 -> b45 probability is 0x1999999a / 0x80000000 = 20.00%
+; CHECK:  edge b43 -> b46 probability is 0x1999999a / 0x80000000 = 20.00%
+; CHECK:  edge b43 -> b47 probability is 0x1999999a / 0x80000000 = 20.00%
+; CHECK:  edge b43 -> b48 probability is 0x1999999a / 0x80000000 = 20.00%
+; CHECK2: - b43: float = {{.*}}, int = {{.*}}, count = 400
+
+b44:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 4, i32 0, i64 -1)
+  br label %b410
+; CHECK2: - b44: float = {{.*}}, int = {{.*}}, count = 80
+
+b45:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 5, i32 0, i64 -1)
+  br label %b410
+; CHECK2: - b45: float = {{.*}}, int = {{.*}}, count = 80
+
+b46:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 6, i32 0, i64 -1)
+  br label %b410
+; CHECK2: - b46: float = {{.*}}, int = {{.*}}, count = 80
+
+b47:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 7, i32 0, i64 -1)
+  br label %b410
+; CHECK2: - b47: float = {{.*}}, int = {{.*}}, count = 80
+
+b48:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 8, i32 0, i64 -1)
+  br label %b410
+; CHECK2: - b48: float = {{.*}}, int = {{.*}}, count = 80
+
+b49:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 9, i32 0, i64 -1)
+  unreachable
+; CHECK2: - b49: float = {{.*}}, int = {{.*}}, count = 0
+
+b410:
+  call void @llvm.pseudoprobe(i64 -6882312132165544686, i64 10, i32 0, i64 -1)
+  ret void
+; CHECK2: - b410: float = {{.*}}, int = {{.*}}, count = 400
+}
+
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4
+
+attributes #0 = { noinline nounwind uwtable "use-sample-profile" }
+attributes #4 = { inaccessiblememonly nounwind willreturn }
+
+!llvm.pseudo_probe_desc = !{!7, !8, !9, !10}
+
+!7 = !{i64 7682762345278052905, i64 157181141624, !"foo1", null}
+!8 = !{i64 2494702099028631698, i64 208782362068, !"foo2", null}
+!9 = !{i64 -7908226060800700466, i64 189901498683, !"foo3", null}
+!10 = !{i64 -6882312132165544686, i64 241030178952, !"foo4", null}

diff  --git a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
index 36d2cdb8124cf..9290e17ddbc42 100644
--- a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s
 ; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance.prof | opt -analyze -block-freq  -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
 
-; The test contains a "dimanond" and a "triangle" that needs to be rebalanced
+; The test contains a "diamond" and a "triangle" that needs to be rebalanced
 ; after basic profile inference.
 ;
 ;                  +----------------+