[llvm] f2ade65 - [CSSPGO] Even flow distribution

Wed Mar 2 13:12:24 PST 2022

Author: spupyrev
Date: 2022-03-02T13:12:05-08:00
New Revision: f2ade65fb2a63679d48178608286753616809bc0

URL: https://github.com/llvm/llvm-project/commit/f2ade65fb2a63679d48178608286753616809bc0
DIFF: https://github.com/llvm/llvm-project/commit/f2ade65fb2a63679d48178608286753616809bc0.diff

LOG: [CSSPGO] Even flow distribution

Differential Revision: https://reviews.llvm.org/D118640

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/profile-inference-even-count-distribution.prof
    llvm/test/Transforms/SampleProfile/profile-inference-even-count-distribution.ll

Modified: 
    llvm/lib/Transforms/Utils/SampleProfileInference.cpp
    llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
    llvm/test/Transforms/SampleProfile/profile-context-tracker.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
index 961adf2570a73..8fbebb6d4ddee 100644

--- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -15,15 +15,27 @@
 
 #include "llvm/Transforms/Utils/SampleProfileInference.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include <queue>
 #include <set>
+#include <stack>
 
 using namespace llvm;
 #define DEBUG_TYPE "sample-profile-inference"
 
 namespace {
 
+static cl::opt<bool> SampleProfileEvenCountDistribution(
+    "sample-profile-even-count-distribution", cl::init(true), cl::Hidden,
+    cl::ZeroOrMore,
+    cl::desc("Try to evenly distribute counts when there are multiple equally "
+             "likely options."));
+
+static cl::opt<unsigned> SampleProfileMaxDfsCalls(
+    "sample-profile-max-dfs-calls", cl::init(10), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Maximum number of dfs iterations for even count distribution."));
+
 /// A value indicating an infinite flow/capacity/weight of a block/edge.
 /// Not using numeric_limits<int64_t>::max(), as the values can be summed up
 /// during the execution.
@@ -52,16 +64,16 @@ class MinCostMaxFlow {
 
     Nodes = std::vector<Node>(NodeCount);
     Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
+    if (SampleProfileEvenCountDistribution)
+      AugmentingEdges =
+          std::vector<std::vector<Edge *>>(NodeCount, std::vector<Edge *>());
   }
 
   // Run the algorithm.
   int64_t run() {
-    // Find an augmenting path and update the flow along the path
-    size_t AugmentationIters = 0;
-    while (findAugmentingPath()) {
-      augmentFlowAlongPath();
-      AugmentationIters++;
-    }
+    // Iteratively find an augmentation path/dag in the network and send the
+    // flow along its edges
+    size_t AugmentationIters = applyFlowAugmentation();
 
     // Compute the total flow and its cost
     int64_t TotalCost = 0;
@@ -79,6 +91,7 @@ class MinCostMaxFlow {
                       << " iterations with " << TotalFlow << " total flow"
                       << " of " << TotalCost << " cost\n");
     (void)TotalFlow;
+    (void)AugmentationIters;
     return TotalCost;
   }
 
@@ -148,6 +161,55 @@ class MinCostMaxFlow {
   static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30;
 
 private:
+  /// Iteratively find an augmentation path/dag in the network and send the
+  /// flow along its edges. The method returns the number of applied iterations.
+  size_t applyFlowAugmentation() {
+    size_t AugmentationIters = 0;
+    while (findAugmentingPath()) {
+      uint64_t PathCapacity = computeAugmentingPathCapacity();
+      while (PathCapacity > 0) {
+        bool Progress = false;
+        if (SampleProfileEvenCountDistribution) {
+          // Identify node/edge candidates for augmentation
+          identifyShortestEdges(PathCapacity);
+
+          // Find an augmenting DAG
+          auto AugmentingOrder = findAugmentingDAG();
+
+          // Apply the DAG augmentation
+          Progress = augmentFlowAlongDAG(AugmentingOrder);
+          PathCapacity = computeAugmentingPathCapacity();
+        }
+
+        if (!Progress) {
+          augmentFlowAlongPath(PathCapacity);
+          PathCapacity = 0;
+        }
+
+        AugmentationIters++;
+      }
+    }
+    return AugmentationIters;
+  }
+
+  /// Compute the capacity of the canonical augmenting path. If the path is
+  /// saturated (that is, no flow can be sent along the path), then return 0.
+  uint64_t computeAugmentingPathCapacity() {
+    uint64_t PathCapacity = INF;
+    uint64_t Now = Target;
+    while (Now != Source) {
+      uint64_t Pred = Nodes[Now].ParentNode;
+      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+
+      assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow");
+      uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow);
+      PathCapacity = std::min(PathCapacity, EdgeCapacity);
+
+      Now = Pred;
+    }
+    return PathCapacity;
+  }
+
   /// Check for existence of an augmenting path with a positive capacity.
   bool findAugmentingPath() {
     // Initialize data structures
@@ -180,7 +242,7 @@ class MinCostMaxFlow {
       //    from Source to Target; it follows from inequalities
       //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
       //                         >= Dist[Source, V]
-      if (Nodes[Target].Distance == 0)
+      if (!SampleProfileEvenCountDistribution && Nodes[Target].Distance == 0)
         break;
       if (Nodes[Src].Distance > Nodes[Target].Distance)
         continue;
@@ -210,21 +272,9 @@ class MinCostMaxFlow {
   }
 
   /// Update the current flow along the augmenting path.
-  void augmentFlowAlongPath() {
-    // Find path capacity
-    int64_t PathCapacity = INF;
-    uint64_t Now = Target;
-    while (Now != Source) {
-      uint64_t Pred = Nodes[Now].ParentNode;
-      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
-      PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow);
-      Now = Pred;
-    }
-
+  void augmentFlowAlongPath(uint64_t PathCapacity) {
     assert(PathCapacity > 0 && "found an incorrect augmenting path");
-
-    // Update the flow along the path
-    Now = Target;
+    uint64_t Now = Target;
     while (Now != Source) {
       uint64_t Pred = Nodes[Now].ParentNode;
       auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
@@ -237,6 +287,220 @@ class MinCostMaxFlow {
     }
   }
 
+  /// Find an Augmenting DAG order using a modified version of DFS in which we
+  /// can visit a node multiple times. In the DFS search, when scanning each
+  /// edge out of a node, continue search at Edge.Dst endpoint if it has not
+  /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm
+  /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time.
+  /// It returns an Augmenting Order (Taken nodes in decreasing Finish time)
+  /// that starts with Source and ends with Target.
+  std::vector<uint64_t> findAugmentingDAG() {
+    // We use a stack based implementation of DFS to avoid recursion.
+    // Defining DFS data structures:
+    // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that
+    //  - we are currently visiting Nodes[NodeIdx] and
+    //  - the next edge to scan is Edges[NodeIdx][EdgeIdx]
+    typedef std::pair<uint64_t, uint64_t> StackItemType;
+    std::stack<StackItemType> Stack;
+    std::vector<uint64_t> AugmentingOrder;
+
+    // Phase 0: Initialize Node attributes and Time for DFS run
+    for (auto &Node : Nodes) {
+      Node.Discovery = 0;
+      Node.Finish = 0;
+      Node.NumCalls = 0;
+      Node.Taken = false;
+    }
+    uint64_t Time = 0;
+    // Mark Target as Taken
+    // Taken attribute will be propagated backwards from Target towards Source
+    Nodes[Target].Taken = true;
+
+    // Phase 1: Start DFS traversal from Source
+    Stack.emplace(Source, 0);
+    Nodes[Source].Discovery = ++Time;
+    while (!Stack.empty()) {
+      auto NodeIdx = Stack.top().first;
+      auto EdgeIdx = Stack.top().second;
+
+      // If we haven't scanned all edges out of NodeIdx, continue scanning
+      if (EdgeIdx < Edges[NodeIdx].size()) {
+        auto &Edge = Edges[NodeIdx][EdgeIdx];
+        auto &Dst = Nodes[Edge.Dst];
+        Stack.top().second++;
+
+        if (Edge.OnShortestPath) {
+          // If we haven't seen Edge.Dst so far, continue DFS search there
+          if (Dst.Discovery == 0 && Dst.NumCalls < SampleProfileMaxDfsCalls) {
+            Dst.Discovery = ++Time;
+            Stack.emplace(Edge.Dst, 0);
+            Dst.NumCalls++;
+          } else if (Dst.Taken && Dst.Finish != 0) {
+            // Else, if Edge.Dst already has a path to Target, so does NodeIdx
+            Nodes[NodeIdx].Taken = true;
+          }
+        }
+      } else {
+        // If we are done scanning all edges out of NodeIdx
+        Stack.pop();
+        // If we haven't found a path from NodeIdx to Target, forget about it
+        if (!Nodes[NodeIdx].Taken) {
+          Nodes[NodeIdx].Discovery = 0;
+        } else {
+          // If we have found a path from NodeIdx to Target, then finish NodeIdx
+          // and propagate Taken flag to DFS parent unless at the Source
+          Nodes[NodeIdx].Finish = ++Time;
+          // NodeIdx == Source if and only if the stack is empty
+          if (NodeIdx != Source) {
+            assert(!Stack.empty() && "empty stack while running dfs");
+            Nodes[Stack.top().first].Taken = true;
+          }
+          AugmentingOrder.push_back(NodeIdx);
+        }
+      }
+    }
+    // Nodes are collected in increasing Finish time, so the order is reversed
+    std::reverse(AugmentingOrder.begin(), AugmentingOrder.end());
+
+    // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges
+    for (size_t Src : AugmentingOrder) {
+      AugmentingEdges[Src].clear();
+      for (auto &Edge : Edges[Src]) {
+        uint64_t Dst = Edge.Dst;
+        if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken &&
+            Nodes[Dst].Finish < Nodes[Src].Finish) {
+          AugmentingEdges[Src].push_back(&Edge);
+        }
+      }
+      assert((Src == Target || !AugmentingEdges[Src].empty()) &&
+             "incorrectly constructed augmenting edges");
+    }
+
+    return AugmentingOrder;
+  }
+
+  /// Update the current flow along the given (acyclic) subgraph specified by
+  /// the vertex order, AugmentingOrder. The objective is to send as much flow
+  /// as possible while evenly distributing flow among successors of each node.
+  /// After the update at least one edge is saturated.
+  bool augmentFlowAlongDAG(const std::vector<uint64_t> &AugmentingOrder) {
+    // Phase 0: Initialization
+    for (uint64_t Src : AugmentingOrder) {
+      Nodes[Src].FracFlow = 0;
+      Nodes[Src].IntFlow = 0;
+      for (auto &Edge : AugmentingEdges[Src]) {
+        Edge->AugmentedFlow = 0;
+      }
+    }
+
+    // Phase 1: Send a unit of fractional flow along the DAG
+    uint64_t MaxFlowAmount = INF;
+    Nodes[Source].FracFlow = 1.0;
+    for (uint64_t Src : AugmentingOrder) {
+      assert((Src == Target || Nodes[Src].FracFlow > 0.0) &&
+             "incorrectly computed fractional flow");
+      // Distribute flow evenly among successors of Src
+      uint64_t Degree = AugmentingEdges[Src].size();
+      for (auto &Edge : AugmentingEdges[Src]) {
+        double EdgeFlow = Nodes[Src].FracFlow / Degree;
+        Nodes[Edge->Dst].FracFlow += EdgeFlow;
+        if (Edge->Capacity == INF)
+          continue;
+        uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow;
+        MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow);
+      }
+    }
+    // Stop early if we cannot send any (integral) flow from Source to Target
+    if (MaxFlowAmount == 0)
+      return false;
+
+    // Phase 2: Send an integral flow of MaxFlowAmount
+    Nodes[Source].IntFlow = MaxFlowAmount;
+    for (uint64_t Src : AugmentingOrder) {
+      if (Src == Target)
+        break;
+      // Distribute flow evenly among successors of Src, rounding up to make
+      // sure all flow is sent
+      uint64_t Degree = AugmentingEdges[Src].size();
+      // We are guaranteed that Nodes[Src].IntFlow <= SuccFlow * Degree
+      uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree;
+      for (auto &Edge : AugmentingEdges[Src]) {
+        uint64_t Dst = Edge->Dst;
+        uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow);
+        EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow));
+        Nodes[Dst].IntFlow += EdgeFlow;
+        Nodes[Src].IntFlow -= EdgeFlow;
+        Edge->AugmentedFlow += EdgeFlow;
+      }
+    }
+    assert(Nodes[Target].IntFlow <= MaxFlowAmount);
+    Nodes[Target].IntFlow = 0;
+
+    // Phase 3: Send excess flow back traversing the nodes backwards.
+    // Because of rounding, not all flow can be sent along the edges of Src.
+    // Hence, sending the remaining flow back to maintain flow conservation
+    for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) {
+      uint64_t Src = AugmentingOrder[Idx - 1];
+      // Try to send excess flow back along each edge.
+      // Make sure we only send back flow we just augmented (AugmentedFlow).
+      for (auto &Edge : AugmentingEdges[Src]) {
+        uint64_t Dst = Edge->Dst;
+        if (Nodes[Dst].IntFlow == 0)
+          continue;
+        uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow);
+        Nodes[Dst].IntFlow -= EdgeFlow;
+        Nodes[Src].IntFlow += EdgeFlow;
+        Edge->AugmentedFlow -= EdgeFlow;
+      }
+    }
+
+    // Phase 4: Update flow values along all edges
+    bool HasSaturatedEdges = false;
+    for (uint64_t Src : AugmentingOrder) {
+      // Verify that we have sent all the excess flow from the node
+      assert(Src == Source || Nodes[Src].IntFlow == 0);
+      for (auto &Edge : AugmentingEdges[Src]) {
+        assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow);
+        // Update flow values along the edge and its reverse copy
+        auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex];
+        Edge->Flow += Edge->AugmentedFlow;
+        RevEdge.Flow -= Edge->AugmentedFlow;
+        if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0)
+          HasSaturatedEdges = true;
+      }
+    }
+
+    // The augmentation is successful iff at least one edge becomes saturated
+    return HasSaturatedEdges;
+  }
+
+  /// Identify candidate (shortest) edges for augmentation.
+  void identifyShortestEdges(uint64_t PathCapacity) {
+    assert(PathCapacity > 0 && "found an incorrect augmenting DAG");
+    // To make sure the augmentation DAG contains only edges with large residual
+    // capacity, we prune all edges whose capacity is below a fraction of
+    // the capacity of the augmented path.
+    // (All edges of the path itself are always in the DAG)
+    uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1));
+
+    // Decide which edges are on a shortest path from Source to Target
+    for (size_t Src = 0; Src < Nodes.size(); Src++) {
+      // An edge cannot be augmenting if the endpoint has large distance
+      if (Nodes[Src].Distance > Nodes[Target].Distance)
+        continue;
+
+      for (auto &Edge : Edges[Src]) {
+        uint64_t Dst = Edge.Dst;
+        Edge.OnShortestPath =
+            Src != Target && Dst != Source &&
+            Nodes[Dst].Distance <= Nodes[Target].Distance &&
+            Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost &&
+            Edge.Capacity > Edge.Flow &&
+            uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity;
+      }
+    }
+  }
+
   /// A node in a flow network.
   struct Node {
     /// The cost of the cheapest path from the source to the current node.
@@ -247,7 +511,20 @@ class MinCostMaxFlow {
     uint64_t ParentEdgeIndex;
     /// An indicator of whether the current node is in a queue.
     bool Taken;
+
+    /// Data fields utilized in DAG-augmentation:
+    /// Fractional flow.
+    double FracFlow;
+    /// Integral flow.
+    uint64_t IntFlow;
+    /// Discovery time.
+    uint64_t Discovery;
+    /// Finish time.
+    uint64_t Finish;
+    /// Number of times the DFS has started a visit of the node.
+    uint64_t NumCalls;
   };
+
   /// An edge in a flow network.
   struct Edge {
     /// The cost of the edge.
@@ -260,6 +537,12 @@ class MinCostMaxFlow {
     uint64_t Dst;
     /// The index of the reverse edge between Dst and the current node.
     uint64_t RevEdgeIndex;
+
+    /// Data fields utilized in DAG-augmentation:
+    /// Whether the edge is currently on a shortest path from Source to Target.
+    bool OnShortestPath;
+    /// Extra flow along the edge.
+    uint64_t AugmentedFlow;
   };
 
   /// The set of network nodes.
@@ -270,6 +553,8 @@ class MinCostMaxFlow {
   uint64_t Source;
   /// Target (sink) node of the flow.
   uint64_t Target;
+  /// Augmenting edges.
+  std::vector<std::vector<Edge *>> AugmentingEdges;
 };
 
 /// A post-processing adjustment of control flow. It applies two steps by
@@ -511,7 +796,7 @@ class FlowAdjuster {
                            std::vector<FlowBlock *> &KnownDstBlocks,
                            std::vector<FlowBlock *> &UnknownBlocks) {
     // Run BFS from SrcBlock and make sure all paths are going through unknown
-    // blocks and end at a non-unknown DstBlock
+    // blocks and end at a known DstBlock
     auto Visited = BitVector(NumBlocks(), false);
     std::queue<uint64_t> Queue;
 

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-even-count-distribution.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-even-count-distribution.prof
new file mode 100644
index 0000000000000..7b9012c0f8dd9
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-even-count-distribution.prof
@@ -0,0 +1,16 @@
+foo1:37078302:0
+ 1: 1000
+ 2: 0
+ 3: 0
+ 4: 1000
+ !CFGChecksum: 157181141624
+
+foo2:37078302:0
+ 3: 1000
+ !CFGChecksum: 208782362068
+
+foo3:37078302:0
+ 1: 1000
+ 4: 1000
+ 6: 1000
+ !CFGChecksum: 189901498683

diff  --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
index b797b0031cf9c..0202177da3e0e 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
@@ -1,19 +1,19 @@
 ; Make sure Import GUID list for ThinLTO properly set for CSSPGO
-; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -S | FileCheck %s
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -sample-profile-even-count-distribution=0 -S | FileCheck %s
 ; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/csspgo-import-list.prof -o %t.prof
-; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.prof -S | FileCheck %s
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.prof -sample-profile-even-count-distribution=0 -S | FileCheck %s
 ; RUN: llvm-profdata show --sample -show-sec-info-only %t.prof | FileCheck %s --check-prefix=CHECK-ORDERED
 ; RUN: llvm-profdata merge --sample --extbinary --use-md5 %S/Inputs/csspgo-import-list.prof -o %t.md5
-; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.md5 -S | FileCheck %s
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.md5 -sample-profile-even-count-distribution=0 -S | FileCheck %s
 ; RUN: llvm-profdata show --sample -show-sec-info-only %t.md5 | FileCheck %s --check-prefix=CHECK-ORDERED
 
 ;; Validate that with replay in effect, we import call sites even if they are below the threshold
 ;; Baseline import decisions
-; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -profile-summary-hot-count=10000 -S | FileCheck %s --check-prefix=THRESHOLD
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -profile-summary-hot-count=10000 -sample-profile-even-count-distribution=0 -S | FileCheck %s --check-prefix=THRESHOLD
 ;; With replay
-; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -sample-profile-inline-replay=%S/Inputs/csspgo-import-list-replay.txt -sample-profile-inline-replay-scope=Module -profile-summary-hot-count=10000 -S | FileCheck %s --check-prefix=THRESHOLD-REPLAY
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -sample-profile-inline-replay=%S/Inputs/csspgo-import-list-replay.txt -sample-profile-inline-replay-scope=Module -profile-summary-hot-count=10000 -sample-profile-even-count-distribution=0 -S | FileCheck %s --check-prefix=THRESHOLD-REPLAY
 ;; With replay but no profile information for call to _Z5funcAi. We import _Z5funcAi because it's explicitly in the replay but don't go further to its callee (_Z3fibi) because we lack samples
-; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list-no-funca.prof -sample-profile-inline-replay=%S/Inputs/csspgo-import-list-replay.txt -sample-profile-inline-replay-scope=Module -profile-summary-hot-count=10000 -S | FileCheck %s --check-prefix=THRESHOLD-REPLAY-NO-FUNCA
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list-no-funca.prof -sample-profile-inline-replay=%S/Inputs/csspgo-import-list-replay.txt -sample-profile-inline-replay-scope=Module -profile-summary-hot-count=10000 -sample-profile-even-count-distribution=0 -S | FileCheck %s --check-prefix=THRESHOLD-REPLAY-NO-FUNCA
 
 declare i32 @_Z5funcBi(i32 %x)
 declare i32 @_Z5funcAi(i32 %x)

diff  --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
index 323f662afcdde..d49e2dc9b5f0e 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
@@ -8,10 +8,10 @@
 ;   main:3 @ _Z5funcAi
 ;   main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
 ;   _Z5funcBi:1 @ _Z8funcLeafi
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -sample-profile-even-count-distribution=0 -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -sample-profile-even-count-distribution=0 -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -sample-profile-even-count-distribution=0 -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -sample-profile-even-count-distribution=0 -S | FileCheck %s --check-prefix=INLINE-ALL
 ;
 ; Test we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
 ;   _Z5funcAi:1 @ _Z8funcLeafi
@@ -145,9 +145,9 @@ entry:
 ; INLINE-HOT-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0}
 ; INLINE-HOT-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 13}
 
-; INLINE-NONE: [[MAIN_PROF]] = !{!"function_entry_count", i64 1}
+; INLINE-NONE: [[MAIN_PROF]] = !{!"function_entry_count", i64 13}
 ; INLINE-NONE: [[FUNCA_PROF]] = !{!"function_entry_count", i64 24}
-; INLINE-NONE-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 22}
+; INLINE-NONE-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 21}
 ; INLINE-NONE-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 32}
 
 declare i32 @_Z3fibi(i32)

diff  --git a/llvm/test/Transforms/SampleProfile/profile-inference-even-count-distribution.ll b/llvm/test/Transforms/SampleProfile/profile-inference-even-count-distribution.ll
new file mode 100644
index 0000000000000..baffbc9656df6
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-even-count-distribution.ll
@@ -0,0 +1,173 @@
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-even-count-distribution.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-even-count-distribution.prof | opt -analyze -block-freq  -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
+
+; The test verifies that counts are evenly distributed among branches with
+; equal weights.
+;
+; +-----------+     +-----------+
+; | b3 [0]    | <-- | b1 [1000] |
+; +-----------+     +-----------+
+;   |                 |
+;   |                 |
+;   |                 v
+;   |               +-----------+
+;   |               | b2 [0]    |
+;   |               +-----------+
+;   |                 |
+;   |                 |
+;   |                 v
+;   |               +-----------+
+;   +-------------> | b4 [1000] |
+;                   +-----------+
+
+@yydebug = dso_local global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @foo1(i32 %0, i32 %1) #0 {
+b11:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b12, label %b13
+; CHECK:  edge b11 -> b12 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b11 -> b13 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b11: float = {{.*}}, int = {{.*}}, count = 1000
+
+b12:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 2, i32 0, i64 -1)
+  br label %b14
+; CHECK2: - b12: float = {{.*}}, int = {{.*}}, count = 500
+
+b13:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 3, i32 0, i64 -1)
+  br label %b14
+; CHECK2: - b13: float = {{.*}}, int = {{.*}}, count = 500
+
+b14:
+  call void @llvm.pseudoprobe(i64 7682762345278052905, i64 4, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b14: float = {{.*}}, int = {{.*}}, count = 1000
+}
+
+
+; The test verifies that counts are evenly distributed when the entry basic
+; block is dangling.
+;
+; +-----------+
+; |  b1 [?]   | -+
+; +-----------+  |
+;   |            |
+;   |            |
+;   v            |
+; +-----------+  |
+; |  b2 [?]   |  |
+; +-----------+  |
+;   |            |
+;   |            |
+;   v            |
+; +-----------+  |
+; | b3 [1000] | <+
+; +-----------+
+
+define dso_local i32 @foo2(i32 %0, i32 %1) #0 {
+b21:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b22, label %b23
+; CHECK:  edge b21 -> b22 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b21 -> b23 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b21: float = {{.*}}, int = {{.*}}, count = 1000
+
+b22:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 2, i32 0, i64 -1)
+  br label %b23
+; CHECK2: - b22: float = {{.*}}, int = {{.*}}, count = 500
+
+b23:
+  call void @llvm.pseudoprobe(i64 2494702099028631698, i64 3, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b23: float = {{.*}}, int = {{.*}}, count = 1000
+
+}
+
+; The test verifies even count distribution in the presence of multiple sinks.
+;
+;                +-----------+
+;                | b1 [1000] |
+;                +-----------+
+;                  |
+;                  |
+;                  v
+;                +-----------+
+;                |  b2 [?]   | -+
+;                +-----------+  |
+;                  |            |
+;                  |            |
+;                  v            |
+; +--------+     +-----------+  |
+; | b5 [?] | <-- |  b3 [?]   |  |
+; +--------+     +-----------+  |
+;   |              |            |
+;   |              |            |
+;   |              v            |
+;   |            +-----------+  |
+;   |            | b4 [1000] | <+
+;   |            +-----------+
+;   |              |
+;   |              |
+;   |              v
+;   |            +-----------+
+;   +----------> | b6 [1000] |
+;                +-----------+
+;
+
+define dso_local i32 @foo3(i32 %0, i32 %1) #0 {
+b31:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br label %b32
+; CHECK2: - b31: float = {{.*}}, int = {{.*}}, count = 1000
+
+b32:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 2, i32 0, i64 -1)
+  br i1 %cmp, label %b33, label %b34
+; CHECK:  edge b32 -> b33 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK:  edge b32 -> b34 probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK2: - b32: float = {{.*}}, int = {{.*}}, count = 1000
+
+b33:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b35, label %b34
+; CHECK:  edge b33 -> b35 probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK:  edge b33 -> b34 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+; CHECK2: - b33: float = {{.*}}, int = {{.*}}, count = 500
+
+b34:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 4, i32 0, i64 -1)
+  br label %b36
+; CHECK2: - b34: float = {{.*}}, int = {{.*}}, count = 1000
+
+b35:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 5, i32 0, i64 -1)
+  br label %b36
+; CHECK2: - b35: float = {{.*}}, int = {{.*}}, count = 0
+
+b36:
+  call void @llvm.pseudoprobe(i64 -7908226060800700466, i64 6, i32 0, i64 -1)
+  ret i32 %1
+; CHECK2: - b36: float = {{.*}}, int = {{.*}}, count = 1000
+}
+
+
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4
+
+attributes #0 = { noinline nounwind uwtable "use-sample-profile" }
+attributes #4 = { inaccessiblememonly nounwind willreturn }
+
+!llvm.pseudo_probe_desc = !{!7, !8, !9, !10}
+
+!7 = !{i64 7682762345278052905, i64 157181141624, !"foo1", null}
+!8 = !{i64 2494702099028631698, i64 208782362068, !"foo2", null}
+!9 = !{i64 -7908226060800700466, i64 189901498683, !"foo3", null}
+!10 = !{i64 -6882312132165544686, i64 241030178952, !"foo4", null}