[llvm] 98dd2f9 - profi - a flow-based profile inference algorithm: Part II (out of 3)

Thu Dec 2 11:04:32 PST 2021

Author: spupyrev
Date: 2021-12-02T11:04:21-08:00
New Revision: 98dd2f9ed3ddb0a114582d48d48f781d9c80a2da

URL: https://github.com/llvm/llvm-project/commit/98dd2f9ed3ddb0a114582d48d48f781d9c80a2da
DIFF: https://github.com/llvm/llvm-project/commit/98dd2f9ed3ddb0a114582d48d48f781d9c80a2da.diff

LOG: profi - a flow-based profile inference algorithm: Part II (out of 3)

This is a continuation of D109860.

Traditional flow-based algorithms cannot guarantee that the resulting edge
frequencies correspond to a *connected* flow in the control-flow graph. For
example, for an instance in the attached figure, a flow-based (or any other)
inference algorithm may produce an output in which the hot loop is disconnected
from the entry block (refer to the rightmost graph in the figure). Furthermore,
creating a connected minimum-cost maximum flow is a computationally NP-hard
problem. Hence, we apply a post-processing adjustments to the computed flow
by connecting all isolated flow components ("islands").

This feature helps to keep all blocks with sample counts connected and results
in significant performance wins for some binaries.
{F19077343}

Reviewed By: hoy

Differential Revision: https://reviews.llvm.org/D109903

Added: 
    llvm/test/Transforms/SampleProfile/Inputs/profile-inference-islands.prof
    llvm/test/Transforms/SampleProfile/profile-inference-islands.ll

Modified: 
    llvm/lib/Transforms/Utils/SampleProfileInference.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
index 9495e442e0bf0..daf5d751b5aa0 100644

--- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -271,6 +271,177 @@ class MinCostMaxFlow {
   uint64_t Target;
 };
 
+/// Post-processing adjustment of the control flow.
+class FlowAdjuster {
+public:
+  FlowAdjuster(FlowFunction &Func) : Func(Func) {
+    assert(Func.Blocks[Func.Entry].isEntry() &&
+           "incorrect index of the entry block");
+  }
+
+  // Run the post-processing
+  void run() {
+    /// We adjust the control flow in a function so as to remove all
+    /// "isolated" components with positive flow that are unreachable
+    /// from the entry block. For every such component, we find the shortest
+    /// path from the entry to an exit passing through the component, and
+    /// increase the flow by one unit along the path.
+    joinIsolatedComponents();
+  }
+
+private:
+  void joinIsolatedComponents() {
+    // Find blocks that are reachable from the source
+    auto Visited = std::vector<bool>(NumBlocks(), false);
+    findReachable(Func.Entry, Visited);
+
+    // Iterate over all non-reachable blocks and adjust their weights
+    for (uint64_t I = 0; I < NumBlocks(); I++) {
+      auto &Block = Func.Blocks[I];
+      if (Block.Flow > 0 && !Visited[I]) {
+        // Find a path from the entry to an exit passing through the block I
+        auto Path = findShortestPath(I);
+        // Increase the flow along the path
+        assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
+               "incorrectly computed path adjusting control flow");
+        Func.Blocks[Func.Entry].Flow += 1;
+        for (auto &Jump : Path) {
+          Jump->Flow += 1;
+          Func.Blocks[Jump->Target].Flow += 1;
+          // Update reachability
+          findReachable(Jump->Target, Visited);
+        }
+      }
+    }
+  }
+
+  /// Run bfs from a given block along the jumps with a positive flow and mark
+  /// all reachable blocks.
+  void findReachable(uint64_t Src, std::vector<bool> &Visited) {
+    if (Visited[Src])
+      return;
+    std::queue<uint64_t> Queue;
+    Queue.push(Src);
+    Visited[Src] = true;
+    while (!Queue.empty()) {
+      Src = Queue.front();
+      Queue.pop();
+      for (auto Jump : Func.Blocks[Src].SuccJumps) {
+        uint64_t Dst = Jump->Target;
+        if (Jump->Flow > 0 && !Visited[Dst]) {
+          Queue.push(Dst);
+          Visited[Dst] = true;
+        }
+      }
+    }
+  }
+
+  /// Find the shortest path from the entry block to an exit block passing
+  /// through a given block.
+  std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) {
+    // A path from the entry block to BlockIdx
+    auto ForwardPath = findShortestPath(Func.Entry, BlockIdx);
+    // A path from BlockIdx to an exit block
+    auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock);
+
+    // Concatenate the two paths
+    std::vector<FlowJump *> Result;
+    Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end());
+    Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end());
+    return Result;
+  }
+
+  /// Apply the Dijkstra algorithm to find the shortest path from a given
+  /// Source to a given Target block.
+  /// If Target == -1, then the path ends at an exit block.
+  std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
+    // Quit early, if possible
+    if (Source == Target)
+      return std::vector<FlowJump *>();
+    if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
+      return std::vector<FlowJump *>();
+
+    // Initialize data structures
+    auto Distance = std::vector<int64_t>(NumBlocks(), INF);
+    auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
+    Distance[Source] = 0;
+    std::set<std::pair<uint64_t, uint64_t>> Queue;
+    Queue.insert(std::make_pair(Distance[Source], Source));
+
+    // Run the Dijkstra algorithm
+    while (!Queue.empty()) {
+      uint64_t Src = Queue.begin()->second;
+      Queue.erase(Queue.begin());
+      // If we found a solution, quit early
+      if (Src == Target ||
+          (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
+        break;
+
+      for (auto Jump : Func.Blocks[Src].SuccJumps) {
+        uint64_t Dst = Jump->Target;
+        int64_t JumpDist = jumpDistance(Jump);
+        if (Distance[Dst] > Distance[Src] + JumpDist) {
+          Queue.erase(std::make_pair(Distance[Dst], Dst));
+
+          Distance[Dst] = Distance[Src] + JumpDist;
+          Parent[Dst] = Jump;
+
+          Queue.insert(std::make_pair(Distance[Dst], Dst));
+        }
+      }
+    }
+    // If Target is not provided, find the closest exit block
+    if (Target == AnyExitBlock) {
+      for (uint64_t I = 0; I < NumBlocks(); I++) {
+        if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
+          if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
+            Target = I;
+          }
+        }
+      }
+    }
+    assert(Parent[Target] != nullptr && "a path does not exist");
+
+    // Extract the constructed path
+    std::vector<FlowJump *> Result;
+    uint64_t Now = Target;
+    while (Now != Source) {
+      assert(Now == Parent[Now]->Target && "incorrect parent jump");
+      Result.push_back(Parent[Now]);
+      Now = Parent[Now]->Source;
+    }
+    // Reverse the path, since it is extracted from Target to Source
+    std::reverse(Result.begin(), Result.end());
+    return Result;
+  }
+
+  /// A distance of a path for a given jump.
+  /// In order to incite the path to use blocks/jumps with large positive flow,
+  /// and avoid changing branch probability of outgoing edges drastically,
+  /// set the distance as follows:
+  ///   if Jump.Flow > 0, then distance = max(100 - Jump->Flow, 0)
+  ///   if Block.Weight > 0, then distance = 1
+  ///   otherwise distance >> 1
+  int64_t jumpDistance(FlowJump *Jump) const {
+    int64_t BaseDistance = 100;
+    if (Jump->IsUnlikely)
+      return MinCostMaxFlow::AuxCostUnlikely;
+    if (Jump->Flow > 0)
+      return std::max(BaseDistance - (int64_t)Jump->Flow, (int64_t)0);
+    if (Func.Blocks[Jump->Target].Weight > 0)
+      return BaseDistance;
+    return BaseDistance * (NumBlocks() + 1);
+  };
+
+  uint64_t NumBlocks() const { return Func.Blocks.size(); }
+
+  /// A constant indicating an arbitrary exit block of a function.
+  static constexpr uint64_t AnyExitBlock = uint64_t(-1);
+
+  /// The function.
+  FlowFunction &Func;
+};
+
 /// Initializing flow network for a given function.
 ///
 /// Every block is split into three nodes that are responsible for (i) an
@@ -440,6 +611,39 @@ void verifyWeights(const FlowFunction &Func) {
     }
   }
   assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
+
+  // Verify that there are no isolated flow components
+  // One could modify FlowFunction to hold edges indexed by the sources, which
+  // will avoid a creation of the object
+  auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
+  for (auto &Jump : Func.Jumps) {
+    if (Jump.Flow > 0) {
+      PositiveFlowEdges[Jump.Source].push_back(Jump.Target);
+    }
+  }
+
+  // Run bfs from the source along edges with positive flow
+  std::queue<uint64_t> Queue;
+  auto Visited = std::vector<bool>(NumBlocks, false);
+  Queue.push(Func.Entry);
+  Visited[Func.Entry] = true;
+  while (!Queue.empty()) {
+    uint64_t Src = Queue.front();
+    Queue.pop();
+    for (uint64_t Dst : PositiveFlowEdges[Src]) {
+      if (!Visited[Dst]) {
+        Queue.push(Dst);
+        Visited[Dst] = true;
+      }
+    }
+  }
+
+  // Verify that every block that has a positive flow is reached from the source
+  // along edges with a positive flow
+  for (uint64_t I = 0; I < NumBlocks; I++) {
+    auto &Block = Func.Blocks[I];
+    assert((Visited[I] || Block.Flow == 0) && "an isolated flow component");
+  }
 }
 #endif
 
@@ -455,6 +659,10 @@ void llvm::applyFlowInference(FlowFunction &Func) {
   // Extract flow values for every block and every edge
   extractWeights(InferenceNetwork, Func);
 
+  // Post-processing adjustments to the flow
+  auto Adjuster = FlowAdjuster(Func);
+  Adjuster.run();
+
 #ifndef NDEBUG
   // Verify the result
   verifyWeights(Func);

diff  --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-islands.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-islands.prof
new file mode 100644
index 0000000000000..59cf1b1de3300
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-islands.prof
@@ -0,0 +1,27 @@
+islands_1:10822:0
+ 1: 1
+ 2: 100
+ 3: 100
+ 4: 1
+ 5: 0
+ 6: 1
+ 7: 0
+ !CFGChecksum: 120879332589
+
+islands_2:108:0
+ 1: 0
+ 2: 10000
+ 3: 10000
+ 4: 0
+ !CFGChecksum: 69495280403
+
+islands_3:108:0
+ 1: 10
+ 2: 0
+ 3: 10
+ 4: 0
+ 5: 1000
+ 6: 1000
+ 7: 0
+ 8: 10
+ !CFGChecksum: 156608410269

diff  --git a/llvm/test/Transforms/SampleProfile/profile-inference-islands.ll b/llvm/test/Transforms/SampleProfile/profile-inference-islands.ll
new file mode 100644
index 0000000000000..a6145c51ec8d5
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-islands.ll
@@ -0,0 +1,212 @@
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-islands.prof -S -o %t
+; RUN: FileCheck %s < %t -check-prefix=CHECK-ENTRY-COUNT
+; RUN: opt < %t -analyze -block-freq -enable-new-pm=0 | FileCheck %s
+
+
+; The test contains an isolated flow component ("island") that needs to be
+; reconnected to the entry point via edges with a positive flow.
+; The corresponding CFG is shown below:
+;
+; +--------+     +--------+     +----------+
+; | b6 [1] | <-- | b4 [1] | <-- |  b1 [1]  |
+; +--------+     +--------+     +----------+
+;                  |              |
+;                  |              |
+;                  v              v
+;                +--------+     +----------+
+;                | b5 [0] |     | b2 [100] | <+
+;                +--------+     +----------+  |
+;                                 |           |
+;                                 |           |
+;                                 v           |
+;                               +----------+  |
+;                               | b3 [100] | -+
+;                               +----------+
+;                                 |
+;                                 |
+;                                 v
+;                               +----------+
+;                               |  b7 [0]  |
+;                               +----------+
+
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @islands_1(i32 %0, i32 %1) #0 {
+b1:
+  call void @llvm.pseudoprobe(i64 -5646793257986063976, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b2, label %b4
+; CHECK: - b1: float = {{.*}}, int = {{.*}}, count = 2
+
+b2:
+  call void @llvm.pseudoprobe(i64 -5646793257986063976, i64 2, i32 0, i64 -1)
+  br label %b3
+; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 101
+
+b3:
+  call void @llvm.pseudoprobe(i64 -5646793257986063976, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b2, label %b7
+; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 101
+
+b4:
+  call void @llvm.pseudoprobe(i64 -5646793257986063976, i64 4, i32 0, i64 -1)
+  br i1 %cmp, label %b5, label %b6
+; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 1
+
+b5:
+  call void @llvm.pseudoprobe(i64 -5646793257986063976, i64 5, i32 0, i64 -1)
+  ret i32 %1
+; CHECK: - b5: float = {{.*}}, int = {{.*}}, count = 0
+
+b6:
+  call void @llvm.pseudoprobe(i64 -5646793257986063976, i64 6, i32 0, i64 -1)
+  ret i32 %1
+; CHECK: - b6: float = {{.*}}, int = {{.*}}, count = 1
+
+b7:
+  call void @llvm.pseudoprobe(i64 -5646793257986063976, i64 7, i32 0, i64 -1)
+  ret i32 %1
+; CHECK: - b7: float = {{.*}}, int = {{.*}}, count = 1
+
+}
+
+; Another test with an island.
+;
+;  +----------+
+;  |  b1 [0]  |
+;  +----------+
+;   |
+;   |
+;   v
+;  +----------+
+;  | b2 [100] | <+
+;  +----------+  |
+;   |            |
+;   |            |
+;   v            |
+;  +----------+  |
+;  | b3 [100] | -+
+;  +----------+
+;   |
+;   |
+;   v
+;  +----------+
+;  |  b4 [0]  |
+;  +----------+
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @islands_2(i32 %0, i32 %1) #1 {
+b1:
+  call void @llvm.pseudoprobe(i64 -7683376842751444845, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  br label %b2
+; CHECK: - b1: float = {{.*}}, int = {{.*}}, count = 1
+
+b2:
+  call void @llvm.pseudoprobe(i64 -7683376842751444845, i64 2, i32 0, i64 -1)
+  br label %b3
+; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 10001
+
+b3:
+  call void @llvm.pseudoprobe(i64 -7683376842751444845, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b2, label %b4
+; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 10001
+
+b4:
+  call void @llvm.pseudoprobe(i64 -7683376842751444845, i64 4, i32 0, i64 -1)
+  ret i32 %1
+; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 1
+}
+
+
+; The test verifies that the island is connected to the entry block via a
+; cheapest path (that is, passing through blocks with large counts).
+;
+; +---------+     +---------+     +----------+     +--------+
+; | b8 [10] | <-- | b3 [10] | <-- | b1 [10]  | --> | b4 [0] |
+; +---------+     +---------+     +----------+     +--------+
+;                   |               |                |
+;                   |               |                |
+;                   |               v                |
+;                   |             +----------+       |
+;                   |             |  b2 [0]  |       |
+;                   |             +----------+       |
+;                   |               |                |
+;                   |               |                |
+;                   |               v                v
+;                   |             +-------------------------+
+;                   +-----------> |        b5 [100]         |
+;                                 +-------------------------+
+;                                   |           ^
+;                                   |           |
+;                                   v           |
+;                                 +----------+  |
+;                                 | b6 [100] | -+
+;                                 +----------+
+;                                   |
+;                                   |
+;                                   v
+;                                 +----------+
+;                                 |  b7 [0]  |
+;                                 +----------+
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @islands_3(i32 %0, i32 %1) #1 {
+b1:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 1, i32 0, i64 -1)
+  %cmp = icmp ne i32 %0, 0
+  switch i32 %1, label %b2 [
+    i32 1, label %b3
+    i32 2, label %b4
+  ]
+; CHECK: - b1: float = {{.*}}, int = {{.*}}, count = 11
+
+b2:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 2, i32 0, i64 -1)
+  br label %b5
+; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 0
+
+b3:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 3, i32 0, i64 -1)
+  br i1 %cmp, label %b8, label %b5
+; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 11
+
+b4:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 4, i32 0, i64 -1)
+  ret i32 %1
+; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 0
+
+b5:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 5, i32 0, i64 -1)
+  br label %b6
+; CHECK: - b5: float = {{.*}}, int = {{.*}}, count = 1001
+
+b6:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 6, i32 0, i64 -1)
+  br i1 %cmp, label %b7, label %b5
+; CHECK: - b6: float = {{.*}}, int = {{.*}}, count = 1001
+
+b7:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 7, i32 0, i64 -1)
+  ret i32 %1
+; CHECK: - b7: float = {{.*}}, int = {{.*}}, count = 1
+
+b8:
+  call void @llvm.pseudoprobe(i64 -9095645063288297061, i64 8, i32 0, i64 -1)
+  ret i32 %1
+; CHECK: - b8: float = {{.*}}, int = {{.*}}, count = 10
+}
+
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #2
+
+attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
+attributes #1 = { noinline nounwind uwtable "use-sample-profile"}
+attributes #2 = { nounwind }
+
+!llvm.pseudo_probe_desc = !{!7, !8}
+
+!7 = !{i64 -5646793257986063976, i64 120879332589, !"islands_1"}
+!8 = !{i64 -7683376842751444845, i64 69495280403, !"islands_2"}
+!9 = !{i64 -9095645063288297061, i64 156608410269, !"islands_3"}
+
+; CHECK-ENTRY-COUNT: = !{!"function_entry_count", i64 2}