[clang-tools-extra] [CodeLayout] Faster basic block reordering, ext-tsp (PR #68617)

via cfe-commits cfe-commits at lists.llvm.org
Tue Oct 24 16:07:17 PDT 2023


https://github.com/spupyrev updated https://github.com/llvm/llvm-project/pull/68617

>From ca0e18230d9b61aa8d65113d2ad0292f3b61c8a0 Mon Sep 17 00:00:00 2001
From: spupyrev <spupyrev at fb.com>
Date: Thu, 12 Oct 2023 17:32:36 -0700
Subject: [PATCH 1/4] review

---
 llvm/lib/Transforms/Utils/CodeLayout.cpp | 130 +++++++++++++----------
 1 file changed, 76 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index dea91dcac21ae14..3f325eaa21a4863 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -101,8 +101,8 @@ static cl::opt<unsigned> BackwardDistance(
 // The maximum size of a chain created by the algorithm. The size is bounded
 // so that the algorithm can efficiently process extremely large instances.
 static cl::opt<unsigned>
-    MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096),
-                 cl::desc("The maximum size of a chain to create."));
+    MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),
+                 cl::desc("The maximum size of a chain to create"));
 
 // The maximum size of a chain for splitting. Larger values of the threshold
 // may yield better quality at the cost of worsen run-time.
@@ -110,11 +110,10 @@ static cl::opt<unsigned> ChainSplitThreshold(
     "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
     cl::desc("The maximum size of a chain to apply splitting"));
 
-// The option enables splitting (large) chains along in-coming and out-going
-// jumps. This typically results in a better quality.
-static cl::opt<bool> EnableChainSplitAlongJumps(
-    "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
-    cl::desc("The maximum size of a chain to apply splitting"));
+// The maximum ratio between densities of two chains for merging.
+static cl::opt<double> MaxMergeDensityRatio(
+    "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100),
+    cl::desc("The maximum ratio between densities of two chains for merging"));
 
 // Algorithm-specific options for CDS.
 static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
@@ -222,6 +221,8 @@ struct NodeT {
 
   bool isEntry() const { return Index == 0; }
 
+  // Check if Other is a successor of the node.
+  bool isSuccessor(const NodeT *Other) const;
   // The total execution count of outgoing jumps.
   uint64_t outCount() const;
 
@@ -285,7 +286,7 @@ struct ChainT {
 
   size_t numBlocks() const { return Nodes.size(); }
 
-  double density() const { return static_cast<double>(ExecutionCount) / Size; }
+  double density() const { return ExecutionCount / Size; }
 
   bool isEntry() const { return Nodes[0]->Index == 0; }
 
@@ -346,8 +347,9 @@ struct ChainT {
   uint64_t Id;
   // Cached ext-tsp score for the chain.
   double Score{0};
-  // The total execution count of the chain.
-  uint64_t ExecutionCount{0};
+  // The total execution count of the chain. Since the execution count of
+  // a basic block is uint64_t, using doubles here to avoid overflow.
+  double ExecutionCount{0};
   // The total size of the chain.
   uint64_t Size{0};
   // Nodes of the chain.
@@ -442,6 +444,14 @@ struct ChainEdge {
   bool CacheValidBackward{false};
 };
 
+bool NodeT::isSuccessor(const NodeT *Other) const {
+  for (JumpT *Jump : OutJumps) {
+    if (Jump->Target == Other)
+      return true;
+  }
+  return false;
+}
+
 uint64_t NodeT::outCount() const {
   uint64_t Count = 0;
   for (JumpT *Jump : OutJumps)
@@ -507,8 +517,6 @@ struct MergedNodesT {
 
   const NodeT *getFirstNode() const { return *Begin1; }
 
-  bool empty() const { return Begin1 == End1; }
-
 private:
   NodeIter Begin1;
   NodeIter End1;
@@ -632,7 +640,8 @@ class ExtTSPImpl {
       }
     }
     for (JumpT &Jump : AllJumps) {
-      assert(OutDegree[Jump.Source->Index] > 0);
+      assert(OutDegree[Jump.Source->Index] > 0 &&
+             "incorrectly computed out-degree of the block");
       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
     }
 
@@ -725,6 +734,7 @@ class ExtTSPImpl {
       return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);
     };
 
+    double PrevScore = 1e9;
     while (HotChains.size() > 1) {
       ChainT *BestChainPred = nullptr;
       ChainT *BestChainSucc = nullptr;
@@ -734,14 +744,25 @@ class ExtTSPImpl {
         // Get candidates for merging with the current chain.
         for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
           // Ignore loop edges.
-          if (ChainPred == ChainSucc)
+          if (Edge->isSelfEdge())
             continue;
-
           // Stop early if the combined chain violates the maximum allowed size.
           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
             continue;
+          // Don't merge the chains if they have vastly different densities.
+          // We stop early if the ratio between the densities exceeds
+          // MaxMergeDensityRatio. Smaller values of the option result in
+          // fewer merges (hence, more chains), which in turn typically yields
+          // smaller size of the hot code section.
+          auto [minDensity, maxDensity] =
+              std::minmax(ChainPred->density(), ChainSucc->density());
+          assert(minDensity > 0.0 && maxDensity > 0.0 &&
+                 "incorrectly computed chain densities");
+          const double Ratio = maxDensity / minDensity;
+          if (Ratio > MaxMergeDensityRatio)
+            continue;
 
-          // Compute the gain of merging the two chains.
+          // Compute the gain of merging the two chains
           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
           if (CurGain.score() <= EPS)
             continue;
@@ -753,8 +774,15 @@ class ExtTSPImpl {
             BestGain = CurGain;
             BestChainPred = ChainPred;
             BestChainSucc = ChainSucc;
+            // Stop early when the merge is as good as the previous one.
+            if (BestGain.score() == PrevScore)
+              break;
           }
         }
+        // Since the score of merging (mostly) doesn't increase, we stop early
+        // when the newly found merge is as good as the previous one.
+        if (BestGain.score() == PrevScore)
+          break;
       }
 
       // Stop merging when there is no improvement.
@@ -762,6 +790,7 @@ class ExtTSPImpl {
         break;
 
       // Merge the best pair of chains.
+      PrevScore = BestGain.score();
       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
                   BestGain.mergeType());
     }
@@ -851,36 +880,42 @@ class ExtTSPImpl {
     Gain.updateIfLessThan(
         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
 
-    if (EnableChainSplitAlongJumps) {
-      // Attach (a part of) ChainPred before the first node of ChainSucc.
-      for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
-        const NodeT *SrcBlock = Jump->Source;
-        if (SrcBlock->CurChain != ChainPred)
-          continue;
-        size_t Offset = SrcBlock->CurIndex + 1;
-        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
-      }
+    // Attach (a part of) ChainPred before the first node of ChainSucc.
+    for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
+      const NodeT *SrcBlock = Jump->Source;
+      if (SrcBlock->CurChain != ChainPred)
+        continue;
+      size_t Offset = SrcBlock->CurIndex + 1;
+      tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
+    }
 
-      // Attach (a part of) ChainPred after the last node of ChainSucc.
-      for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
-        const NodeT *DstBlock = Jump->Target;
-        if (DstBlock->CurChain != ChainPred)
-          continue;
-        size_t Offset = DstBlock->CurIndex;
-        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
-      }
+    // Attach (a part of) ChainPred after the last node of ChainSucc.
+    for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
+      const NodeT *DstBlock = Jump->Target;
+      if (DstBlock->CurChain != ChainPred)
+        continue;
+      size_t Offset = DstBlock->CurIndex;
+      tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
     }
 
-    // Try to break ChainPred in various ways and concatenate with ChainSucc.
     if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
+      // Try to break ChainPred in various ways and concatenate with ChainSucc.
       for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
-        // Try to split the chain in different ways. In practice, applying
-        // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
-        // it from consideration to reduce the search space.
+        // Do not split the chain along a fall-through jump. One of the two
+        // loops above may still "break" such a jump whenever it results in a
+        // new fall-through.
+        const NodeT *BB = ChainPred->Nodes[Offset - 1];
+        const NodeT *BB2 = ChainPred->Nodes[Offset];
+        if (BB->isSuccessor(BB2))
+          continue;
+
+        // In practice, applying X2_Y_X1 merging almost never provides benefits;
+        // thus, we exclude it from consideration to reduce the search space.
         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
                                  MergeTypeT::X2_X1_Y});
       }
     }
+
     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
     return Gain;
   }
@@ -939,22 +974,11 @@ class ExtTSPImpl {
 
   /// Concatenate all chains into the final order.
   std::vector<uint64_t> concatChains() {
-    // Collect chains and calculate density stats for their sorting.
+    // Collect non-empty chains.
     std::vector<const ChainT *> SortedChains;
-    DenseMap<const ChainT *, double> ChainDensity;
     for (ChainT &Chain : AllChains) {
-      if (!Chain.Nodes.empty()) {
+      if (!Chain.Nodes.empty())
         SortedChains.push_back(&Chain);
-        // Using doubles to avoid overflow of ExecutionCounts.
-        double Size = 0;
-        double ExecutionCount = 0;
-        for (NodeT *Node : Chain.Nodes) {
-          Size += static_cast<double>(Node->Size);
-          ExecutionCount += static_cast<double>(Node->ExecutionCount);
-        }
-        assert(Size > 0 && "a chain of zero size");
-        ChainDensity[&Chain] = ExecutionCount / Size;
-      }
     }
 
     // Sorting chains by density in the decreasing order.
@@ -964,11 +988,9 @@ class ExtTSPImpl {
                 if (L->isEntry() != R->isEntry())
                   return L->isEntry();
 
-                const double DL = ChainDensity[L];
-                const double DR = ChainDensity[R];
                 // Compare by density and break ties by chain identifiers.
-                return std::make_tuple(-DL, L->Id) <
-                       std::make_tuple(-DR, R->Id);
+                return std::make_tuple(-L->density(), L->Id) <
+                       std::make_tuple(-R->density(), R->Id);
               });
 
     // Collect the nodes in the order specified by their chains.

>From e80e70d19a0c5706f387f267fcf37022e12d30c0 Mon Sep 17 00:00:00 2001
From: spupyrev <spupyrev at fb.com>
Date: Thu, 12 Oct 2023 18:03:52 -0700
Subject: [PATCH 2/4] adjusted tests

---
 llvm/test/CodeGen/X86/code_placement_ext_tsp.ll     | 13 ++-----------
 .../CodeGen/X86/code_placement_ext_tsp_large.ll     |  2 +-
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll
index 4053b8a8e123b1c..be0b9820e145415 100644
--- a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll
@@ -1,6 +1,5 @@
 ;; See also llvm/unittests/Transforms/Utils/CodeLayoutTest.cpp
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 < %s | FileCheck %s
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=0 -ext-tsp-enable-chain-split-along-jumps=0 < %s | FileCheck %s -check-prefix=CHECK2
 
 define void @func1a()  {
 ; Test that the algorithm positions the most likely successor first
@@ -329,8 +328,8 @@ end:
 }
 
 define void @func4() !prof !11 {
-; Test verifying that, if enabled, chains can be split in order to improve the
-; objective (by creating more fallthroughs)
+; Test verifying that chains can be split in order to improve the objective
+; by creating more fallthroughs
 ;
 ; +-------+
 ; | entry |--------+
@@ -354,19 +353,11 @@ define void @func4() !prof !11 {
 ; |  b2   | <+ ----+
 ; +-------+
 ;
-; With chain splitting enabled:
 ; CHECK-LABEL: func4:
 ; CHECK: entry
 ; CHECK: b1
 ; CHECK: b3
 ; CHECK: b2
-;
-; With chain splitting disabled:
-; CHECK2-LABEL: func4:
-; CHECK2: entry
-; CHECK2: b1
-; CHECK2: b2
-; CHECK2: b3
 
 entry:
   call void @b()
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
index cee8489e9aaea0c..3f06b536dc3a1e6 100644
--- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
@@ -6,7 +6,7 @@
 @yydebug = dso_local global i32 0, align 4
 
 define void @func_large() !prof !0 {
-; A largee CFG instance where chain splitting helps to
+; A large CFG instance where chain splitting helps to
 ; compute a better basic block ordering. The test verifies that with chain
 ; splitting, the resulting layout is improved (e.g., the score is increased).
 ;

>From 02eade74a5dd5d3bddb5c81e4b0c21b6efb6cc3b Mon Sep 17 00:00:00 2001
From: spupyrev <spupyrev at fb.com>
Date: Tue, 17 Oct 2023 18:38:47 -0700
Subject: [PATCH 3/4] Got rid of confusing PrevScore

---
 llvm/lib/Transforms/Utils/CodeLayout.cpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index 3f325eaa21a4863..12e8874ee22825f 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -734,7 +734,6 @@ class ExtTSPImpl {
       return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);
     };
 
-    double PrevScore = 1e9;
     while (HotChains.size() > 1) {
       ChainT *BestChainPred = nullptr;
       ChainT *BestChainSucc = nullptr;
@@ -774,15 +773,8 @@ class ExtTSPImpl {
             BestGain = CurGain;
             BestChainPred = ChainPred;
             BestChainSucc = ChainSucc;
-            // Stop early when the merge is as good as the previous one.
-            if (BestGain.score() == PrevScore)
-              break;
           }
         }
-        // Since the score of merging (mostly) doesn't increase, we stop early
-        // when the newly found merge is as good as the previous one.
-        if (BestGain.score() == PrevScore)
-          break;
       }
 
       // Stop merging when there is no improvement.
@@ -790,7 +782,6 @@ class ExtTSPImpl {
         break;
 
       // Merge the best pair of chains.
-      PrevScore = BestGain.score();
       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
                   BestGain.mergeType());
     }
@@ -898,8 +889,8 @@ class ExtTSPImpl {
       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
     }
 
+    // Try to break ChainPred in various ways and concatenate with ChainSucc.
     if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
-      // Try to break ChainPred in various ways and concatenate with ChainSucc.
       for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
         // Do not split the chain along a fall-through jump. One of the two
         // loops above may still "break" such a jump whenever it results in a

>From cad375b84149942a538c3fe0b50e349b6252b383 Mon Sep 17 00:00:00 2001
From: spupyrev <spupyrev at fb.com>
Date: Mon, 23 Oct 2023 12:52:53 -0700
Subject: [PATCH 4/4] adjusted comments

---
 llvm/lib/Transforms/Utils/CodeLayout.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index 12e8874ee22825f..dd54de32c6efdb5 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -745,14 +745,14 @@ class ExtTSPImpl {
           // Ignore loop edges.
           if (Edge->isSelfEdge())
             continue;
-          // Stop early if the combined chain violates the maximum allowed size.
+          // Skip the merge if the combined chain violates the maximum specified
+          // size.
           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
             continue;
           // Don't merge the chains if they have vastly different densities.
-          // We stop early if the ratio between the densities exceeds
-          // MaxMergeDensityRatio. Smaller values of the option result in
-          // fewer merges (hence, more chains), which in turn typically yields
-          // smaller size of the hot code section.
+          // Skip the merge if the ratio between the densities exceeds
+          // MaxMergeDensityRatio. Smaller values of the option result in fewer
+          // merges, and hence, more chains.
           auto [minDensity, maxDensity] =
               std::minmax(ChainPred->density(), ChainSucc->density());
           assert(minDensity > 0.0 && maxDensity > 0.0 &&
@@ -761,7 +761,7 @@ class ExtTSPImpl {
           if (Ratio > MaxMergeDensityRatio)
             continue;
 
-          // Compute the gain of merging the two chains
+          // Compute the gain of merging the two chains.
           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
           if (CurGain.score() <= EPS)
             continue;



More information about the cfe-commits mailing list