[llvm] [BOLT] CDSplit: A New 3-Way Function Splitting Algorithm (PR #72225)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 15 14:30:56 PST 2023


https://github.com/ShatianWang updated https://github.com/llvm/llvm-project/pull/72225

From 9c169ef928f9874e77751c044d2170c210f7722c Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Thu, 2 Nov 2023 12:26:49 -0700
Subject: [PATCH 1/9] [BOLT] Extend calculateEmittedSize for Block Size
 Calculation

This commit modifies BinaryContext::calculateEmittedSize to update the
BinaryBasicBlock::OutputAddressRange of each basic block in the input
BF. The update is done in place, so that BB.OutputAddressRange.second
minus BB.OutputAddressRange.first gives the emitted size of the basic
block.
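
For illustration, the populated ranges can be consumed as follows (a minimal
usage sketch, not part of the patch; it assumes a BinaryFunction &BF on which
BC.calculateEmittedSize(BF) has already run and only uses accessors that
appear elsewhere in this series):

  // Each basic block's OutputAddressRange now holds [start, end) offsets,
  // so the emitted size of a block is simply end minus start.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    const std::pair<uint64_t, uint64_t> Range = BB->getOutputAddressRange();
    const uint64_t EmittedSize = Range.second - Range.first;
    (void)EmittedSize; // e.g., feed into size-aware splitting heuristics
  }
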
---
 bolt/lib/Core/BinaryContext.cpp | 45 ++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index fd4d09964b725e1..a68168a2303159c 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -2305,14 +2305,47 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
   MCAsmLayout Layout(Assembler);
   Assembler.layout(Layout);
 
+  // Obtain main fragment size.
   const uint64_t HotSize =
       Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel);
-  const uint64_t ColdSize =
-      std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL,
-                      [&](const uint64_t Accu, const LabelRange &Labels) {
-                        return Accu + Layout.getSymbolOffset(*Labels.second) -
-                               Layout.getSymbolOffset(*Labels.first);
-                      });
+  // Populate new start and end offsets of each basic block in main.
+  BinaryBasicBlock *PrevBB = nullptr;
+  for (BinaryBasicBlock *BB : BF.getLayout().getMainFragment()) {
+    uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel()));
+    BB->setOutputStartAddress(BBStartOffset);
+    if (PrevBB)
+      PrevBB->setOutputEndAddress(BBStartOffset);
+    PrevBB = BB;
+  }
+  if (PrevBB)
+    PrevBB->setOutputEndAddress(HotSize);
+
+  // Obtain split fragment sizes.
+  std::vector<uint64_t> SplitFragmentSizes;
+  uint64_t ColdSize = 0;
+  for (const auto &Labels : SplitLabels) {
+    uint64_t Size = Layout.getSymbolOffset(*Labels.second) -
+                    Layout.getSymbolOffset(*Labels.first);
+    SplitFragmentSizes.push_back(Size);
+    ColdSize += Size;
+  }
+
+  // Populate new start and end offsets of each basic block in split fragments.
+  PrevBB = nullptr;
+  uint64_t FragmentInd = 0;
+  for (FunctionFragment &FF : BF.getLayout().getSplitFragments()) {
+    for (BinaryBasicBlock *BB : FF) {
+      uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel()));
+      BB->setOutputStartAddress(BBStartOffset);
+      if (PrevBB)
+        PrevBB->setOutputEndAddress(BBStartOffset);
+      PrevBB = BB;
+    }
+    if (PrevBB)
+      PrevBB->setOutputEndAddress(SplitFragmentSizes[FragmentInd]);
+    FragmentInd++;
+    PrevBB = nullptr;
+  }
 
   // Clean-up the effect of the code emission.
   for (const MCSymbol &Symbol : Assembler.symbols()) {

From 7b32fdea1b8a646ea0d1792b581534f7809c854d Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Wed, 8 Nov 2023 08:20:11 -0800
Subject: [PATCH 2/9] [BOLT] Refactor SplitFunctions for Function Reuse

This commit updates SplitFunctions.h and SplitFunctions.cpp to enable
the reuse of createEHTrampolines, mergeEHTrampolines, hasFullProfile,
and allBlocksCold by a distinct function splitting pass (CDSplit).
---
 bolt/include/bolt/Core/BinaryFunction.h   | 14 ++++++++++
 bolt/include/bolt/Passes/SplitFunctions.h | 32 +++++++++++------------
 bolt/lib/Passes/SplitFunctions.cpp        | 18 +++----------
 3 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index b1adf2da0da3e15..27b997753116970 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -1276,6 +1276,20 @@ class BinaryFunction {
   /// otherwise processed.
   bool isPseudo() const { return IsPseudo; }
 
+  /// Return true if every block in the function has a valid execution count.
+  bool hasFullProfile() const {
+    return llvm::all_of(blocks(), [](const BinaryBasicBlock &BB) {
+      return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
+    });
+  }
+
+  /// Return true if every block in the function has a zero execution count.
+  bool allBlocksCold() const {
+    return llvm::all_of(blocks(), [](const BinaryBasicBlock &BB) {
+      return BB.getExecutionCount() == 0;
+    });
+  }
+
   /// Return true if the function contains explicit or implicit indirect branch
   /// to its split fragments, e.g., split jump table, landing pad in split
   /// fragment.
diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h
index 4058f3317dfbdbb..91b6d5518eaab26 100644
--- a/bolt/include/bolt/Passes/SplitFunctions.h
+++ b/bolt/include/bolt/Passes/SplitFunctions.h
@@ -50,6 +50,19 @@ class SplitFunctions : public BinaryFunctionPass {
   /// Split function body into fragments.
   void splitFunction(BinaryFunction &Function, SplitStrategy &Strategy);
 
+  std::atomic<uint64_t> SplitBytesHot{0ull};
+  std::atomic<uint64_t> SplitBytesCold{0ull};
+
+public:
+  explicit SplitFunctions(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  bool shouldOptimize(const BinaryFunction &BF) const override;
+
+  const char *getName() const override { return "split-functions"; }
+
+  void runOnFunctions(BinaryContext &BC) override;
+
   struct TrampolineKey {
     FragmentNum SourceFN = FragmentNum::main();
     const MCSymbol *Target = nullptr;
@@ -81,27 +94,14 @@ class SplitFunctions : public BinaryFunctionPass {
   /// corresponding thrower block. The trampoline landing pad, when created,
   /// will redirect the execution to the real landing pad in a different
   /// fragment.
-  TrampolineSetType createEHTrampolines(BinaryFunction &Function) const;
+  static TrampolineSetType createEHTrampolines(BinaryFunction &Function);
 
   /// Merge trampolines into \p Layout without trampolines. The merge will place
   /// a trampoline immediately before its destination. Used to revert the effect
   /// of trampolines after createEHTrampolines().
-  BasicBlockOrderType
+  static BasicBlockOrderType
   mergeEHTrampolines(BinaryFunction &BF, BasicBlockOrderType &Layout,
-                     const TrampolineSetType &Trampolines) const;
-
-  std::atomic<uint64_t> SplitBytesHot{0ull};
-  std::atomic<uint64_t> SplitBytesCold{0ull};
-
-public:
-  explicit SplitFunctions(const cl::opt<bool> &PrintPass)
-      : BinaryFunctionPass(PrintPass) {}
-
-  bool shouldOptimize(const BinaryFunction &BF) const override;
-
-  const char *getName() const override { return "split-functions"; }
-
-  void runOnFunctions(BinaryContext &BC) override;
+                     const TrampolineSetType &Trampolines);
 };
 
 } // namespace bolt
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index 34973cecdf49161..223f8d17367845d 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -109,21 +109,9 @@ static cl::opt<SplitFunctionsStrategy> SplitStrategy(
 } // namespace opts
 
 namespace {
-bool hasFullProfile(const BinaryFunction &BF) {
-  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
-    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
-  });
-}
-
-bool allBlocksCold(const BinaryFunction &BF) {
-  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
-    return BB.getExecutionCount() == 0;
-  });
-}
-
 struct SplitProfile2 final : public SplitStrategy {
   bool canSplit(const BinaryFunction &BF) override {
-    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
+    return BF.hasValidProfile() && BF.hasFullProfile() && !BF.allBlocksCold();
   }
 
   bool keepEmpty() override { return false; }
@@ -434,7 +422,7 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
 }
 
 SplitFunctions::TrampolineSetType
-SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
+SplitFunctions::createEHTrampolines(BinaryFunction &BF) {
   const auto &MIB = BF.getBinaryContext().MIB;
 
   // Map real landing pads to the corresponding trampolines.
@@ -501,7 +489,7 @@ SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
 
 SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
     BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
-    const SplitFunctions::TrampolineSetType &Trampolines) const {
+    const SplitFunctions::TrampolineSetType &Trampolines) {
   DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
       IncomingTrampolines;
   for (const auto &Entry : Trampolines) {

From 7e608a933c652885fd1cb3d725422b849af30f27 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Thu, 2 Nov 2023 20:51:52 -0700
Subject: [PATCH 3/9] [BOLT] Setup CDSplit Pass Structure

This commit establishes the general structure of the CDSplit
implementation without incorporating the exact splitting logic.
Currently, all functions undergo hot-cold splitting based on the
decisions made by the SplitFunctions pass. Subsequent commits
will introduce the precise splitting logic.
---
 bolt/include/bolt/Passes/CDSplit.h     |  63 ++++++++
 bolt/lib/Passes/CDSplit.cpp            | 208 +++++++++++++++++++++++++
 bolt/lib/Passes/CMakeLists.txt         |   1 +
 bolt/lib/Passes/SplitFunctions.cpp     |  12 ++
 bolt/lib/Rewrite/BinaryPassManager.cpp |  10 ++
 bolt/lib/Utils/CommandLineOpts.cpp     |   6 +
 6 files changed, 300 insertions(+)
 create mode 100644 bolt/include/bolt/Passes/CDSplit.h
 create mode 100644 bolt/lib/Passes/CDSplit.cpp

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
new file mode 100644
index 000000000000000..96a982683a7ec26
--- /dev/null
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -0,0 +1,63 @@
+//===- bolt/Passes/CDSplit.h - Split functions into hot/warm/cold
+// after function reordering pass -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_PASSES_CDSPLIT
+#define BOLT_PASSES_CDSPLIT
+
+#include "bolt/Passes/SplitFunctions.h"
+#include <atomic>
+
+namespace llvm {
+namespace bolt {
+
+using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
+
+class CDSplit : public BinaryFunctionPass {
+private:
+  /// Overall stats.
+  std::atomic<uint64_t> SplitBytesHot{0ull};
+  std::atomic<uint64_t> SplitBytesCold{0ull};
+
+  /// List of functions to be considered.
+  /// All functions in the list are used to construct a call graph.
+  /// A subset of functions in this list are considered for splitting.
+  std::vector<BinaryFunction *> FunctionsToConsider;
+
+  /// Helper functions to initialize global variables.
+  void initialize(BinaryContext &BC);
+
+  /// Split function body into 3 fragments: hot / warm / cold.
+  void runOnFunction(BinaryFunction &BF);
+
+  /// Assign each basic block in the given function to either hot, cold,
+  /// or warm fragment using the CDSplit algorithm.
+  void assignFragmentThreeWay(const BinaryFunction &BF,
+                              const BasicBlockOrder &BlockOrder);
+
+  /// Find the best split index that separates hot from warm.
+  /// The basic block whose index equals the returned split index will be the
+  /// last hot block.
+  size_t findSplitIndex(const BinaryFunction &BF,
+                        const BasicBlockOrder &BlockOrder);
+
+public:
+  explicit CDSplit(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  bool shouldOptimize(const BinaryFunction &BF) const override;
+
+  const char *getName() const override { return "cdsplit"; }
+
+  void runOnFunctions(BinaryContext &BC) override;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
new file mode 100644
index 000000000000000..cd67b24241a4249
--- /dev/null
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -0,0 +1,208 @@
+//===- bolt/Passes/CDSplit.cpp - Pass for splitting function code 3-way
+//--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CDSplit pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/CDSplit.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "bolt/Utils/CommandLineOpts.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/MathExtras.h"
+
+#define DEBUG_TYPE "bolt-opts"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltOptCategory;
+
+extern cl::opt<bool> UseCDSplit;
+extern cl::opt<bool> SplitEH;
+extern cl::opt<unsigned> ExecutionCountThreshold;
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+/// Return true if the function should be considered for building call graph.
+bool shouldConsider(const BinaryFunction &BF) {
+  return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
+}
+} // anonymous namespace
+
+bool CDSplit::shouldOptimize(const BinaryFunction &BF) const {
+  // Do not split functions with a small execution count.
+  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
+    return false;
+
+  // Do not split functions with at least one block that has no known
+  // execution count due to incomplete information.
+  // Do not split functions with only zero-execution count blocks
+  // as there is not enough variation in block count to justify splitting.
+  if (!BF.hasFullProfile() || BF.allBlocksCold())
+    return false;
+
+  return BinaryFunctionPass::shouldOptimize(BF);
+}
+
+/// Initialize algorithm's metadata.
+void CDSplit::initialize(BinaryContext &BC) {
+  // Construct a list of functions that are considered for building call graph.
+  // Only those in this list for which shouldOptimize evaluates to true are
+  // candidates for 3-way splitting.
+  std::vector<BinaryFunction *> SortedFunctions = BC.getSortedFunctions();
+  FunctionsToConsider.reserve(SortedFunctions.size());
+  for (BinaryFunction *BF : SortedFunctions) {
+    if (shouldConsider(*BF))
+      FunctionsToConsider.push_back(BF);
+  }
+}
+
+/// Find the best index for splitting. The returned value is the index of the
+/// last hot basic block. Hence, "no splitting" corresponds to returning one
+/// less than the number of basic blocks in the function.
+size_t CDSplit::findSplitIndex(const BinaryFunction &BF,
+                               const BasicBlockOrder &BlockOrder) {
+  // Placeholder: hot-cold splitting.
+  return BF.getLayout().getMainFragment().size() - 1;
+}
+
+/// Assign each basic block in the given function to either hot, cold,
+/// or warm fragment using the CDSplit algorithm.
+void CDSplit::assignFragmentThreeWay(const BinaryFunction &BF,
+                                     const BasicBlockOrder &BlockOrder) {
+  size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
+
+  // Assign fragments based on the computed best split index.
+  // All basic blocks with index up to the best split index become hot.
+  // All remaining blocks become warm / cold depending on whether their
+  // execution count is greater than 0.
+  FragmentNum Main(0);
+  FragmentNum Warm(1);
+  FragmentNum Cold(2);
+  for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+    BinaryBasicBlock *BB = BlockOrder[Index];
+    if (Index <= BestSplitIndex)
+      BB->setFragmentNum(Main);
+    else
+      BB->setFragmentNum(BB->getKnownExecutionCount() > 0 ? Warm : Cold);
+  }
+}
+
+void CDSplit::runOnFunction(BinaryFunction &BF) {
+  assert(!BF.empty() && "splitting an empty function");
+
+  FunctionLayout &Layout = BF.getLayout();
+  BinaryContext &BC = BF.getBinaryContext();
+
+  BasicBlockOrder NewLayout(Layout.block_begin(), Layout.block_end());
+  // Never outline the first basic block.
+  NewLayout.front()->setCanOutline(false);
+  for (BinaryBasicBlock *BB : NewLayout) {
+    if (!BB->canOutline())
+      continue;
+
+    // Do not split extra entry points in aarch64. They can be referenced
+    // via ADR instructions, and when this happens these blocks cannot be
+    // placed far away due to the limited range of the ADR instruction.
+    if (BC.isAArch64() && BB->isEntryPoint()) {
+      BB->setCanOutline(false);
+      continue;
+    }
+
+    if (BF.hasEHRanges() && !opts::SplitEH) {
+      // We cannot move landing pads (or rather entry points for landing pads).
+      if (BB->isLandingPad()) {
+        BB->setCanOutline(false);
+        continue;
+      }
+      // We cannot move a block that can throw since exception-handling
+      // runtime cannot deal with split functions. However, if we can guarantee
+      // that the block never throws, it is safe to move the block to
+      // decrease the size of the function.
+      for (MCInst &Instr : *BB) {
+        if (BC.MIB->isInvoke(Instr)) {
+          BB->setCanOutline(false);
+          break;
+        }
+      }
+    }
+  }
+
+  // Assign each basic block in NewLayout to either hot, warm, or cold fragment.
+  assignFragmentThreeWay(BF, NewLayout);
+
+  // Make sure all non-outlineable blocks are in the main-fragment.
+  for (BinaryBasicBlock *BB : NewLayout) {
+    if (!BB->canOutline())
+      BB->setFragmentNum(FragmentNum::main());
+  }
+
+  // If any non-outlineable blocks previously in warm or cold were moved to
+  // main by the preceding loop, place them at the end of the main fragment.
+  llvm::stable_sort(NewLayout,
+                    [&](const BinaryBasicBlock *L, const BinaryBasicBlock *R) {
+                      return L->getFragmentNum() < R->getFragmentNum();
+                    });
+
+  BF.getLayout().update(NewLayout);
+
+  // For shared objects, invoke instructions and corresponding landing pads
+  // have to be placed in the same fragment. When we split them, create
+  // trampoline landing pads that will redirect the execution to real LPs.
+  SplitFunctions::TrampolineSetType Trampolines;
+  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
+    Trampolines = SplitFunctions::createEHTrampolines(BF);
+
+  if (BC.isX86() && BF.isSplit()) {
+    size_t HotSize;
+    size_t ColdSize;
+    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
+    SplitBytesHot += HotSize;
+    SplitBytesCold += ColdSize;
+  }
+}
+
+void CDSplit::runOnFunctions(BinaryContext &BC) {
+  if (!opts::UseCDSplit)
+    return;
+
+  // Initialize global variables.
+  initialize(BC);
+
+  // Only functions satisfying shouldConsider and shouldOptimize are candidates
+  // for splitting.
+  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+    return !(shouldConsider(BF) && shouldOptimize(BF));
+  };
+
+  // Make function splitting decisions in parallel.
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
+      [&](BinaryFunction &BF) { runOnFunction(BF); }, SkipFunc, "CDSplit",
+      /*ForceSequential=*/false);
+
+  if (SplitBytesHot + SplitBytesCold > 0) {
+    outs() << "BOLT-INFO: cdsplit separates " << SplitBytesHot
+           << " hot bytes from " << SplitBytesCold << " cold bytes "
+           << format("(%.2lf%% of split functions is in the main fragment)\n",
+                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
+
+  } else
+    outs() << "BOLT-INFO: cdsplit didn't split any functions\n";
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index b8bbe59a64480c9..4cc4b4fa6ae345c 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMBOLTPasses
   CacheMetrics.cpp
   CallGraph.cpp
   CallGraphWalker.cpp
+  CDSplit.cpp
   DataflowAnalysis.cpp
   DataflowInfoManager.cpp
   FrameAnalysis.cpp
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index 223f8d17367845d..0c11d0fb49cd09c 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -60,6 +60,7 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<bool> SplitEH;
 extern cl::opt<unsigned> ExecutionCountThreshold;
 extern cl::opt<uint32_t> RandomSeed;
+extern cl::opt<bool> UseCDSplit;
 
 static cl::opt<bool> AggressiveSplitting(
     "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
@@ -231,6 +232,17 @@ bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
 }
 
 void SplitFunctions::runOnFunctions(BinaryContext &BC) {
+  if (opts::UseCDSplit &&
+      !(opts::SplitFunctions &&
+        opts::SplitStrategy == SplitFunctionsStrategy::Profile2)) {
+    errs() << "BOLT-ERROR: -use-cdsplit should be applied together with "
+              "-split-functions using default -split-strategy=profile2. "
+              "-split-functions 2-way splits functions before the function "
+              "reordering pass, while -use-cdsplit 3-way splits functions "
+              "after the function reordering pass. \n";
+    exit(1);
+  }
+
   if (!opts::SplitFunctions)
     return;
 
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index cd27c71ba2aedf2..daa54d8d5e647d9 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -11,6 +11,7 @@
 #include "bolt/Passes/Aligner.h"
 #include "bolt/Passes/AllocCombiner.h"
 #include "bolt/Passes/AsmDump.h"
+#include "bolt/Passes/CDSplit.h"
 #include "bolt/Passes/CMOVConversion.h"
 #include "bolt/Passes/FixRISCVCallsPass.h"
 #include "bolt/Passes/FixRelaxationPass.h"
@@ -177,6 +178,10 @@ static cl::opt<bool>
     PrintSplit("print-split", cl::desc("print functions after code splitting"),
                cl::Hidden, cl::cat(BoltOptCategory));
 
+static cl::opt<bool> PrintCDSplit("print-cdsplit",
+                                  cl::desc("print functions after cdsplit"),
+                                  cl::Hidden, cl::cat(BoltOptCategory));
+
 static cl::opt<bool>
     PrintStoke("print-stoke", cl::desc("print functions after stoke analysis"),
                cl::Hidden, cl::cat(BoltOptCategory));
@@ -424,6 +429,11 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   Manager.registerPass(
       std::make_unique<ReorderFunctions>(PrintReorderedFunctions));
 
+  /// This pass three-way splits functions after function reordering.
+  Manager.registerPass(std::make_unique<CDSplit>(PrintCDSplit));
+
+  Manager.registerPass(std::make_unique<FixupBranches>(PrintAfterBranchFixup));
+
   // Print final dyno stats right while CFG and instruction analysis are intact.
   Manager.registerPass(
       std::make_unique<DynoStatsPrintPass>(
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index 0c0e83c4ec9703b..e71dcc680cac1bd 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -196,6 +196,12 @@ cl::opt<unsigned>
               cl::init(0), cl::ZeroOrMore, cl::cat(BoltCategory),
               cl::sub(cl::SubCommand::getAll()));
 
+cl::opt<bool>
+    UseCDSplit("use-cdsplit",
+               cl::desc("split functions into 3 fragments using the CDSplit "
+                        "algorithm after function reordering pass"),
+               cl::init(false), cl::cat(BoltOptCategory));
+
 bool processAllFunctions() {
   if (opts::AggregateOnly)
     return false;

From a8fe566d2c9e766854f644a0543e943a10b068ce Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Tue, 31 Oct 2023 14:57:00 -0700
Subject: [PATCH 4/9] [BOLT] Introduce .text.warm for -use-cdsplit=1

This commit explicitly adds a warm code section, .text.warm, when the
-use-cdsplit=1 flag is set. This replaces the previous approach of using
.text.cold.0 as warm and .text.cold.1 as cold in 3-way splitting.
---
 bolt/include/bolt/Core/BinaryBasicBlock.h |  4 +---
 bolt/include/bolt/Core/BinaryContext.h    |  2 ++
 bolt/include/bolt/Core/BinaryFunction.h   | 16 ++++++++------
 bolt/include/bolt/Core/FunctionLayout.h   |  5 ++++-
 bolt/lib/Core/BinaryBasicBlock.cpp        | 19 ++++++++++------
 bolt/lib/Core/BinaryEmitter.cpp           |  6 ++++-
 bolt/lib/Core/BinaryFunction.cpp          | 13 +++++++++++
 bolt/lib/Passes/BinaryPasses.cpp          |  4 +++-
 bolt/lib/Passes/CDSplit.cpp               |  1 -
 bolt/lib/Passes/IndirectCallPromotion.cpp |  6 +++--
 bolt/lib/Passes/SplitFunctions.cpp        |  6 ++---
 bolt/lib/Rewrite/RewriteInstance.cpp      | 27 ++++++++++++++++-------
 12 files changed, 75 insertions(+), 34 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index bc95e2c4de3a11e..32e48b964d73933 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -677,9 +677,7 @@ class BinaryBasicBlock {
     return isSplit();
   }
 
-  void setIsCold(const bool Flag) {
-    Fragment = Flag ? FragmentNum::cold() : FragmentNum::main();
-  }
+  void setIsCold(const bool Flag);
 
   /// Return true if the block can be outlined. At the moment we disallow
   /// outlining of blocks that can potentially throw exceptions or are
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index a4e84cb93c093dc..6e68799bfbff7b7 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -927,6 +927,8 @@ class BinaryContext {
 
   const char *getMainCodeSectionName() const { return ".text"; }
 
+  const char *getWarmCodeSectionName() const { return ".text.warm"; }
+
   const char *getColdCodeSectionName() const { return ".text.cold"; }
 
   const char *getHotTextMoverSectionName() const { return ".text.mover"; }
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 27b997753116970..ff4ccb757d00c0d 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -359,6 +359,9 @@ class BinaryFunction {
   /// Name for the section this function code should reside in.
   std::string CodeSectionName;
 
+  /// Name for the corresponding warm code section.
+  std::string WarmCodeSectionName;
+
   /// Name for the corresponding cold code section.
   std::string ColdCodeSectionName;
 
@@ -1235,13 +1238,7 @@ class BinaryFunction {
 
   /// Return internal section name for this function.
   SmallString<32>
-  getCodeSectionName(const FragmentNum Fragment = FragmentNum::main()) const {
-    if (Fragment == FragmentNum::main())
-      return SmallString<32>(CodeSectionName);
-    if (Fragment == FragmentNum::cold())
-      return SmallString<32>(ColdCodeSectionName);
-    return formatv("{0}.{1}", ColdCodeSectionName, Fragment.get() - 1);
-  }
+  getCodeSectionName(const FragmentNum Fragment = FragmentNum::main()) const;
 
   /// Assign a code section name to the function.
   void setCodeSectionName(const StringRef Name) {
@@ -1254,6 +1251,11 @@ class BinaryFunction {
     return BC.getUniqueSectionByName(getCodeSectionName(Fragment));
   }
 
+  /// Assign a section name for the warm part of the function.
+  void setWarmCodeSectionName(const StringRef Name) {
+    WarmCodeSectionName = Name.str();
+  }
+
   /// Assign a section name for the cold part of the function.
   void setColdCodeSectionName(const StringRef Name) {
     ColdCodeSectionName = Name.str();
diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h
index 904da3a4a93aade..94d71a84aae8334 100644
--- a/bolt/include/bolt/Core/FunctionLayout.h
+++ b/bolt/include/bolt/Core/FunctionLayout.h
@@ -62,7 +62,10 @@ class FragmentNum {
   }
 
   static constexpr FragmentNum main() { return FragmentNum(0); }
-  static constexpr FragmentNum cold() { return FragmentNum(1); }
+  static constexpr FragmentNum warm() { return FragmentNum(1); }
+  static constexpr FragmentNum cold(bool Flag = false) {
+    return FragmentNum(Flag ? 2 : 1);
+  }
 };
 
 /// A freestanding subset of contiguous blocks of a function.
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index 984bc6dbd220ab0..40a655c0cd24b3a 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -15,17 +15,25 @@
 #include "bolt/Core/BinaryFunction.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Errc.h"
 
 #define DEBUG_TYPE "bolt"
 
-namespace llvm {
-namespace bolt {
+using namespace llvm;
+using namespace bolt;
+namespace opts {
+extern cl::opt<bool> UseCDSplit;
+}
 
 constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET;
 
-bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) {
-  return LHS.Index < RHS.Index;
+bool bolt::operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) {
+  return LHS.getIndex() < RHS.getIndex();
+}
+
+void BinaryBasicBlock::setIsCold(const bool Flag) {
+  Fragment = Flag ? FragmentNum::cold(opts::UseCDSplit) : FragmentNum::main();
 }
 
 bool BinaryBasicBlock::hasCFG() const { return getParent()->hasCFG(); }
@@ -611,6 +619,3 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) {
 
   return NewBlock;
 }
-
-} // namespace bolt
-} // namespace llvm
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index fb1bf530c1974aa..9829c6bc107f03a 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -34,6 +34,7 @@ namespace opts {
 
 extern cl::opt<JumpTableSupportLevel> JumpTables;
 extern cl::opt<bool> PreserveBlocksAlignment;
+extern cl::opt<bool> UseCDSplit;
 
 cl::opt<bool> AlignBlocks("align-blocks", cl::desc("align basic blocks"),
                           cl::cat(BoltOptCategory));
@@ -287,7 +288,10 @@ void BinaryEmitter::emitFunctions() {
 
   // Mark the end of hot text.
   if (opts::HotText) {
-    Streamer.switchSection(BC.getTextSection());
+    if (opts::UseCDSplit)
+      Streamer.switchSection(BC.getCodeSection(BC.getWarmCodeSectionName()));
+    else
+      Streamer.switchSection(BC.getTextSection());
     Streamer.emitLabel(BC.getHotTextEndSymbol());
   }
 }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 1f4a7cc35247425..cf0b00721eb1340 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -61,6 +61,7 @@ extern cl::opt<bool> Instrument;
 extern cl::opt<bool> KeepNops;
 extern cl::opt<bool> StrictMode;
 extern cl::opt<bool> UpdateDebugSections;
+extern cl::opt<bool> UseCDSplit;
 extern cl::opt<unsigned> Verbosity;
 
 extern bool processAllFunctions();
@@ -169,6 +170,18 @@ template <typename R> static bool emptyRange(const R &Range) {
   return Range.begin() == Range.end();
 }
 
+/// Return internal section name for this function.
+SmallString<32>
+BinaryFunction::getCodeSectionName(const FragmentNum Fragment) const {
+  if (Fragment == FragmentNum::main())
+    return SmallString<32>(CodeSectionName);
+  if (Fragment == FragmentNum::cold(opts::UseCDSplit))
+    return SmallString<32>(ColdCodeSectionName);
+  if (Fragment == FragmentNum::warm())
+    return SmallString<32>(WarmCodeSectionName);
+  return formatv("{0}.{1}", ColdCodeSectionName, Fragment.get() - 1);
+}
+
 /// Gets debug line information for the instruction located at the given
 /// address in the original binary. The SMLoc's pointer is used
 /// to point to this information, which is represented by a
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 4e1343e2c30be56..c60a03d3d984720 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1244,8 +1244,10 @@ void AssignSections::runOnFunctions(BinaryContext &BC) {
     else
       Function.setCodeSectionName(BC.getColdCodeSectionName());
 
-    if (Function.isSplit())
+    if (Function.isSplit()) {
+      Function.setWarmCodeSectionName(BC.getWarmCodeSectionName());
       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
+    }
   }
 }
 
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index cd67b24241a4249..509d250c3419183 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -24,7 +24,6 @@ using namespace llvm;
 using namespace bolt;
 
 namespace opts {
-
 extern cl::OptionCategory BoltOptCategory;
 
 extern cl::opt<bool> UseCDSplit;
diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp
index 8b084c3b63d565a..a46df74d7fd7bbc 100644
--- a/bolt/lib/Passes/IndirectCallPromotion.cpp
+++ b/bolt/lib/Passes/IndirectCallPromotion.cpp
@@ -34,6 +34,7 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<IndirectCallPromotionType> ICP;
 extern cl::opt<unsigned> Verbosity;
 extern cl::opt<unsigned> ExecutionCountThreshold;
+extern cl::opt<bool> UseCDSplit;
 
 static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
     "icp-jt-remaining-percent-threshold",
@@ -259,9 +260,10 @@ IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
       MCSymbol *Entry = JT->Entries[I];
       const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry);
       assert(ToBB || Entry == BF.getFunctionEndLabel() ||
-             Entry == BF.getFunctionEndLabel(FragmentNum::cold()));
+             Entry ==
+                 BF.getFunctionEndLabel(FragmentNum::cold(opts::UseCDSplit)));
       if (Entry == BF.getFunctionEndLabel() ||
-          Entry == BF.getFunctionEndLabel(FragmentNum::cold()))
+          Entry == BF.getFunctionEndLabel(FragmentNum::cold(opts::UseCDSplit)))
         continue;
       const Location To(Entry);
       const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB);
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index 0c11d0fb49cd09c..a5eb19724c31458 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -115,12 +115,12 @@ struct SplitProfile2 final : public SplitStrategy {
     return BF.hasValidProfile() && BF.hasFullProfile() && !BF.allBlocksCold();
   }
 
-  bool keepEmpty() override { return false; }
+  bool keepEmpty() override { return opts::UseCDSplit; }
 
   void fragment(const BlockIt Start, const BlockIt End) override {
     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
       if (BB->getExecutionCount() == 0)
-        BB->setFragmentNum(FragmentNum::cold());
+        BB->setFragmentNum(FragmentNum::cold(opts::UseCDSplit));
     }
   }
 };
@@ -144,7 +144,7 @@ struct SplitRandom2 final : public SplitStrategy {
     std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
     const DiffT SplitPoint = Dist(Gen);
     for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
-      BB->setFragmentNum(FragmentNum::cold());
+      BB->setFragmentNum(FragmentNum::cold(opts::UseCDSplit));
 
     LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                  "{1} possible) blocks to split\n",
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index ce3402c5827ae6d..88b93ab12282035 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -84,6 +84,7 @@ extern cl::opt<JumpTableSupportLevel> JumpTables;
 extern cl::list<std::string> ReorderData;
 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
 extern cl::opt<bool> TimeBuild;
+extern cl::opt<bool> UseCDSplit;
 
 cl::opt<bool> AllowStripped("allow-stripped",
                             cl::desc("allow processing of stripped binaries"),
@@ -3480,11 +3481,21 @@ std::vector<BinarySection *> RewriteInstance::getCodeSections() {
     if (B->getName() == BC->getHotTextMoverSectionName())
       return false;
 
-    // Depending on the option, put main text at the beginning or at the end.
-    if (opts::HotFunctionsAtEnd)
-      return B->getName() == BC->getMainCodeSectionName();
-    else
-      return A->getName() == BC->getMainCodeSectionName();
+    // Depending on opts::HotFunctionsAtEnd, place main and warm sections in
+    // order.
+    if (opts::HotFunctionsAtEnd) {
+      if (B->getName() == BC->getMainCodeSectionName())
+        return true;
+      if (A->getName() == BC->getMainCodeSectionName())
+        return false;
+      return (B->getName() == BC->getWarmCodeSectionName());
+    } else {
+      if (A->getName() == BC->getMainCodeSectionName())
+        return true;
+      if (B->getName() == BC->getMainCodeSectionName())
+        return false;
+      return (A->getName() == BC->getWarmCodeSectionName());
+    }
   };
 
   // Determine the order of sections.
@@ -3641,7 +3652,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
            "non-relocation mode.");
 
     FunctionFragment &FF =
-        Function.getLayout().getFragment(FragmentNum::cold());
+        Function.getLayout().getFragment(FragmentNum::cold(opts::UseCDSplit));
     ErrorOr<BinarySection &> ColdSection =
         Function.getCodeSection(FF.getFragmentNum());
     assert(ColdSection && "cannot find section for cold part");
@@ -4686,8 +4697,8 @@ void RewriteInstance::updateELFSymbolTable(
       SmallVector<char, 256> Buf;
       NewColdSym.st_name = AddToStrTab(
           Twine(Function->getPrintName()).concat(".cold.0").toStringRef(Buf));
-      const FunctionFragment &ColdFF =
-          Function->getLayout().getFragment(FragmentNum::cold());
+      const FunctionFragment &ColdFF = Function->getLayout().getFragment(
+          FragmentNum::cold(opts::UseCDSplit));
       NewColdSym.st_value = ColdFF.getAddress();
       NewColdSym.st_size = ColdFF.getImageSize();
       Symbols.emplace_back(NewColdSym);

From 6c0211eb7031dc97fd550f12f6ec8a3bb2d4104a Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Tue, 7 Nov 2023 06:36:42 -0800
Subject: [PATCH 5/9] [BOLT] Initialize Auxiliary Variables

This diff defines and initializes auxiliary variables used by CDSplit.
---
 bolt/include/bolt/Passes/CDSplit.h | 19 +++++++
 bolt/lib/Passes/CDSplit.cpp        | 82 ++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
index 96a982683a7ec26..ef03b9f65e2b648 100644
--- a/bolt/include/bolt/Passes/CDSplit.h
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -18,7 +18,14 @@ namespace bolt {
 
 using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
 
+struct JumpInfo {
+  bool HasUncondBranch = false;
+  BinaryBasicBlock *CondSuccessor = nullptr;
+  BinaryBasicBlock *UncondSuccessor = nullptr;
+};
+
 class CDSplit : public BinaryFunctionPass {
+
 private:
   /// Overall stats.
   std::atomic<uint64_t> SplitBytesHot{0ull};
@@ -29,6 +36,18 @@ class CDSplit : public BinaryFunctionPass {
   /// A subset of functions in this list are considered for splitting.
   std::vector<BinaryFunction *> FunctionsToConsider;
 
+  /// Auxiliary variables used by the algorithm.
+  size_t TotalNumBlocks{0};
+  size_t OrigHotSectionSize{0};
+  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
+  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
+  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
+  // Call graph.
+  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
+  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;
+  // Conditional and unconditional successors of each BB.
+  DenseMap<const BinaryBasicBlock *, JumpInfo> JumpInfos;
+
   /// Helper functions to initialize global variables.
   void initialize(BinaryContext &BC);
 
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index 509d250c3419183..6b6f1454512aa02 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -39,6 +39,25 @@ namespace {
 bool shouldConsider(const BinaryFunction &BF) {
   return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
 }
+
+/// Find (un)conditional branch instruction info of the basic block.
+JumpInfo analyzeBranches(BinaryBasicBlock *BB) {
+  JumpInfo BBJumpInfo;
+  const MCSymbol *TBB = nullptr;
+  const MCSymbol *FBB = nullptr;
+  MCInst *CondBranch = nullptr;
+  MCInst *UncondBranch = nullptr;
+  if (BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
+    BBJumpInfo.HasUncondBranch = UncondBranch != nullptr;
+    if (BB->succ_size() == 1) {
+      BBJumpInfo.UncondSuccessor = BB->getSuccessor();
+    } else if (BB->succ_size() == 2) {
+      BBJumpInfo.CondSuccessor = BB->getConditionalSuccessor(true);
+      BBJumpInfo.UncondSuccessor = BB->getConditionalSuccessor(false);
+    }
+  }
+  return BBJumpInfo;
+}
 } // anonymous namespace
 
 bool CDSplit::shouldOptimize(const BinaryFunction &BF) const {
@@ -67,6 +86,69 @@ void CDSplit::initialize(BinaryContext &BC) {
     if (shouldConsider(*BF))
       FunctionsToConsider.push_back(BF);
   }
+
+  // Initialize auxiliary variables.
+  for (BinaryFunction *BF : FunctionsToConsider) {
+    // Calculate the size of each BB after hot-cold splitting.
+    // This populates BinaryBasicBlock::OutputAddressRange which
+    // can be used to compute the size of each BB.
+    BC.calculateEmittedSize(*BF, /*FixBranches=*/true);
+
+    for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
+      // Unique global index.
+      GlobalIndices[BB] = TotalNumBlocks;
+      TotalNumBlocks++;
+
+      // Block size after hot-cold splitting.
+      BBSizes[BB] = BB->getOutputAddressRange().second -
+                    BB->getOutputAddressRange().first;
+
+      // Hot block offset after hot-cold splitting.
+      BBOffsets[BB] = OrigHotSectionSize;
+      if (!BB->isSplit())
+        OrigHotSectionSize += BBSizes[BB];
+
+      // Conditional and unconditional successors.
+      JumpInfos[BB] = analyzeBranches(BB);
+    }
+  }
+
+  // Build call graph.
+  Callers.resize(TotalNumBlocks);
+  Callees.resize(TotalNumBlocks);
+  for (BinaryFunction *SrcFunction : FunctionsToConsider) {
+    for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
+      // Skip blocks that are not executed
+      if (SrcBB.getKnownExecutionCount() == 0)
+        continue;
+
+      // Find call instructions and extract target symbols from each one
+      for (const MCInst &Inst : SrcBB) {
+        if (!BC.MIB->isCall(Inst))
+          continue;
+
+        // Call info
+        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
+        // Ignore calls w/o information
+        if (!DstSym)
+          continue;
+
+        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
+        // Ignore calls that do not have a valid target, but do not ignore
+        // recursive calls, because caller block could be moved to warm.
+        if (!DstFunction || DstFunction->getLayout().block_empty())
+          continue;
+
+        const BinaryBasicBlock *DstBB = &(DstFunction->front());
+
+        // Record the call only if DstBB is also in FunctionsToConsider.
+        if (GlobalIndices.contains(DstBB)) {
+          Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
+          Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
+        }
+      }
+    }
+  }
 }
 
 /// Find the best index for splitting. The returned value is the index of the

From de24efcb88bd45850ebe1fbfbc21916b4ce39658 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Mon, 13 Nov 2023 17:16:59 -0800
Subject: [PATCH 6/9] [BOLT] CDSplit Main Logic Part 1/3

The first diff in a series of 3 that implements the main logic of
CDSplit. On X86, function splitting can increase block sizes: conditional
and unconditional branch instructions whose offset fits in 8 bits can be
encoded in 2 bytes, while larger offsets require 6 and 5 bytes
respectively. Splitting a short conditional / unconditional branch thus
increases the size of the source basic block by 4 and 3 bytes
respectively. CDSplit takes this potential block size increase into
account when making splitting decisions. This diff implements a function,
estimatePostSplitBBAddress, in CDSplit that approximates the block-level
size increase at the given split index of the given function.
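
A compact sketch of the size arithmetic this models (the constants mirror the
encodings described above; the helper name is illustrative only and not part
of the patch):

  // Short (8-bit offset) branches are 2 bytes; long forms are 6 bytes
  // (conditional) and 5 bytes (unconditional), so re-encoding a short branch
  // as a long one grows the source block by 4 or 3 bytes respectively.
  // ShortBranchSize is also the cost of materializing a new unconditional
  // branch when a fall-through is split.
  constexpr unsigned ShortBranchSize = 2;
  constexpr unsigned LongCondBranchDelta = 4;   // 6 - 2
  constexpr unsigned LongUncondBranchDelta = 3; // 5 - 2

  unsigned branchSizeDelta(bool IsConditional) {
    return IsConditional ? LongCondBranchDelta : LongUncondBranchDelta;
  }
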
---
 bolt/include/bolt/Passes/CDSplit.h | 18 ++++++
 bolt/lib/Passes/CDSplit.cpp        | 90 ++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
index ef03b9f65e2b648..a777ca5d52c8d4c 100644
--- a/bolt/include/bolt/Passes/CDSplit.h
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -48,9 +48,27 @@ class CDSplit : public BinaryFunctionPass {
   // Conditional and unconditional successors of each BB.
   DenseMap<const BinaryBasicBlock *, JumpInfo> JumpInfos;
 
+  /// Sizes of branch instructions used to approximate block size increase
+  /// due to hot-warm splitting. Initialized to be 0. These values are updated
+  /// if the architecture is X86.
+  uint8_t BRANCH_SIZE = 0;
+  uint8_t LONG_UNCOND_BRANCH_SIZE_DELTA = 0;
+  uint8_t LONG_COND_BRANCH_SIZE_DELTA = 0;
+
   /// Helper functions to initialize global variables.
   void initialize(BinaryContext &BC);
 
+  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
+  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
+  /// splitting happens at \p SplitIndex. Also return estimated end addresses
+  /// of the hot fragment before and after splitting.
+  /// The estimations take into account the potential addition of branch
+  /// instructions due to split fall through branches as well as the need to
+  /// use longer branch instructions for split (un)conditional branches.
+  std::pair<size_t, size_t>
+  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
+                             const size_t SplitIndex);
+
   /// Split function body into 3 fragments: hot / warm / cold.
   void runOnFunction(BinaryFunction &BF);
 
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index 6b6f1454512aa02..7ba2e5ca73e68c5 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -149,6 +149,96 @@ void CDSplit::initialize(BinaryContext &BC) {
       }
     }
   }
+
+  // If X86, long branch instructions take more bytes than short branches.
+  // Adjust sizes of branch instructions used to approximate block size
+  // increase due to hot-warm splitting.
+  if (BC.isX86()) {
+    // a short branch takes 2 bytes.
+    BRANCH_SIZE = 2;
+    // a long uncond branch takes BRANCH_SIZE + 3 bytes.
+    LONG_UNCOND_BRANCH_SIZE_DELTA = 3;
+    // a long cond branch takes BRANCH_SIZE + 4 bytes.
+    LONG_COND_BRANCH_SIZE_DELTA = 4;
+  }
+}
+
+std::pair<size_t, size_t>
+CDSplit::estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
+                                    const size_t SplitIndex) {
+  assert(SplitIndex < BlockOrder.size() && "Invalid split index");
+  // Helper function estimating if a branch needs a longer branch instruction.
+  // The function returns true if the following two conditions are satisfied:
+  // condition 1. One of SrcBB and DstBB is in hot, the other is in warm.
+  // condition 2. The pre-split branch distance fits in a signed 8-bit offset.
+  auto needNewLongBranch = [&](const BinaryBasicBlock *SrcBB,
+                               const BinaryBasicBlock *DstBB) {
+    if (!SrcBB || !DstBB)
+      return false;
+    // The following checks for condition 1.
+    if (SrcBB->isSplit() || DstBB->isSplit())
+      return false;
+    if ((SrcBB->getLayoutIndex() <= SplitIndex) ==
+        (DstBB->getLayoutIndex() <= SplitIndex))
+      return false;
+    // The following checks for condition 2.
+    return (AbsoluteDifference(BBOffsets[DstBB],
+                               BBOffsets[SrcBB] + BBSizes[SrcBB]) <=
+            std::numeric_limits<int8_t>::max());
+  };
+
+  // Populate BB.OutputAddressRange with estimated new start and end addresses
+  // and compute the old end address of the hot section and the new end address
+  // of the hot section.
+  size_t OldHotEndAddr = 0;
+  size_t NewHotEndAddr = 0;
+  size_t CurrentAddr = BBOffsets[BlockOrder[0]];
+  for (BinaryBasicBlock *BB : BlockOrder) {
+    // We only care about new addresses of blocks in hot/warm.
+    if (BB->isSplit())
+      break;
+    size_t NewSize = BBSizes[BB];
+    // Need to add a new branch instruction if a fall-through branch is split.
+    bool NeedNewUncondBranch =
+        (JumpInfos[BB].UncondSuccessor && !JumpInfos[BB].HasUncondBranch &&
+         BB->getLayoutIndex() == SplitIndex);
+
+    NewSize += BRANCH_SIZE * NeedNewUncondBranch +
+               LONG_UNCOND_BRANCH_SIZE_DELTA *
+                   needNewLongBranch(BB, JumpInfos[BB].UncondSuccessor) +
+               LONG_COND_BRANCH_SIZE_DELTA *
+                   needNewLongBranch(BB, JumpInfos[BB].CondSuccessor);
+    BB->setOutputStartAddress(CurrentAddr);
+    CurrentAddr += NewSize;
+    BB->setOutputEndAddress(CurrentAddr);
+    // Temporarily set the start address of the warm fragment of the current
+    // function to be 0. We will update it later when we can get a better
+    // estimate.
+    if (BB->getLayoutIndex() == SplitIndex) {
+      NewHotEndAddr = CurrentAddr;
+      CurrentAddr = 0;
+    }
+    OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
+  }
+
+  // Update the start and end addresses of blocks in the warm fragment.
+  // First get a better estimate of the start address of the warm fragment.
+  assert(OrigHotSectionSize + OldHotEndAddr >= NewHotEndAddr);
+  size_t WarmSectionStartAddr =
+      OrigHotSectionSize + OldHotEndAddr - NewHotEndAddr;
+  // Do the correction.
+  for (size_t Index = SplitIndex + 1; Index < BlockOrder.size(); Index++) {
+    BinaryBasicBlock *BB = BlockOrder[Index];
+    // We only care about new addresses of blocks in warm.
+    if (BB->isSplit())
+      break;
+    size_t StartAddrOffset = BB->getOutputAddressRange().first;
+    size_t EndAddrOffset = BB->getOutputAddressRange().second;
+    BB->setOutputStartAddress(WarmSectionStartAddr + StartAddrOffset);
+    BB->setOutputEndAddress(WarmSectionStartAddr + EndAddrOffset);
+  }
+
+  return std::make_pair(OldHotEndAddr, NewHotEndAddr);
 }
 
 /// Find the best index for splitting. The returned value is the index of the

From d56e5a4153eee8ed306aebf3a931672edece7175 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Mon, 13 Nov 2023 17:39:15 -0800
Subject: [PATCH 7/9] [BOLT] CDSplit Main Logic Part 2/3

The second diff in a series of 3 that implements the main logic of
CDSplit. When the function order is [... X ... BF ... Y ...], a main
benefit of splitting the hot fragment of BF further into a hot and a
warm fragment is that function calls of the form X->Y or Y->X become
shorter (i.e., SrcBB and DstBB move closer to each other), provided the
new hot fragment of BF is smaller than the original hot fragment. This
diff implements a function that finds all such "shortenable" calls of
the form X->Y or Y->X for the given function BF.
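
A hedged sketch of the shortening guarantee described above (CallInfo is the
struct added in this diff; the helper is illustrative only and assumes BF's
hot fragment shrinks by HotSizeReduction bytes):

  // If hot-warm splitting removes HotSizeReduction bytes from BF's hot
  // fragment, every cover call spanning BF becomes that much shorter.
  size_t newCoverCallLength(const CallInfo &CI, size_t HotSizeReduction) {
    return CI.Length > HotSizeReduction ? CI.Length - HotSizeReduction : 0;
  }
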
---
 bolt/include/bolt/Passes/CDSplit.h | 11 +++++++
 bolt/lib/Passes/CDSplit.cpp        | 49 ++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
index a777ca5d52c8d4c..fb61d353a7e2dc5 100644
--- a/bolt/include/bolt/Passes/CDSplit.h
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -18,6 +18,11 @@ namespace bolt {
 
 using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
 
+struct CallInfo {
+  size_t Length;
+  size_t Count;
+};
+
 struct JumpInfo {
   bool HasUncondBranch = false;
   BinaryBasicBlock *CondSuccessor = nullptr;
@@ -58,6 +63,12 @@ class CDSplit : public BinaryFunctionPass {
   /// Helper functions to initialize global variables.
   void initialize(BinaryContext &BC);
 
+  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
+  /// when the function order is [... X ... BF ... Y ...].
+  /// If the hot fragment size of BF is reduced, such calls are guaranteed to
+  /// get shorter by the reduction in hot fragment size.
+  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF);
+
   /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
   /// start and end addresses for hot and warm basic blocks, assuming hot-warm
   /// splitting happens at \p SplitIndex. Also return estimated end addresses
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index 7ba2e5ca73e68c5..c8d8972aad1d8e8 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -29,6 +29,11 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<bool> UseCDSplit;
 extern cl::opt<bool> SplitEH;
 extern cl::opt<unsigned> ExecutionCountThreshold;
+
+static cl::opt<double> CallScale("call-scale",
+                                 cl::desc("Call score scale coefficient"),
+                                 cl::init(0.95), cl::ReallyHidden,
+                                 cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
@@ -163,6 +168,50 @@ void CDSplit::initialize(BinaryContext &BC) {
   }
 }
 
+/// Get a collection of "shortenable" calls, that is, calls of type X->Y
+/// when the function order is [... X ... BF ... Y ...].
+/// If the hot fragment size of BF is reduced, then such calls are guaranteed
+/// to get shorter by the reduced hot fragment size.
+std::vector<CallInfo> CDSplit::extractCoverCalls(const BinaryFunction &BF) {
+  // Record the length and the count of the calls that can be shortened
+  std::vector<CallInfo> CoverCalls;
+  if (opts::CallScale == 0)
+    return CoverCalls;
+
+  const BinaryFunction *ThisBF = &BF;
+  const BinaryBasicBlock *ThisBB = &(ThisBF->front());
+  size_t ThisGI = GlobalIndices[ThisBB];
+
+  for (BinaryFunction *DstBF : FunctionsToConsider) {
+    const BinaryBasicBlock *DstBB = &(DstBF->front());
+    if (DstBB->getKnownExecutionCount() == 0)
+      continue;
+
+    size_t DstGI = GlobalIndices[DstBB];
+    for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
+      const BinaryFunction *SrcBF = SrcBB->getFunction();
+      if (ThisBF == SrcBF)
+        continue;
+
+      const size_t CallCount = SrcBB->getKnownExecutionCount();
+
+      size_t SrcGI = GlobalIndices[SrcBB];
+
+      bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
+                         (DstGI <= ThisGI && ThisGI < SrcGI);
+      if (!IsCoverCall)
+        continue;
+
+      size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
+      size_t DstBBStartAddr = BBOffsets[DstBB];
+      size_t CallLength = AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
+      CallInfo CI{CallLength, CallCount};
+      CoverCalls.emplace_back(CI);
+    }
+  }
+  return CoverCalls;
+}
+
 std::pair<size_t, size_t>
 CDSplit::estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                                     const size_t SplitIndex) {

From 8cdaf0c811502bc9588ef607a4816f8230ef893e Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Mon, 13 Nov 2023 17:52:02 -0800
Subject: [PATCH 8/9] [BOLT] CDSplit Main Logic Part 3/3

The third diff in a series of 3 that implements the main logic of
CDSplit. CDSplit processes functions in a binary in parallel. For each
function BF, it assumes that all other functions are hot-cold split. For
each possible hot-warm split point of BF, it computes its corresponding
SplitScore, and chooses the split point with the best SplitScore. The
SplitScore of each split point is computed as follows: each call
edge or jump edge has an edge score that is proportional to its
execution count and decreases with its length (the distance between
source and destination). The SplitScore of a split point is the sum of
edge scores over a fixed set of edges whose length can change due to
hot-warm splitting BF. This set contains all cover calls of the form
X->Y or Y->X given the function order [... X ... BF ... Y ...]; we
refer to the sum of edge scores over the set of cover calls as
CoverCallScore. The set also contains all jump edges (branches) within
BF as well as all call edges originating from BF; we refer to the sum
of edge scores over this subset as LocalScore. CDSplit selects the
split index that maximizes CoverCallScore + LocalScore.
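
Concretely, the per-edge scores take the following shape (a minimal standalone
sketch mirroring the patch's computeCallScore and computeJumpScore; the function
names and default arguments below are illustrative, with the defaults matching
the -call-scale, -call-power and -jump-power options added in this series):

    #include <cmath>
    #include <cstdint>

    // Sketch of the edge-score model: proportional to execution count, damped
    // by a small power of (length + 1). Defaults mirror the -call-scale,
    // -call-power and -jump-power options introduced in this series.
    static double callEdgeScore(uint64_t Count, uint64_t Length,
                                double Scale = 0.95, double Power = 0.05) {
      return Scale * static_cast<double>(Count) /
             std::pow(static_cast<double>(Length + 1), Power);
    }

    static double jumpEdgeScore(uint64_t Count, uint64_t Length,
                                double Power = 0.15) {
      return static_cast<double>(Count) /
             std::pow(static_cast<double>(Length + 1), Power);
    }

CDSplit evaluates LocalScore(i) + CoverCallScore(i) for every candidate split
index i and keeps the maximizer; the search itself is implemented in
findSplitIndex below.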
---
 bolt/include/bolt/Passes/CDSplit.h |  40 ++++++
 bolt/lib/Passes/CDSplit.cpp        | 188 ++++++++++++++++++++++++++++-
 2 files changed, 226 insertions(+), 2 deletions(-)

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
index fb61d353a7e2dc5..cc6ea616aa5aa16 100644
--- a/bolt/include/bolt/Passes/CDSplit.h
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -29,6 +29,13 @@ struct JumpInfo {
   BinaryBasicBlock *UncondSuccessor = nullptr;
 };
 
+struct SplitScore {
+  size_t SplitIndex;
+  size_t HotSizeReduction = 0;
+  double LocalScore = 0;
+  double CoverCallScore = 0;
+};
+
 class CDSplit : public BinaryFunctionPass {
 
 private:
@@ -80,6 +87,39 @@ class CDSplit : public BinaryFunctionPass {
   estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                              const size_t SplitIndex);
 
+  /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
+  /// Increment Score.LocalScore in place by the sum.
+  void computeJumpScore(const BasicBlockOrder &BlockOrder,
+                        const size_t SplitIndex, SplitScore &Score);
+
+  /// Compute sum of scores over calls originating in the current function
+  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
+  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
+                             const size_t SplitIndex, SplitScore &Score);
+
+  /// Compute sum of splitting scores for cover calls of the input function.
+  /// Increment Score.CoverCallScore in place by the sum.
+  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
+                             const size_t SplitIndex,
+                             const std::vector<CallInfo> &CoverCalls,
+                             SplitScore &Score);
+
+  /// Compute the split score of splitting a function at a given index.
+  /// The split score consists of a local score and a cover call score. The
+  /// cover call score is expensive to compute. As a result, we pass in a
+  /// \p ReferenceScore and compute the cover call score only when the local
+  /// score exceeds that of the ReferenceScore, or when the size reduction of
+  /// the hot fragment is larger than that achieved by the ReferenceScore's
+  /// split index. This function returns a SplitScore containing the local
+  /// score and the cover call score (if computed) of the current split index.
+  /// For easier bookkeeping and comparison, it also stores the split index
+  /// and the resulting reduction in hot fragment size.
+  SplitScore computeSplitScore(const BinaryFunction &BF,
+                               const BasicBlockOrder &BlockOrder,
+                               const size_t SplitIndex,
+                               const std::vector<CallInfo> &CoverCalls,
+                               const SplitScore &ReferenceScore);
+
   /// Split function body into 3 fragments: hot / warm / cold.
   void runOnFunction(BinaryFunction &BF);
 
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index c8d8972aad1d8e8..2e3b24f238f4dab 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -34,6 +34,12 @@ static cl::opt<double> CallScale("call-scale",
                                  cl::desc("Call score scale coefficient"),
                                  cl::init(0.95), cl::ReallyHidden,
                                  cl::cat(BoltOptCategory));
+static cl::opt<double> CallPower("call-power", cl::desc("Call score power"),
+                                 cl::init(0.05), cl::ReallyHidden,
+                                 cl::cat(BoltOptCategory));
+static cl::opt<double> JumpPower("jump-power", cl::desc("Jump score power"),
+                                 cl::init(0.15), cl::ReallyHidden,
+                                 cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
@@ -63,6 +69,20 @@ JumpInfo analyzeBranches(BinaryBasicBlock *BB) {
   }
   return BBJumpInfo;
 }
+
+/// Compute the edge score of a call edge.
+double computeCallScore(uint64_t CallCount, size_t CallLength) {
+  // Increase call lengths by 1 to avoid raising 0 to a negative power.
+  return opts::CallScale * static_cast<double>(CallCount) /
+         std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
+}
+
+/// Compute the edge score of a jump (branch) edge.
+double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
+  // Increase jump lengths by 1 to avoid raising 0 to a negative power.
+  return static_cast<double>(JumpCount) /
+         std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
+}
 } // anonymous namespace
 
 bool CDSplit::shouldOptimize(const BinaryFunction &BF) const {
@@ -290,13 +310,177 @@ CDSplit::estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
   return std::make_pair(OldHotEndAddr, NewHotEndAddr);
 }
 
+/// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
+/// Increment Score.LocalScore in place by the sum.
+void CDSplit::computeJumpScore(const BasicBlockOrder &BlockOrder,
+                               const size_t SplitIndex, SplitScore &Score) {
+
+  for (BinaryBasicBlock *SrcBB : BlockOrder) {
+    if (SrcBB->getKnownExecutionCount() == 0)
+      continue;
+
+    size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
+
+    for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
+      const BinaryBasicBlock *DstBB = std::get<0>(Pair);
+      const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
+      const size_t JumpCount = Branch.Count;
+
+      if (JumpCount == 0)
+        continue;
+
+      size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
+      size_t NewJumpLength = AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
+      Score.LocalScore += ::computeJumpScore(JumpCount, NewJumpLength);
+    }
+  }
+}
+
+/// Compute sum of scores over calls originating in the current function
+/// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
+void CDSplit::computeLocalCallScore(const BasicBlockOrder &BlockOrder,
+                                    const size_t SplitIndex,
+                                    SplitScore &Score) {
+  if (opts::CallScale == 0)
+    return;
+
+  // Global index of the last block in the current function.
+  // This is later used to determine whether a call originating in the current
+  // function targets a function that comes after the current function.
+  size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
+
+  // The length of calls originating in the input function can increase or
+  // decrease depending on the splitting decision.
+  for (BinaryBasicBlock *SrcBB : BlockOrder) {
+    const size_t CallCount = SrcBB->getKnownExecutionCount();
+    // If SrcBB is never executed, its calls contribute nothing; skip it.
+    if (CallCount == 0)
+      continue;
+
+    // Obtain an estimate of the end address of the src basic block
+    // after splitting at SplitIndex.
+    size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
+
+    for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
+      // Obtain an estimate of the start address of the dst basic block
+      // after splitting at SplitIndex. If DstBB is in a function before
+      // the current function, then its start address remains unchanged.
+      size_t DstBBStartAddr = BBOffsets[DstBB];
+      // If DstBB is in a function after the current function, then its
+      // start address should be adjusted based on the reduction in hot size.
+      if (GlobalIndices[DstBB] > LastGlobalIndex) {
+        assert(DstBBStartAddr >= Score.HotSizeReduction);
+        DstBBStartAddr -= Score.HotSizeReduction;
+      }
+      size_t NewCallLength = AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
+      Score.LocalScore += ::computeCallScore(CallCount, NewCallLength);
+    }
+  }
+}
+
+/// Compute sum of splitting scores for cover calls of the input function.
+/// Increment Score.CoverCallScore in place by the sum.
+void CDSplit::computeCoverCallScore(const BasicBlockOrder &BlockOrder,
+                                    const size_t SplitIndex,
+                                    const std::vector<CallInfo> &CoverCalls,
+                                    SplitScore &Score) {
+  if (opts::CallScale == 0)
+    return;
+
+  for (const CallInfo CI : CoverCalls) {
+    assert(CI.Length >= Score.HotSizeReduction &&
+           "Length of cover calls must exceed reduced size of hot fragment.");
+    // Compute the new length of the call, which is shorter than the original
+    // one by the size of the split fragment minus the total size increase.
+    size_t NewCallLength = CI.Length - Score.HotSizeReduction;
+    Score.CoverCallScore += ::computeCallScore(CI.Count, NewCallLength);
+  }
+}
+
+/// Compute the split score of splitting a function at a given index.
+/// The split score consists of a local score and a cover call score. The cover
+/// call score is expensive to compute. As a result, we pass in a
+/// \p ReferenceScore and compute the cover call score only when the local
+/// score exceeds that of the ReferenceScore, or when the size reduction of the
+/// hot fragment is larger than that achieved by the ReferenceScore's split
+/// index. This function returns a SplitScore containing the local score and
+/// the cover call score (if computed) of the current split index. For easier
+/// bookkeeping and comparison, it also stores the split index and the
+/// resulting reduction in hot fragment size.
+SplitScore CDSplit::computeSplitScore(const BinaryFunction &BF,
+                                      const BasicBlockOrder &BlockOrder,
+                                      const size_t SplitIndex,
+                                      const std::vector<CallInfo> &CoverCalls,
+                                      const SplitScore &ReferenceScore) {
+  // Populate BinaryBasicBlock::OutputAddressRange with estimated
+  // new start and end addresses after hot-warm splitting at SplitIndex.
+  size_t OldHotEnd;
+  size_t NewHotEnd;
+  std::tie(OldHotEnd, NewHotEnd) =
+      estimatePostSplitBBAddress(BlockOrder, SplitIndex);
+
+  SplitScore Score;
+  Score.SplitIndex = SplitIndex;
+
+  // It's not worth splitting if OldHotEnd < NewHotEnd.
+  if (OldHotEnd < NewHotEnd)
+    return Score;
+
+  // Hot fragment size reduction due to splitting.
+  Score.HotSizeReduction = OldHotEnd - NewHotEnd;
+
+  // The first part of LocalScore is the sum over call edges originating in the
+  // input function. These edges can get shorter or longer depending on
+  // SplitIndex. Score.LocalScore is incremented in place.
+  computeLocalCallScore(BlockOrder, SplitIndex, Score);
+
+  // The second part of LocalScore is the sum over jump edges whose src and dst
+  // basic blocks are both in the current function. Score.LocalScore is
+  // incremented in place.
+  computeJumpScore(BlockOrder, SplitIndex, Score);
+
+  // There is no need to compute CoverCallScore if we have already found another
+  // split index whose LocalScore and HotSizeReduction are at least as large.
+  if (Score.LocalScore <= ReferenceScore.LocalScore &&
+      Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
+    return Score;
+
+  // Compute CoverCallScore and store in Score in place.
+  computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
+  return Score;
+}
+
 /// Find the best index for splitting. The returned value is the index of the
 /// last hot basic block. Hence, "no splitting" is equivalent to returning the
 /// value which is one less than the size of the function.
 size_t CDSplit::findSplitIndex(const BinaryFunction &BF,
                                const BasicBlockOrder &BlockOrder) {
-  // Placeholder: hot-cold splitting.
-  return BF.getLayout().getMainFragment().size() - 1;
+  // Find all function calls that can be shortened if we move blocks of the
+  // current function to warm/cold.
+  std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
+
+  // Try all possible split indices (blocks with Index <= SplitIndex are in hot)
+  // and find the one maximizing the splitting score.
+  SplitScore BestScore;
+  double BestScoreSum = -1.0;
+  SplitScore ReferenceScore;
+  for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+    const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
+    // No need to keep cold blocks in the hot section.
+    if (LastHotBB->isSplit())
+      break;
+    SplitScore Score =
+        computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
+    double ScoreSum = Score.LocalScore + Score.CoverCallScore;
+    if (ScoreSum > BestScoreSum) {
+      BestScoreSum = ScoreSum;
+      BestScore = Score;
+    }
+    if (Score.LocalScore > ReferenceScore.LocalScore)
+      ReferenceScore = Score;
+  }
+
+  return BestScore.SplitIndex;
 }
 
 /// Assign each basic block in the given function to either hot, cold,

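For intuition on the default coefficients (an illustrative back-of-the-envelope
calculation, not taken from the patch): with -call-scale=0.95 and
-call-power=0.05, a cover call executed 500 times whose length shrinks from
4096 to 1024 bytes changes its score from roughly

    0.95 * 500 / 4097^0.05 ≈ 313    to    0.95 * 500 / 1025^0.05 ≈ 336.

The small powers keep distance sensitivity mild, so execution counts dominate
the ranking of split points while shorter edges still tip otherwise-close
comparisons.
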
>From 0bf25f69ea2cb27a029aeea1845bd09b9da0081e Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Wed, 15 Nov 2023 13:13:21 -0800
Subject: [PATCH 9/9] [BOLT] Add test 1 for the CDSplit pass

Test how -call-scale controls the aggressiveness of 3-way splitting.
When -call-scale=0.0, the tested function is 2-way split. When
-call-scale=1.0, the tested function is 3-way split with 5 blocks in
warm because of the increased benefit of shortening the call edges. When
-call-scale=1000.0, the tested function is 3-way split with 7 blocks in
warm because of the strong benefit of shortening the call edges.
---
 bolt/test/X86/cdsplit-call-scale.s | 129 +++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 bolt/test/X86/cdsplit-call-scale.s

diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s
new file mode 100644
index 000000000000000..4d5adee8822ccdd
--- /dev/null
+++ b/bolt/test/X86/cdsplit-call-scale.s
@@ -0,0 +1,129 @@
+# Test how -call-scale controls the aggressiveness of 3-way splitting.
+# When -call-scale=0.0, the tested function is 2-way split.
+# When -call-scale=1.0, the tested function is 3-way split with 5 blocks in
+# warm because of the increased benefit of shortening the call edges.
+# When -call-scale=1000.0, the tested function is 3-way split with 7 blocks in
+# warm because of the strong benefit of shortening the call edges.
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-all-cold \
+# RUN:         --use-cdsplit --call-scale=0.0 --print-cdsplit --print-only=chain \
+# RUN:         --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:     2>&1 | FileCheck --check-prefix=LOWINCENTIVE %s
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-all-cold \
+# RUN:         --use-cdsplit --call-scale=1.0 --print-cdsplit --print-only=chain \
+# RUN:         --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:     2>&1 | FileCheck --check-prefix=MEDINCENTIVE %s
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-all-cold \
+# RUN:         --use-cdsplit --call-scale=1000.0 --print-cdsplit --print-only=chain \
+# RUN:         --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:     2>&1 | FileCheck --check-prefix=HIGHINCENTIVE %s
+
+# LOWINCENTIVE: Binary Function "chain" after cdsplit
+# LOWINCENTIVE: {{^\.Ltmp5}}
+# LOWINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# LOWINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# LOWINCENTIVE: {{^\.LFT1}}
+
+# MEDINCENTIVE: Binary Function "chain" after cdsplit
+# MEDINCENTIVE: {{^\.Ltmp1}}
+# MEDINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# MEDINCENTIVE: {{^\.Ltmp0}}
+# MEDINCENTIVE: {{^\.Ltmp5}}
+# MEDINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# MEDINCENTIVE: {{^\.LFT1}}
+
+# HIGHINCENTIVE: Binary Function "chain" after cdsplit
+# HIGHINCENTIVE: {{^\.LBB00}}
+# HIGHINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# HIGHINCENTIVE: {{^\.LFT0}}
+# HIGHINCENTIVE: {{^\.Ltmp5}}
+# HIGHINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# HIGHINCENTIVE: {{^\.LFT1}}
+
+
+        .text
+        .globl  chain
+        .type   chain, @function
+chain:
+        pushq   %rbp
+        movq    %rsp, %rbp
+        cmpl    $2, %edi
+LLentry_LLchain_start:
+        jge     LLchain_start
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500
+LLfast:
+        movl    $5, %eax
+LLfast_LLexit:
+        jmp     LLexit
+# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500
+LLchain_start:
+        movl    $10, %eax
+LLchain_start_LLchain1:
+        jge     LLchain1
+# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10
+# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0
+LLcold:
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+LLchain1:
+        addl    $1, %eax
+LLchain1_LLchain2:
+        jmp     LLchain2
+# FDATA: 1 chain #LLchain1_LLchain2# 1 chain #LLchain2# 0 10
+LLchain2:
+        addl    $1, %eax
+LLchain2_LLchain3:
+        jmp     LLchain3
+# FDATA: 1 chain #LLchain2_LLchain3# 1 chain #LLchain3# 0 10
+LLchain3:
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+LLchain3_LLchain4:
+        jmp     LLchain4
+# FDATA: 1 chain #LLchain3_LLchain4# 1 chain #LLchain4# 0 10
+LLchain4:
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+LLchain4_LLexit:
+        jmp     LLexit
+# FDATA: 1 chain #LLchain4_LLexit# 1 chain #LLexit# 0 10
+LLexit:
+        popq    %rbp
+        ret
+LLchain_end:
+        .size   chain, LLchain_end-chain
+
+
+        .globl  main
+        .type   main, @function
+main:
+        pushq   %rbp
+        movq    %rsp, %rbp
+        movl    $1, %edi
+LLmain_chain1:
+        call    chain
+# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500
+        movl    $4, %edi
+LLmain_chain2:
+        call    chain
+# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10
+        xorl    %eax, %eax
+        popq    %rbp
+        retq
+.Lmain_end:
+        .size   main, .Lmain_end-main


