[llvm] [BOLT] CDSplit Main Logic Part 1/3 (PR #73082)

via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 21 20:35:49 PST 2023


https://github.com/ShatianWang created https://github.com/llvm/llvm-project/pull/73082

This is the first diff in a series of 3 that implement the main logic of
CDSplit. On X86, function splitting can lead to block size increase.
This is because conditional and unconditional branch instructions whose
offset fits in a signed 8-bit field can be encoded in 2 bytes. If the
offset does not fit in 8 bits, they need 6 and 5 bytes respectively.
Splitting a short conditional / unconditional branch will thus increase
the size of the source basic block by 4 and 3 bytes respectively. CDSplit
takes into account the potential block size increase when it makes
splitting decisions. This diff implements a function
estimatePostSplitBBAddress in CDSplit that approximates the block level
size increase at the given split index of the given function.

>From ad9c91c2348ba426a830575946cd8748bbb4ef86 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Thu, 2 Nov 2023 12:26:49 -0700
Subject: [PATCH 1/6] [BOLT] Extend calculateEmittedSize for Block Size
 Calculation

This commit modifies BinaryContext::calculateEmittedSize to update the
BinaryBasicBlock::OutputAddressRange for each basic block in the input
BF. The modification is done in place, so that BB.OutputAddressRange.second
minus BB.OutputAddressRange.first now gives the emitted size of the basic
block.
---
 bolt/include/bolt/Core/BinaryContext.h |  3 +++
 bolt/lib/Core/BinaryContext.cpp        | 35 +++++++++++++++++++++-----
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index ad1bf2baaeb5b1e..17e55a673e8b489 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -1230,6 +1230,9 @@ class BinaryContext {
   ///
   /// Return the pair where the first size is for the main part, and the second
   /// size is for the cold one.
+  /// Modify BinaryBasicBlock::OutputAddressRange for each basic block in the
+  /// function in place so that BB.OutputAddressRange.second less
+  /// BB.OutputAddressRange.first gives the emitted size of BB.
   std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF,
                                                  bool FixBranches = true);
 
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 06b68765909d20e..baf86333ce53206 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -2331,14 +2331,37 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
   MCAsmLayout Layout(Assembler);
   Assembler.layout(Layout);
 
+  // Obtain fragment sizes.
+  std::vector<uint64_t> FragmentSizes(BF.getLayout().fragment_size());
+  // Main fragment size.
   const uint64_t HotSize =
       Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel);
-  const uint64_t ColdSize =
-      std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL,
-                      [&](const uint64_t Accu, const LabelRange &Labels) {
-                        return Accu + Layout.getSymbolOffset(*Labels.second) -
-                               Layout.getSymbolOffset(*Labels.first);
-                      });
+  FragmentSizes.push_back(HotSize);
+  // Split fragment sizes.
+  uint64_t ColdSize = 0;
+  for (const auto &Labels : SplitLabels) {
+    uint64_t Size = Layout.getSymbolOffset(*Labels.second) -
+                    Layout.getSymbolOffset(*Labels.first);
+    FragmentSizes.push_back(Size);
+    ColdSize += Size;
+  }
+
+  // Populate new start and end offsets of each basic block.
+  BinaryBasicBlock *PrevBB = nullptr;
+  uint64_t FragmentIndex = 0;
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    for (BinaryBasicBlock *BB : FF) {
+      const uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel()));
+      BB->setOutputStartAddress(BBStartOffset);
+      if (PrevBB)
+        PrevBB->setOutputEndAddress(BBStartOffset);
+      PrevBB = BB;
+    }
+    if (PrevBB)
+      PrevBB->setOutputEndAddress(FragmentSizes[FragmentIndex]);
+    FragmentIndex++;
+    PrevBB = nullptr;
+  }
 
   // Clean-up the effect of the code emission.
   for (const MCSymbol &Symbol : Assembler.symbols()) {

>From 273ee295a2bbc09a1500c867d1156ea4d29ead40 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Wed, 8 Nov 2023 08:20:11 -0800
Subject: [PATCH 2/6] [BOLT] Refactor SplitFunctions for Function Reuse

This commit updates SplitFunctions.h and SplitFunctions.cpp to enable
the reuse of createEHTrampolines, mergeEHTrampolines, hasFullProfile,
and allBlocksCold by a distinct function splitting pass (CDSplit).
---
 bolt/include/bolt/Core/BinaryFunction.h   | 14 ++++++++++
 bolt/include/bolt/Passes/SplitFunctions.h | 32 +++++++++++------------
 bolt/lib/Passes/SplitFunctions.cpp        | 18 +++----------
 3 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 72c360ca0c2db66..3723cccc50f040c 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -1272,6 +1272,20 @@ class BinaryFunction {
   /// otherwise processed.
   bool isPseudo() const { return IsPseudo; }
 
+  /// Return true if every block in the function has a valid execution count.
+  bool hasFullProfile() const {
+    return llvm::all_of(blocks(), [](const BinaryBasicBlock &BB) {
+      return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
+    });
+  }
+
+  /// Return true if every block in the function has a zero execution count.
+  bool allBlocksCold() const {
+    return llvm::all_of(blocks(), [](const BinaryBasicBlock &BB) {
+      return BB.getExecutionCount() == 0;
+    });
+  }
+
   /// Return true if the function contains explicit or implicit indirect branch
   /// to its split fragments, e.g., split jump table, landing pad in split
   /// fragment.
diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h
index 4058f3317dfbdbb..91b6d5518eaab26 100644
--- a/bolt/include/bolt/Passes/SplitFunctions.h
+++ b/bolt/include/bolt/Passes/SplitFunctions.h
@@ -50,6 +50,19 @@ class SplitFunctions : public BinaryFunctionPass {
   /// Split function body into fragments.
   void splitFunction(BinaryFunction &Function, SplitStrategy &Strategy);
 
+  std::atomic<uint64_t> SplitBytesHot{0ull};
+  std::atomic<uint64_t> SplitBytesCold{0ull};
+
+public:
+  explicit SplitFunctions(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  bool shouldOptimize(const BinaryFunction &BF) const override;
+
+  const char *getName() const override { return "split-functions"; }
+
+  void runOnFunctions(BinaryContext &BC) override;
+
   struct TrampolineKey {
     FragmentNum SourceFN = FragmentNum::main();
     const MCSymbol *Target = nullptr;
@@ -81,27 +94,14 @@ class SplitFunctions : public BinaryFunctionPass {
   /// corresponding thrower block. The trampoline landing pad, when created,
   /// will redirect the execution to the real landing pad in a different
   /// fragment.
-  TrampolineSetType createEHTrampolines(BinaryFunction &Function) const;
+  static TrampolineSetType createEHTrampolines(BinaryFunction &Function);
 
   /// Merge trampolines into \p Layout without trampolines. The merge will place
   /// a trampoline immediately before its destination. Used to revert the effect
   /// of trampolines after createEHTrampolines().
-  BasicBlockOrderType
+  static BasicBlockOrderType
   mergeEHTrampolines(BinaryFunction &BF, BasicBlockOrderType &Layout,
-                     const TrampolineSetType &Trampolines) const;
-
-  std::atomic<uint64_t> SplitBytesHot{0ull};
-  std::atomic<uint64_t> SplitBytesCold{0ull};
-
-public:
-  explicit SplitFunctions(const cl::opt<bool> &PrintPass)
-      : BinaryFunctionPass(PrintPass) {}
-
-  bool shouldOptimize(const BinaryFunction &BF) const override;
-
-  const char *getName() const override { return "split-functions"; }
-
-  void runOnFunctions(BinaryContext &BC) override;
+                     const TrampolineSetType &Trampolines);
 };
 
 } // namespace bolt
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index 34973cecdf49161..223f8d17367845d 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -109,21 +109,9 @@ static cl::opt<SplitFunctionsStrategy> SplitStrategy(
 } // namespace opts
 
 namespace {
-bool hasFullProfile(const BinaryFunction &BF) {
-  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
-    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
-  });
-}
-
-bool allBlocksCold(const BinaryFunction &BF) {
-  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
-    return BB.getExecutionCount() == 0;
-  });
-}
-
 struct SplitProfile2 final : public SplitStrategy {
   bool canSplit(const BinaryFunction &BF) override {
-    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
+    return BF.hasValidProfile() && BF.hasFullProfile() && !BF.allBlocksCold();
   }
 
   bool keepEmpty() override { return false; }
@@ -434,7 +422,7 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
 }
 
 SplitFunctions::TrampolineSetType
-SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
+SplitFunctions::createEHTrampolines(BinaryFunction &BF) {
   const auto &MIB = BF.getBinaryContext().MIB;
 
   // Map real landing pads to the corresponding trampolines.
@@ -501,7 +489,7 @@ SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
 
 SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
     BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
-    const SplitFunctions::TrampolineSetType &Trampolines) const {
+    const SplitFunctions::TrampolineSetType &Trampolines) {
   DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
       IncomingTrampolines;
   for (const auto &Entry : Trampolines) {

>From 2215673c1d18aca198c095f6acbc251f1af0bd97 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Thu, 2 Nov 2023 20:51:52 -0700
Subject: [PATCH 3/6] [BOLT] Setup CDSplit Pass Structure

This commit establishes the general structure of the CDSplit
implementation without incorporating the exact splitting logic.
Currently, all functions undergo hot-cold splitting based on the
decisions made by the SplitFunctions pass. Subsequent commits
will introduce the precise splitting logic.
---
 bolt/include/bolt/Passes/CDSplit.h     |  63 ++++++++
 bolt/lib/Passes/CDSplit.cpp            | 208 +++++++++++++++++++++++++
 bolt/lib/Passes/CMakeLists.txt         |   1 +
 bolt/lib/Passes/SplitFunctions.cpp     |  12 ++
 bolt/lib/Rewrite/BinaryPassManager.cpp |  10 ++
 bolt/lib/Utils/CommandLineOpts.cpp     |   6 +
 6 files changed, 300 insertions(+)
 create mode 100644 bolt/include/bolt/Passes/CDSplit.h
 create mode 100644 bolt/lib/Passes/CDSplit.cpp

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
new file mode 100644
index 000000000000000..96a982683a7ec26
--- /dev/null
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -0,0 +1,63 @@
+//===- bolt/Passes/CDSplit.h - Split functions into hot/warm/cold
+// after function reordering pass -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_PASSES_CDSPLIT
+#define BOLT_PASSES_CDSPLIT
+
+#include "bolt/Passes/SplitFunctions.h"
+#include <atomic>
+
+namespace llvm {
+namespace bolt {
+
+using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
+
+class CDSplit : public BinaryFunctionPass {
+private:
+  /// Overall stats.
+  std::atomic<uint64_t> SplitBytesHot{0ull};
+  std::atomic<uint64_t> SplitBytesCold{0ull};
+
+  /// List of functions to be considered.
+  /// All functions in the list are used to construct a call graph.
+  /// A subset of functions in this list are considered for splitting.
+  std::vector<BinaryFunction *> FunctionsToConsider;
+
+  /// Helper functions to initialize global variables.
+  void initialize(BinaryContext &BC);
+
+  /// Split function body into 3 fragments: hot / warm / cold.
+  void runOnFunction(BinaryFunction &BF);
+
+  /// Assign each basic block in the given function to either hot, cold,
+  /// or warm fragment using the CDSplit algorithm.
+  void assignFragmentThreeWay(const BinaryFunction &BF,
+                              const BasicBlockOrder &BlockOrder);
+
+  /// Find the best split index that separates hot from warm.
+  /// The basic block whose index equals the returned split index will be the
+  /// last hot block.
+  size_t findSplitIndex(const BinaryFunction &BF,
+                        const BasicBlockOrder &BlockOrder);
+
+public:
+  explicit CDSplit(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  bool shouldOptimize(const BinaryFunction &BF) const override;
+
+  const char *getName() const override { return "cdsplit"; }
+
+  void runOnFunctions(BinaryContext &BC) override;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
new file mode 100644
index 000000000000000..cd67b24241a4249
--- /dev/null
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -0,0 +1,208 @@
+//===- bolt/Passes/CDSplit.cpp - Pass for splitting function code 3-way
+//--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CDSplit pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/CDSplit.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "bolt/Utils/CommandLineOpts.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/MathExtras.h"
+
+#define DEBUG_TYPE "bolt-opts"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::OptionCategory BoltOptCategory;
+
+extern cl::opt<bool> UseCDSplit;
+extern cl::opt<bool> SplitEH;
+extern cl::opt<unsigned> ExecutionCountThreshold;
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+/// Return true if the function should be considered for building call graph.
+bool shouldConsider(const BinaryFunction &BF) {
+  return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
+}
+} // anonymous namespace
+
+bool CDSplit::shouldOptimize(const BinaryFunction &BF) const {
+  // Do not split functions with a small execution count.
+  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
+    return false;
+
+  // Do not split functions with at least one block that has no known
+  // execution count due to incomplete information.
+  // Do not split functions with only zero-execution count blocks
+  // as there is not enough variation in block count to justify splitting.
+  if (!BF.hasFullProfile() || BF.allBlocksCold())
+    return false;
+
+  return BinaryFunctionPass::shouldOptimize(BF);
+}
+
+/// Initialize algorithm's metadata.
+void CDSplit::initialize(BinaryContext &BC) {
+  // Construct a list of functions that are considered for building call graph.
+  // Only those in this list that evaluates true for shouldOptimize are
+  // candidates for 3-way splitting.
+  std::vector<BinaryFunction *> SortedFunctions = BC.getSortedFunctions();
+  FunctionsToConsider.reserve(SortedFunctions.size());
+  for (BinaryFunction *BF : SortedFunctions) {
+    if (shouldConsider(*BF))
+      FunctionsToConsider.push_back(BF);
+  }
+}
+
+/// Find the best index for splitting. The returned value is the index of the
+/// last hot basic block. Hence, "no splitting" is equivalent to returning the
+/// value which is one less than the size of the function.
+size_t CDSplit::findSplitIndex(const BinaryFunction &BF,
+                               const BasicBlockOrder &BlockOrder) {
+  // Placeholder: hot-cold splitting.
+  return BF.getLayout().getMainFragment().size() - 1;
+}
+
+/// Assign each basic block in the given function to either hot, cold,
+/// or warm fragment using the CDSplit algorithm.
+void CDSplit::assignFragmentThreeWay(const BinaryFunction &BF,
+                                     const BasicBlockOrder &BlockOrder) {
+  size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
+
+  // Assign fragments based on the computed best split index.
+  // All basic blocks with index up to the best split index become hot.
+  // All remaining blocks are warm / cold depending on if count is
+  // greater than 0 or not.
+  FragmentNum Main(0);
+  FragmentNum Warm(1);
+  FragmentNum Cold(2);
+  for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+    BinaryBasicBlock *BB = BlockOrder[Index];
+    if (Index <= BestSplitIndex)
+      BB->setFragmentNum(Main);
+    else
+      BB->setFragmentNum(BB->getKnownExecutionCount() > 0 ? Warm : Cold);
+  }
+}
+
+void CDSplit::runOnFunction(BinaryFunction &BF) {
+  assert(!BF.empty() && "splitting an empty function");
+
+  FunctionLayout &Layout = BF.getLayout();
+  BinaryContext &BC = BF.getBinaryContext();
+
+  BasicBlockOrder NewLayout(Layout.block_begin(), Layout.block_end());
+  // Never outline the first basic block.
+  NewLayout.front()->setCanOutline(false);
+  for (BinaryBasicBlock *BB : NewLayout) {
+    if (!BB->canOutline())
+      continue;
+
+    // Do not split extra entry points in aarch64. They can be referred by
+    // using ADRs and when this happens, these blocks cannot be placed far
+    // away due to the limited range in ADR instruction.
+    if (BC.isAArch64() && BB->isEntryPoint()) {
+      BB->setCanOutline(false);
+      continue;
+    }
+
+    if (BF.hasEHRanges() && !opts::SplitEH) {
+      // We cannot move landing pads (or rather entry points for landing pads).
+      if (BB->isLandingPad()) {
+        BB->setCanOutline(false);
+        continue;
+      }
+      // We cannot move a block that can throw since exception-handling
+      // runtime cannot deal with split functions. However, if we can guarantee
+      // that the block never throws, it is safe to move the block to
+      // decrease the size of the function.
+      for (MCInst &Instr : *BB) {
+        if (BC.MIB->isInvoke(Instr)) {
+          BB->setCanOutline(false);
+          break;
+        }
+      }
+    }
+  }
+
+  // Assign each basic block in NewLayout to either hot, warm, or cold fragment.
+  assignFragmentThreeWay(BF, NewLayout);
+
+  // Make sure all non-outlineable blocks are in the main-fragment.
+  for (BinaryBasicBlock *BB : NewLayout) {
+    if (!BB->canOutline())
+      BB->setFragmentNum(FragmentNum::main());
+  }
+
+  // In case any non-outlineable blocks previously in warm or cold is now set
+  // to be in main by the preceding for loop, move them to the end of main.
+  llvm::stable_sort(NewLayout,
+                    [&](const BinaryBasicBlock *L, const BinaryBasicBlock *R) {
+                      return L->getFragmentNum() < R->getFragmentNum();
+                    });
+
+  BF.getLayout().update(NewLayout);
+
+  // For shared objects, invoke instructions and corresponding landing pads
+  // have to be placed in the same fragment. When we split them, create
+  // trampoline landing pads that will redirect the execution to real LPs.
+  SplitFunctions::TrampolineSetType Trampolines;
+  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
+    Trampolines = SplitFunctions::createEHTrampolines(BF);
+
+  if (BC.isX86() && BF.isSplit()) {
+    size_t HotSize;
+    size_t ColdSize;
+    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
+    SplitBytesHot += HotSize;
+    SplitBytesCold += ColdSize;
+  }
+}
+
+void CDSplit::runOnFunctions(BinaryContext &BC) {
+  if (!opts::UseCDSplit)
+    return;
+
+  // Initialize global variables.
+  initialize(BC);
+
+  // Only functions satisfying shouldConsider and shouldOptimize are candidates
+  // for splitting.
+  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+    return !(shouldConsider(BF) && shouldOptimize(BF));
+  };
+
+  // Make function splitting decisions in parallel.
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
+      [&](BinaryFunction &BF) { runOnFunction(BF); }, SkipFunc, "CDSplit",
+      /*ForceSequential=*/false);
+
+  if (SplitBytesHot + SplitBytesCold > 0) {
+    outs() << "BOLT-INFO: cdsplit separates " << SplitBytesHot
+           << " hot bytes from " << SplitBytesCold << " cold bytes "
+           << format("(%.2lf%% of split functions is in the main fragment)\n",
+                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
+
+  } else
+    outs() << "BOLT-INFO: cdsplit didn't split any functions\n";
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index b8bbe59a64480c9..4cc4b4fa6ae345c 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMBOLTPasses
   CacheMetrics.cpp
   CallGraph.cpp
   CallGraphWalker.cpp
+  CDSplit.cpp
   DataflowAnalysis.cpp
   DataflowInfoManager.cpp
   FrameAnalysis.cpp
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index 223f8d17367845d..0c11d0fb49cd09c 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -60,6 +60,7 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<bool> SplitEH;
 extern cl::opt<unsigned> ExecutionCountThreshold;
 extern cl::opt<uint32_t> RandomSeed;
+extern cl::opt<bool> UseCDSplit;
 
 static cl::opt<bool> AggressiveSplitting(
     "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
@@ -231,6 +232,17 @@ bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
 }
 
 void SplitFunctions::runOnFunctions(BinaryContext &BC) {
+  if (opts::UseCDSplit &&
+      !(opts::SplitFunctions &&
+        opts::SplitStrategy == SplitFunctionsStrategy::Profile2)) {
+    errs() << "BOLT-ERROR: -use-cdsplit should be applied together with "
+              "-split-functions using default -split-strategy=profile2. "
+              "-split-functions 2-way splits functions before the function "
+              "reordering pass, while -use-cdsplit 3-way splits functions "
+              "after the function reordering pass. \n";
+    exit(1);
+  }
+
   if (!opts::SplitFunctions)
     return;
 
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 37de3eabc6d235d..28983de11c3ae07 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -11,6 +11,7 @@
 #include "bolt/Passes/Aligner.h"
 #include "bolt/Passes/AllocCombiner.h"
 #include "bolt/Passes/AsmDump.h"
+#include "bolt/Passes/CDSplit.h"
 #include "bolt/Passes/CMOVConversion.h"
 #include "bolt/Passes/FixRISCVCallsPass.h"
 #include "bolt/Passes/FixRelaxationPass.h"
@@ -182,6 +183,10 @@ static cl::opt<bool>
     PrintSplit("print-split", cl::desc("print functions after code splitting"),
                cl::Hidden, cl::cat(BoltOptCategory));
 
+static cl::opt<bool> PrintCDSplit("print-cdsplit",
+                                  cl::desc("print functions after cdsplit"),
+                                  cl::Hidden, cl::cat(BoltOptCategory));
+
 static cl::opt<bool>
     PrintStoke("print-stoke", cl::desc("print functions after stoke analysis"),
                cl::Hidden, cl::cat(BoltOptCategory));
@@ -430,6 +435,11 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   Manager.registerPass(
       std::make_unique<ReorderFunctions>(PrintReorderedFunctions));
 
+  /// This pass three-way splits functions after function reordering.
+  Manager.registerPass(std::make_unique<CDSplit>(PrintCDSplit));
+
+  Manager.registerPass(std::make_unique<FixupBranches>(PrintAfterBranchFixup));
+
   // Print final dyno stats right while CFG and instruction analysis are intact.
   Manager.registerPass(
       std::make_unique<DynoStatsPrintPass>(
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index a1df5de26234029..75d63e369c731e4 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -191,6 +191,12 @@ cl::opt<unsigned>
               cl::init(0), cl::ZeroOrMore, cl::cat(BoltCategory),
               cl::sub(cl::SubCommand::getAll()));
 
+cl::opt<bool>
+    UseCDSplit("use-cdsplit",
+               cl::desc("split functions into 3 fragments using the CDSplit "
+                        "algorithm after function reordering pass"),
+               cl::init(false), cl::cat(BoltOptCategory));
+
 bool processAllFunctions() {
   if (opts::AggregateOnly)
     return false;

>From af930a0da876b280c969bb9ea512a256c4439d4e Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Tue, 31 Oct 2023 14:57:00 -0700
Subject: [PATCH 4/6] [BOLT] Introduce .text.warm for -use-cdsplit=1

This commit explicitly adds a warm code section, .text.warm, when the
-use-cdsplit=1 flag is set. This replaces the previous approach of using
.text.cold.0 as warm and .text.cold.1 as cold in 3-way splitting.
---
 bolt/include/bolt/Core/BinaryBasicBlock.h |  4 +--
 bolt/include/bolt/Core/BinaryContext.h    |  2 ++
 bolt/include/bolt/Core/BinaryFunction.h   | 16 ++++++----
 bolt/include/bolt/Core/FunctionLayout.h   |  5 ++-
 bolt/lib/Core/BinaryBasicBlock.cpp        | 19 +++++++----
 bolt/lib/Core/BinaryEmitter.cpp           |  6 +++-
 bolt/lib/Core/BinaryFunction.cpp          | 13 ++++++++
 bolt/lib/Passes/BinaryPasses.cpp          |  4 ++-
 bolt/lib/Passes/CDSplit.cpp               |  1 -
 bolt/lib/Passes/IndirectCallPromotion.cpp |  6 ++--
 bolt/lib/Passes/SplitFunctions.cpp        |  6 ++--
 bolt/lib/Rewrite/RewriteInstance.cpp      | 39 ++++++++++++++++-------
 12 files changed, 84 insertions(+), 37 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index bc95e2c4de3a11e..32e48b964d73933 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -677,9 +677,7 @@ class BinaryBasicBlock {
     return isSplit();
   }
 
-  void setIsCold(const bool Flag) {
-    Fragment = Flag ? FragmentNum::cold() : FragmentNum::main();
-  }
+  void setIsCold(const bool Flag);
 
   /// Return true if the block can be outlined. At the moment we disallow
   /// outlining of blocks that can potentially throw exceptions or are
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 17e55a673e8b489..80cbcc37d2a2184 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -927,6 +927,8 @@ class BinaryContext {
 
   const char *getMainCodeSectionName() const { return ".text"; }
 
+  const char *getWarmCodeSectionName() const { return ".text.warm"; }
+
   const char *getColdCodeSectionName() const { return ".text.cold"; }
 
   const char *getHotTextMoverSectionName() const { return ".text.mover"; }
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 3723cccc50f040c..31677fabae1d93a 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -355,6 +355,9 @@ class BinaryFunction {
   /// Name for the section this function code should reside in.
   std::string CodeSectionName;
 
+  /// Name for the corresponding warm code section.
+  std::string WarmCodeSectionName;
+
   /// Name for the corresponding cold code section.
   std::string ColdCodeSectionName;
 
@@ -1231,13 +1234,7 @@ class BinaryFunction {
 
   /// Return internal section name for this function.
   SmallString<32>
-  getCodeSectionName(const FragmentNum Fragment = FragmentNum::main()) const {
-    if (Fragment == FragmentNum::main())
-      return SmallString<32>(CodeSectionName);
-    if (Fragment == FragmentNum::cold())
-      return SmallString<32>(ColdCodeSectionName);
-    return formatv("{0}.{1}", ColdCodeSectionName, Fragment.get() - 1);
-  }
+  getCodeSectionName(const FragmentNum Fragment = FragmentNum::main()) const;
 
   /// Assign a code section name to the function.
   void setCodeSectionName(const StringRef Name) {
@@ -1250,6 +1247,11 @@ class BinaryFunction {
     return BC.getUniqueSectionByName(getCodeSectionName(Fragment));
   }
 
+  /// Assign a section name for the warm part of the function.
+  void setWarmCodeSectionName(const StringRef Name) {
+    WarmCodeSectionName = Name.str();
+  }
+
   /// Assign a section name for the cold part of the function.
   void setColdCodeSectionName(const StringRef Name) {
     ColdCodeSectionName = Name.str();
diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h
index 904da3a4a93aade..94d71a84aae8334 100644
--- a/bolt/include/bolt/Core/FunctionLayout.h
+++ b/bolt/include/bolt/Core/FunctionLayout.h
@@ -62,7 +62,10 @@ class FragmentNum {
   }
 
   static constexpr FragmentNum main() { return FragmentNum(0); }
-  static constexpr FragmentNum cold() { return FragmentNum(1); }
+  static constexpr FragmentNum warm() { return FragmentNum(1); }
+  static constexpr FragmentNum cold(bool Flag = false) {
+    return FragmentNum(Flag ? 2 : 1);
+  }
 };
 
 /// A freestanding subset of contiguous blocks of a function.
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index 984bc6dbd220ab0..40a655c0cd24b3a 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -15,17 +15,25 @@
 #include "bolt/Core/BinaryFunction.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Errc.h"
 
 #define DEBUG_TYPE "bolt"
 
-namespace llvm {
-namespace bolt {
+using namespace llvm;
+using namespace bolt;
+namespace opts {
+extern cl::opt<bool> UseCDSplit;
+}
 
 constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET;
 
-bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) {
-  return LHS.Index < RHS.Index;
+bool bolt::operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) {
+  return LHS.getIndex() < RHS.getIndex();
+}
+
+void BinaryBasicBlock::setIsCold(const bool Flag) {
+  Fragment = Flag ? FragmentNum::cold(opts::UseCDSplit) : FragmentNum::main();
 }
 
 bool BinaryBasicBlock::hasCFG() const { return getParent()->hasCFG(); }
@@ -611,6 +619,3 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) {
 
   return NewBlock;
 }
-
-} // namespace bolt
-} // namespace llvm
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index fb1bf530c1974aa..9829c6bc107f03a 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -34,6 +34,7 @@ namespace opts {
 
 extern cl::opt<JumpTableSupportLevel> JumpTables;
 extern cl::opt<bool> PreserveBlocksAlignment;
+extern cl::opt<bool> UseCDSplit;
 
 cl::opt<bool> AlignBlocks("align-blocks", cl::desc("align basic blocks"),
                           cl::cat(BoltOptCategory));
@@ -287,7 +288,10 @@ void BinaryEmitter::emitFunctions() {
 
   // Mark the end of hot text.
   if (opts::HotText) {
-    Streamer.switchSection(BC.getTextSection());
+    if (opts::UseCDSplit)
+      Streamer.switchSection(BC.getCodeSection(BC.getWarmCodeSectionName()));
+    else
+      Streamer.switchSection(BC.getTextSection());
     Streamer.emitLabel(BC.getHotTextEndSymbol());
   }
 }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index e81d58ef0b1047b..0b89be26def39df 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -59,6 +59,7 @@ extern cl::opt<bool> EnableBAT;
 extern cl::opt<bool> Instrument;
 extern cl::opt<bool> StrictMode;
 extern cl::opt<bool> UpdateDebugSections;
+extern cl::opt<bool> UseCDSplit;
 extern cl::opt<unsigned> Verbosity;
 
 extern bool processAllFunctions();
@@ -167,6 +168,18 @@ template <typename R> static bool emptyRange(const R &Range) {
   return Range.begin() == Range.end();
 }
 
+/// Return the internal code section name for \p Fragment of this function.
+SmallString<32>
+BinaryFunction::getCodeSectionName(const FragmentNum Fragment) const {
+  if (Fragment == FragmentNum::main())
+    return SmallString<32>(CodeSectionName);
+  if (Fragment == FragmentNum::cold(opts::UseCDSplit))
+    return SmallString<32>(ColdCodeSectionName);
+  if (Fragment == FragmentNum::warm()) // Fragment 1, same as cold(false).
+    return SmallString<32>(WarmCodeSectionName);
+  return formatv("{0}.{1}", ColdCodeSectionName, Fragment.get() - 1);
+}
+
 /// Gets debug line information for the instruction located at the given
 /// address in the original binary. The SMLoc's pointer is used
 /// to point to this information, which is represented by a
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 4e1343e2c30be56..c60a03d3d984720 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1244,8 +1244,10 @@ void AssignSections::runOnFunctions(BinaryContext &BC) {
     else
       Function.setCodeSectionName(BC.getColdCodeSectionName());
 
-    if (Function.isSplit())
+    if (Function.isSplit()) {
+      Function.setWarmCodeSectionName(BC.getWarmCodeSectionName());
       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
+    }
   }
 }
 
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index cd67b24241a4249..509d250c3419183 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -24,7 +24,6 @@ using namespace llvm;
 using namespace bolt;
 
 namespace opts {
-
 extern cl::OptionCategory BoltOptCategory;
 
 extern cl::opt<bool> UseCDSplit;
diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp
index 8b084c3b63d565a..a46df74d7fd7bbc 100644
--- a/bolt/lib/Passes/IndirectCallPromotion.cpp
+++ b/bolt/lib/Passes/IndirectCallPromotion.cpp
@@ -34,6 +34,7 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<IndirectCallPromotionType> ICP;
 extern cl::opt<unsigned> Verbosity;
 extern cl::opt<unsigned> ExecutionCountThreshold;
+extern cl::opt<bool> UseCDSplit;
 
 static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
     "icp-jt-remaining-percent-threshold",
@@ -259,9 +260,10 @@ IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
       MCSymbol *Entry = JT->Entries[I];
       const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry);
       assert(ToBB || Entry == BF.getFunctionEndLabel() ||
-             Entry == BF.getFunctionEndLabel(FragmentNum::cold()));
+             Entry ==
+                 BF.getFunctionEndLabel(FragmentNum::cold(opts::UseCDSplit)));
       if (Entry == BF.getFunctionEndLabel() ||
-          Entry == BF.getFunctionEndLabel(FragmentNum::cold()))
+          Entry == BF.getFunctionEndLabel(FragmentNum::cold(opts::UseCDSplit)))
         continue;
       const Location To(Entry);
       const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB);
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index 0c11d0fb49cd09c..a5eb19724c31458 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -115,12 +115,12 @@ struct SplitProfile2 final : public SplitStrategy {
     return BF.hasValidProfile() && BF.hasFullProfile() && !BF.allBlocksCold();
   }
 
-  bool keepEmpty() override { return false; }
+  bool keepEmpty() override { return opts::UseCDSplit; }
 
   void fragment(const BlockIt Start, const BlockIt End) override {
     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
       if (BB->getExecutionCount() == 0)
-        BB->setFragmentNum(FragmentNum::cold());
+        BB->setFragmentNum(FragmentNum::cold(opts::UseCDSplit));
     }
   }
 };
@@ -144,7 +144,7 @@ struct SplitRandom2 final : public SplitStrategy {
     std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
     const DiffT SplitPoint = Dist(Gen);
     for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
-      BB->setFragmentNum(FragmentNum::cold());
+      BB->setFragmentNum(FragmentNum::cold(opts::UseCDSplit));
 
     LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                  "{1} possible) blocks to split\n",
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 81c9cbff726bb9a..6d0b3cd5d15322b 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -84,6 +84,7 @@ extern cl::opt<JumpTableSupportLevel> JumpTables;
 extern cl::list<std::string> ReorderData;
 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
 extern cl::opt<bool> TimeBuild;
+extern cl::opt<bool> UseCDSplit;
 
 cl::opt<bool> AllowStripped("allow-stripped",
                             cl::desc("allow processing of stripped binaries"),
@@ -3478,11 +3479,21 @@ std::vector<BinarySection *> RewriteInstance::getCodeSections() {
     if (B->getName() == BC->getHotTextMoverSectionName())
       return false;
 
-    // Depending on the option, put main text at the beginning or at the end.
-    if (opts::HotFunctionsAtEnd)
-      return B->getName() == BC->getMainCodeSectionName();
-    else
-      return A->getName() == BC->getMainCodeSectionName();
+    // Depending on opts::HotFunctionsAtEnd, place main and warm sections in
+    // order.
+    if (opts::HotFunctionsAtEnd) {
+      if (B->getName() == BC->getMainCodeSectionName())
+        return true;
+      if (A->getName() == BC->getMainCodeSectionName())
+        return false;
+      return (B->getName() == BC->getWarmCodeSectionName());
+    } else {
+      if (A->getName() == BC->getMainCodeSectionName())
+        return true;
+      if (B->getName() == BC->getMainCodeSectionName())
+        return false;
+      return (A->getName() == BC->getWarmCodeSectionName());
+    }
   };
 
   // Determine the order of sections.
@@ -3639,7 +3650,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
            "non-relocation mode.");
 
     FunctionFragment &FF =
-        Function.getLayout().getFragment(FragmentNum::cold());
+        Function.getLayout().getFragment(FragmentNum::cold(opts::UseCDSplit));
     ErrorOr<BinarySection &> ColdSection =
         Function.getCodeSection(FF.getFragmentNum());
     assert(ColdSection && "cannot find section for cold part");
@@ -4423,9 +4434,15 @@ void RewriteInstance::updateELFSymbolTable(
            Function.getLayout().getSplitFragments()) {
         if (FF.getAddress()) {
           ELFSymTy NewColdSym = FunctionSymbol;
-          const SmallString<256> SymbolName = formatv(
-              "{0}.cold.{1}", cantFail(FunctionSymbol.getName(StringSection)),
-              FF.getFragmentNum().get() - 1);
+          SmallString<256> SymbolName;
+          if (opts::UseCDSplit)
+            SymbolName = formatv(
+                "{0}.{1}", cantFail(FunctionSymbol.getName(StringSection)),
+                FF.getFragmentNum().get() == 1 ? "warm" : "cold");
+          else
+            SymbolName = formatv(
+                "{0}.cold.{1}", cantFail(FunctionSymbol.getName(StringSection)),
+                FF.getFragmentNum().get() - 1);
           NewColdSym.st_name = AddToStrTab(SymbolName);
           NewColdSym.st_shndx =
               Function.getCodeSection(FF.getFragmentNum())->getIndex();
@@ -4684,8 +4701,8 @@ void RewriteInstance::updateELFSymbolTable(
       SmallVector<char, 256> Buf;
       NewColdSym.st_name = AddToStrTab(
           Twine(Function->getPrintName()).concat(".cold.0").toStringRef(Buf));
-      const FunctionFragment &ColdFF =
-          Function->getLayout().getFragment(FragmentNum::cold());
+      const FunctionFragment &ColdFF = Function->getLayout().getFragment(
+          FragmentNum::cold(opts::UseCDSplit));
       NewColdSym.st_value = ColdFF.getAddress();
       NewColdSym.st_size = ColdFF.getImageSize();
       Symbols.emplace_back(NewColdSym);

>From 9bae1c55d905e115ca9be1a49916a8db6abe6ebe Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Tue, 7 Nov 2023 06:36:42 -0800
Subject: [PATCH 5/6] [BOLT] Initialize Auxiliary Variables

This diff defines and initializes auxiliary variables used by CDSplit.
---
 bolt/include/bolt/Passes/CDSplit.h | 19 +++++++
 bolt/lib/Passes/CDSplit.cpp        | 82 ++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
index 96a982683a7ec26..a8d162d22b56469 100644
--- a/bolt/include/bolt/Passes/CDSplit.h
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -18,7 +18,14 @@ namespace bolt {
 
 using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
 
+struct JumpInfo {
+  bool HasUncondBranch = false;                // Uncond. branch inst present.
+  BinaryBasicBlock *CondSuccessor = nullptr;   // Set for 2-successor blocks.
+  BinaryBasicBlock *UncondSuccessor = nullptr; // Sole or other successor.
+};
+
 class CDSplit : public BinaryFunctionPass {
+
 private:
   /// Overall stats.
   std::atomic<uint64_t> SplitBytesHot{0ull};
@@ -29,6 +36,18 @@ class CDSplit : public BinaryFunctionPass {
   /// A subset of functions in this list are considered for splitting.
   std::vector<BinaryFunction *> FunctionsToConsider;
 
+  /// Auxiliary variables used by the algorithm.
+  size_t TotalNumBlocks{0};
+  size_t OrigHotSectionSize{0};
+  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
+  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
+  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
+  // Call graph.
+  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
+  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;
+  // Conditional and unconditional successors of each BB.
+  DenseMap<const BinaryBasicBlock *, JumpInfo> JumpInfos;
+
   /// Helper functions to initialize global variables.
   void initialize(BinaryContext &BC);
 
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index 509d250c3419183..6b6f1454512aa02 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -39,6 +39,25 @@ namespace {
 bool shouldConsider(const BinaryFunction &BF) {
   return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
 }
+
+/// Collect the branch-related successor info of \p BB into a JumpInfo.
+JumpInfo analyzeBranches(BinaryBasicBlock *BB) {
+  const MCSymbol *TBB = nullptr;
+  const MCSymbol *FBB = nullptr;
+  MCInst *CondBranch = nullptr;
+  MCInst *UncondBranch = nullptr;
+  JumpInfo Info;
+  if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch))
+    return Info;
+  Info.HasUncondBranch = UncondBranch != nullptr;
+  if (BB->succ_size() == 1) {
+    Info.UncondSuccessor = BB->getSuccessor();
+  } else if (BB->succ_size() == 2) {
+    Info.CondSuccessor = BB->getConditionalSuccessor(true);
+    Info.UncondSuccessor = BB->getConditionalSuccessor(false);
+  }
+  return Info;
+}
 } // anonymous namespace
 
 bool CDSplit::shouldOptimize(const BinaryFunction &BF) const {
@@ -67,6 +86,69 @@ void CDSplit::initialize(BinaryContext &BC) {
     if (shouldConsider(*BF))
       FunctionsToConsider.push_back(BF);
   }
+
+  // Initialize auxiliary variables.
+  for (BinaryFunction *BF : FunctionsToConsider) {
+    // Calculate the size of each BB after hot-cold splitting.
+    // This populates BinaryBasicBlock::OutputAddressRange which
+    // can be used to compute the size of each BB.
+    BC.calculateEmittedSize(*BF, /*FixBranches=*/true);
+
+    for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
+      // Unique global index.
+      GlobalIndices[BB] = TotalNumBlocks;
+      TotalNumBlocks++;
+
+      // Block size after hot-cold splitting.
+      BBSizes[BB] = BB->getOutputAddressRange().second -
+                    BB->getOutputAddressRange().first;
+
+      // Hot block offset after hot-cold splitting.
+      BBOffsets[BB] = OrigHotSectionSize;
+      if (!BB->isSplit())
+        OrigHotSectionSize += BBSizes[BB];
+
+      // Conditional and unconditional successors.
+      JumpInfos[BB] = analyzeBranches(BB);
+    }
+  }
+
+  // Build call graph.
+  Callers.resize(TotalNumBlocks);
+  Callees.resize(TotalNumBlocks);
+  for (BinaryFunction *SrcFunction : FunctionsToConsider) {
+    for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
+      // Skip blocks that are not executed
+      if (SrcBB.getKnownExecutionCount() == 0)
+        continue;
+
+      // Find call instructions and extract target symbols from each one
+      for (const MCInst &Inst : SrcBB) {
+        if (!BC.MIB->isCall(Inst))
+          continue;
+
+        // Call info
+        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
+        // Ignore calls w/o information
+        if (!DstSym)
+          continue;
+
+        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
+        // Ignore calls that do not have a valid target, but do not ignore
+        // recursive calls, because caller block could be moved to warm.
+        if (!DstFunction || DstFunction->getLayout().block_empty())
+          continue;
+
+        const BinaryBasicBlock *DstBB = &(DstFunction->front());
+
+        // Record the call only if DstBB is also in FunctionsToConsider.
+        if (GlobalIndices.contains(DstBB)) {
+          Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
+          Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
+        }
+      }
+    }
+  }
 }
 
 /// Find the best index for splitting. The returned value is the index of the

>From f88459fb42dcdd6918feba8ea69bc54ac2f09525 Mon Sep 17 00:00:00 2001
From: Shatian Wang <shatian at meta.com>
Date: Mon, 13 Nov 2023 17:16:59 -0800
Subject: [PATCH 6/6] [BOLT] CDSplit Main Logic Part 1/3

The first diff in a series of 3 that implements the main logic of
CDSplit. Under X86, function splitting can lead to block size increase.
This is because conditional and unconditional branch instructions whose
offset is under 8 bits can be encoded with 2 bytes. If the offset is
greater than 8 bits, then they need 6 and 5 bytes respectively.
Splitting a short conditional / unconditional branch will thus increase
the size of the src basic block by 4 and 3 bytes respectively. CDSplit
takes into account the potential block size increase when it makes
splitting decisions. This diff implements a function
estimatePostSplitBBAddress in CDSplit that approximates the block level
size increase at the given split index of the given function.
---
 bolt/include/bolt/Passes/CDSplit.h | 18 ++++++
 bolt/lib/Passes/CDSplit.cpp        | 90 ++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h
index a8d162d22b56469..20145e7656407c9 100644
--- a/bolt/include/bolt/Passes/CDSplit.h
+++ b/bolt/include/bolt/Passes/CDSplit.h
@@ -48,9 +48,27 @@ class CDSplit : public BinaryFunctionPass {
   // Conditional and unconditional successors of each BB.
   DenseMap<const BinaryBasicBlock *, JumpInfo> JumpInfos;
 
+  /// Sizes of branch instructions used to approximate block size increase
+  /// due to hot-warm splitting. Initialized to be 0. These values are updated
+  /// if the architecture is X86.
+  uint8_t BRANCH_SIZE = 0;
+  uint8_t LONG_UNCOND_BRANCH_SIZE_DELTA = 0;
+  uint8_t LONG_COND_BRANCH_SIZE_DELTA = 0;
+
   /// Helper functions to initialize global variables.
   void initialize(BinaryContext &BC);
 
+  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
+  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
+  /// splitting happens at \p SplitIndex. Also return estimated end addresses
+  /// of the hot fragment before and after splitting.
+  /// The estimations take into account the potential addition of branch
+  /// instructions due to split fall through branches as well as the need to
+  /// use longer branch instructions for split (un)conditional branches.
+  std::pair<size_t, size_t>
+  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
+                             const size_t SplitIndex);
+
   /// Split function body into 3 fragments: hot / warm / cold.
   void runOnFunction(BinaryFunction &BF);
 
diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp
index 6b6f1454512aa02..854bcd52d3eafb7 100644
--- a/bolt/lib/Passes/CDSplit.cpp
+++ b/bolt/lib/Passes/CDSplit.cpp
@@ -149,6 +149,96 @@ void CDSplit::initialize(BinaryContext &BC) {
       }
     }
   }
+
+  // If X86, long branch instructions take more bytes than short branches.
+  // Adjust sizes of branch instructions used to approximate block size
+  // increase due to hot-warm splitting.
+  if (BC.isX86()) {
+    // a short branch takes 2 bytes.
+    BRANCH_SIZE = 2;
+    // a long uncond branch takes BRANCH_SIZE + 3 bytes.
+    LONG_UNCOND_BRANCH_SIZE_DELTA = 3;
+    // a long cond branch takes BRANCH_SIZE + 4 bytes.
+    LONG_COND_BRANCH_SIZE_DELTA = 4;
+  }
+}
+
+std::pair<size_t, size_t>
+CDSplit::estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
+                                    const size_t SplitIndex) {
+  assert(SplitIndex < BlockOrder.size() && "Invalid split index");
+  // Helper function estimating if a branch needs a longer branch instruction.
+  // The function returns true if the following two conditions are satisfied:
+  // condition 1. One of SrcBB and DstBB is in hot, the other is in warm.
+  // condition 2. The pre-split branch distance fits in a signed 8-bit offset.
+  auto needNewLongBranch = [&](const BinaryBasicBlock *SrcBB,
+                               const BinaryBasicBlock *DstBB) {
+    if (!SrcBB || !DstBB)
+      return false;
+    // The following checks for condition 1.
+    if (SrcBB->isSplit() || DstBB->isSplit())
+      return false;
+    if ((SrcBB->getLayoutIndex() <= SplitIndex) ==
+        (DstBB->getLayoutIndex() <= SplitIndex))
+      return false;
+    // The following checks for condition 2.
+    return (AbsoluteDifference(BBOffsets[DstBB],
+                               BBOffsets[SrcBB] + BBSizes[SrcBB]) <=
+            std::numeric_limits<int8_t>::max());
+  };
+
+  // Populate BB.OutputAddressRange with estimated new start and end addresses
+  // and compute the old and new end addresses of the hot section. Both default
+  // to 0 so they stay well-defined even if the loop below never assigns them.
+  size_t OldHotEndAddr = 0;
+  size_t NewHotEndAddr = 0;
+  size_t CurrentAddr = BBOffsets[BlockOrder[0]];
+  for (BinaryBasicBlock *BB : BlockOrder) {
+    // We only care about new addresses of blocks in hot/warm.
+    if (BB->isSplit())
+      break;
+    size_t NewSize = BBSizes[BB];
+    // Need to add a new branch instruction if a fall-through branch is split.
+    bool NeedNewUncondBranch =
+        (JumpInfos[BB].UncondSuccessor && !JumpInfos[BB].HasUncondBranch &&
+         BB->getLayoutIndex() == SplitIndex);
+
+    NewSize += BRANCH_SIZE * NeedNewUncondBranch +
+               LONG_UNCOND_BRANCH_SIZE_DELTA *
+                   needNewLongBranch(BB, JumpInfos[BB].UncondSuccessor) +
+               LONG_COND_BRANCH_SIZE_DELTA *
+                   needNewLongBranch(BB, JumpInfos[BB].CondSuccessor);
+    BB->setOutputStartAddress(CurrentAddr);
+    CurrentAddr += NewSize;
+    BB->setOutputEndAddress(CurrentAddr);
+    // Temporarily set the start address of the warm fragment of the current
+    // function to be 0. We will update it later when we can get a better
+    // estimate.
+    if (BB->getLayoutIndex() == SplitIndex) {
+      NewHotEndAddr = CurrentAddr;
+      CurrentAddr = 0;
+    }
+    OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
+  }
+
+  // Update the start and end addresses of blocks in the warm fragment.
+  // First get a better estimate of the start address of the warm fragment.
+  assert(OrigHotSectionSize + NewHotEndAddr >= OldHotEndAddr);
+  size_t WarmSectionStartAddr =
+      OrigHotSectionSize + NewHotEndAddr - OldHotEndAddr;
+  // Do the correction.
+  for (size_t Index = SplitIndex + 1; Index < BlockOrder.size(); Index++) {
+    BinaryBasicBlock *BB = BlockOrder[Index];
+    // We only care about new addresses of blocks in warm.
+    if (BB->isSplit())
+      break;
+    size_t StartAddrOffset = BB->getOutputAddressRange().first;
+    size_t EndAddrOffset = BB->getOutputAddressRange().second;
+    BB->setOutputStartAddress(WarmSectionStartAddr + StartAddrOffset);
+    BB->setOutputEndAddress(WarmSectionStartAddr + EndAddrOffset);
+  }
+
+  return std::make_pair(OldHotEndAddr, NewHotEndAddr);
 }
 
 /// Find the best index for splitting. The returned value is the index of the



More information about the llvm-commits mailing list