[llvm] [BOLT][AArch64] Add support for compact code model (PR #112110)

Maksim Panchenko via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 12 15:39:00 PDT 2024


https://github.com/maksfb created https://github.com/llvm/llvm-project/pull/112110

Add a `--compact-code-model` option that performs an alternative branch relaxation under the assumption that the resulting binary has less than 128MB of code. The relaxation is done in `relaxLocalBranches()`, which operates at the function level and runs on multiple functions in parallel.
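
To illustrate the idea (the labels and code below are made up, not taken from the pass): a conditional branch such as `b.hi` has only a ±1MB range, so when its target is farther away it gets retargeted to a nearby trampoline containing an unconditional `b`, whose ±128MB range always suffices under the compact code model:

```
## Before: .Lfar is more than 1MB away, outside the +/-1MB range of b.hi.
  cmp   x1, 1
  b.hi  .Lfar

## After relaxation: b.hi reaches a nearby trampoline, and the trampoline's
## unconditional b (+/-128MB range) covers the remaining distance.
  cmp   x1, 1
  b.hi  .Ltramp
  ...
.Ltramp:
  b     .Lfar
```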

Running the new option on an AArch64 Clang binary produces slightly smaller code, and the relaxation finishes in about 1/10th of the time taken by the existing pass.

Note that the new `.text` has to be smaller than 128MB, *and* `.plt` has to be within 128MB of `.text`.
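
For reference, the pass is enabled from the llvm-bolt command line, for example (input and output names are placeholders):

```
llvm-bolt a.out -o a.out.bolt --compact-code-model
```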

From 9c5b3401612f61cdb473a63ad66fb6fe4d67df71 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at fb.com>
Date: Fri, 13 Sep 2024 21:17:02 -0700
Subject: [PATCH] [BOLT][AArch64] Add support for compact code model

Add `--compact-code-model` option that performs an alternative branch
relaxation under the assumption that the resulting binary has less than
128MB of code. The relaxation is done in `relaxLocalBranches()`, which
operates at the function level and runs on multiple functions in
parallel.

Running the new pass on an AArch64 Clang binary produces slightly
smaller code, and the relaxation finishes in about 1/10th of the time.

Note that the new .text has to be smaller than 128MB, *and* .plt has to
be within 128MB of the new code.
---
 bolt/include/bolt/Core/BinaryBasicBlock.h |   3 +
 bolt/include/bolt/Core/FunctionLayout.h   |   3 +-
 bolt/include/bolt/Passes/LongJmp.h        |  13 +
 bolt/lib/Core/FunctionLayout.cpp          |   4 +-
 bolt/lib/Passes/LongJmp.cpp               | 278 +++++++++++++++++++++-
 bolt/test/AArch64/compact-code-model.s    |  48 ++++
 6 files changed, 344 insertions(+), 5 deletions(-)
 create mode 100644 bolt/test/AArch64/compact-code-model.s

diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index b4f31cf2bae6f6..25cccc4edecf68 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
     return OutputAddressRange;
   }
 
+  uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
+  uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }
+
   bool hasLocSyms() const { return LocSyms != nullptr; }
 
   /// Return mapping of input offsets to symbols in the output.
diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h
index 6a13cbec69fee7..ee4dd689b8dd64 100644
--- a/bolt/include/bolt/Core/FunctionLayout.h
+++ b/bolt/include/bolt/Core/FunctionLayout.h
@@ -123,7 +123,8 @@ class FunctionFragment {
   const_iterator begin() const;
   iterator end();
   const_iterator end() const;
-  const BinaryBasicBlock *front() const;
+  BinaryBasicBlock *front() const;
+  BinaryBasicBlock *back() const;
 
   friend class FunctionLayout;
 };
diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h
index 3d02d75ac4a277..df3ea9620918af 100644
--- a/bolt/include/bolt/Passes/LongJmp.h
+++ b/bolt/include/bolt/Passes/LongJmp.h
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
   uint32_t NumColdStubs{0};
   uint32_t NumSharedStubs{0};
 
+  /// The shortest distance for any branch instruction on AArch64.
+  static constexpr size_t ShortestJumpBits = 16;
+  static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);
+
+  /// The longest single-instruction branch.
+  static constexpr size_t LongestJumpBits = 28;
+  static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);
+
+  /// Relax all internal function branches including those between fragments.
+  /// Assume that fragments are placed in different sections but are within
+  /// 128MB of each other.
+  void relaxLocalBranches(BinaryFunction &BF);
+
   ///                 -- Layout estimation methods --
   /// Try to do layout before running the emitter, by looking at BinaryFunctions
   /// and MCInsts -- this is an estimation. To be correct for longjmp inserter
diff --git a/bolt/lib/Core/FunctionLayout.cpp b/bolt/lib/Core/FunctionLayout.cpp
index 15e6127ad2e9e8..4498fc44da9548 100644
--- a/bolt/lib/Core/FunctionLayout.cpp
+++ b/bolt/lib/Core/FunctionLayout.cpp
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
   return const_iterator(Layout->block_begin() + StartIndex + Size);
 }
 
-const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+
+BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }
 
 FunctionLayout::FunctionLayout() { addFragment(); }
 
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index c483f70a836ee1..4ce2322ab4352c 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -11,18 +11,26 @@
 //===----------------------------------------------------------------------===//
 
 #include "bolt/Passes/LongJmp.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "llvm/Support/MathExtras.h"
 
 #define DEBUG_TYPE "longjmp"
 
 using namespace llvm;
 
 namespace opts {
+extern cl::OptionCategory BoltCategory;
 extern cl::OptionCategory BoltOptCategory;
 extern llvm::cl::opt<unsigned> AlignText;
 extern cl::opt<unsigned> AlignFunctions;
 extern cl::opt<bool> UseOldText;
 extern cl::opt<bool> HotFunctionsAtEnd;
 
+static cl::opt<bool>
+    CompactCodeModel("compact-code-model",
+                     cl::desc("generate code for binaries <128MB on AArch64"),
+                     cl::init(false), cl::cat(BoltCategory));
+
 static cl::opt<bool> GroupStubs("group-stubs",
                                 cl::desc("share stubs across functions"),
                                 cl::init(true), cl::cat(BoltOptCategory));
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
     if (Next != E && (*Next)->isCold())
       return *I;
   }
-  llvm_unreachable("No hot-colt split point found");
+  llvm_unreachable("No hot-cold split point found");
 }
 
-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
   return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
          !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
 }
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
       if (BC.MIB->isPseudo(Inst))
         continue;
 
-      if (!shouldInsertStub(BC, Inst)) {
+      if (!mayNeedStub(BC, Inst)) {
         DotAddress += InsnSize;
         continue;
       }
@@ -629,7 +637,271 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
   return Error::success();
 }
 
+void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  auto &MIB = BC.MIB;
+
+  if (!BF.isSimple())
+    return;
+
+  // Quick path.
+  if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
+    return;
+
+  auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
+    const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
+    return isIntN(Bits, Offset);
+  };
+
+  auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
+                            const BinaryBasicBlock &BB) {
+    const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
+    return isBranchOffsetInRange(Inst, Offset);
+  };
+
+  // Keep track of *all* function trampolines that are going to be added to the
+  // function layout at the end of relaxation.
+  std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
+      FunctionTrampolines;
+
+  // Function fragments are relaxed independently.
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    // Fill out code size estimation for the fragment. Use output BB address
+    // ranges to store offsets from the start of the function.
+    uint64_t CodeSize = 0;
+    for (BinaryBasicBlock *BB : FF) {
+      BB->setOutputStartAddress(CodeSize);
+      CodeSize += BB->estimateSize();
+      BB->setOutputEndAddress(CodeSize);
+    }
+
+    // Dynamically-updated size of the fragment.
+    uint64_t FragmentSize = CodeSize;
+
+    // Size of the trampoline in bytes.
+    constexpr uint64_t TrampolineSize = 4;
+
+    // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
+    // NB: here we store only the first trampoline created for DestinationBB.
+    DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
+
+    // Create trampoline code after \p BB, or at the end of the fragment if BB
+    // is nullptr.
+    auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
+                                  BinaryBasicBlock *TargetBB, uint64_t Count,
+                                  bool UpdateOffsets = true) {
+      std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
+      MCInst Inst;
+      {
+        auto L = BC.scopeLock();
+        MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
+      }
+      TrampolineBB->addInstruction(Inst);
+      TrampolineBB->addSuccessor(TargetBB, Count);
+      TrampolineBB->setExecutionCount(Count);
+      const uint64_t TrampolineAddress =
+          BB ? BB->getOutputEndAddress() : FragmentSize;
+      TrampolineBB->setOutputStartAddress(TrampolineAddress);
+      TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
+      TrampolineBB->setFragmentNum(FF.getFragmentNum());
+
+      if (UpdateOffsets) {
+        FragmentSize += TrampolineSize;
+        for (BinaryBasicBlock *IBB : FF) {
+          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                       TrampolineSize);
+            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+                                     TrampolineSize);
+          }
+        }
+        for (auto &Pair : FunctionTrampolines) {
+          BinaryBasicBlock *IBB = Pair.second.get();
+          if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+            continue;
+          if (IBB == TrampolineBB.get())
+            continue;
+          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                       TrampolineSize);
+            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+                                     TrampolineSize);
+          }
+        }
+      }
+
+      if (!FragmentTrampolines.lookup(TargetBB))
+        FragmentTrampolines[TargetBB] = TrampolineBB.get();
+      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+                                       std::move(TrampolineBB));
+
+      return FunctionTrampolines.back().second.get();
+    };
+
+    // Pre-populate trampolines by splitting unconditional branches from the
+    // containing basic block.
+    for (BinaryBasicBlock *BB : FF) {
+      MCInst *Inst = BB->getLastNonPseudoInstr();
+      if (!Inst || !MIB->isUnconditionalBranch(*Inst))
+        continue;
+
+      const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
+      BB->eraseInstruction(BB->findInstruction(Inst));
+      BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);
+
+      BinaryBasicBlock::BinaryBranchInfo BI;
+      BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);
+
+      BinaryBasicBlock *TrampolineBB =
+          addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
+      BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
+    }
+
+    /// Relax the branch \p Inst located at \p InstAddress in basic block
+    /// \p BB so that it can reach \p TargetBB.
+    auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
+                           uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
+      BinaryFunction *BF = BB->getParent();
+
+      // Use branch taken count for optimal relaxation.
+      const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
+      assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+             "Expected valid branch execution count");
+
+      // Try to reuse an existing trampoline without introducing any new code.
+      BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
+      if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
+                                        Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+        return;
+      }
+
+      // For cold branches, check if we can introduce a trampoline at the end
+      // of the fragment that is within the branch's reach. Note that such a
+      // trampoline may change its address later and become unreachable, in
+      // which case we will need further relaxation.
+      const int64_t OffsetToEnd = FragmentSize - InstAddress;
+      if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
+        TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+
+        return;
+      }
+
+      // Insert a new block after the current one and use it as a trampoline.
+      TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);
+
+      // If the other successor is a fall-through, invert the condition code.
+      const BinaryBasicBlock *const NextBB =
+          BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
+      if (BB->getConditionalSuccessor(false) == NextBB) {
+        BB->swapConditionalSuccessors();
+        auto L = BC.scopeLock();
+        MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
+      } else {
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+      }
+      BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+    };
+
+    bool MayNeedRelaxation;
+    uint64_t NumIterations = 0;
+    do {
+      MayNeedRelaxation = false;
+      ++NumIterations;
+      for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
+        BinaryBasicBlock *BB = *BBI;
+        uint64_t NextInstOffset = BB->getOutputStartAddress();
+        for (MCInst &Inst : *BB) {
+          const size_t InstAddress = NextInstOffset;
+          if (!MIB->isPseudo(Inst))
+            NextInstOffset += 4;
+
+          if (!mayNeedStub(BF.getBinaryContext(), Inst))
+            continue;
+
+          const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);
+
+          // Span of +/-128MB.
+          if (BitsAvailable == LongestJumpBits)
+            continue;
+
+          const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
+          BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
+          assert(TargetBB &&
+                 "Basic block target expected for conditional branch.");
+
+          // Check if the relaxation is needed.
+          if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
+              isBlockInRange(Inst, InstAddress, *TargetBB))
+            continue;
+
+          relaxBranch(BB, Inst, InstAddress, TargetBB);
+
+          MayNeedRelaxation = true;
+        }
+      }
+
+      // We may have added new instructions, but if the whole fragment is still
+      // smaller than the minimum branch span, no further relaxation is needed.
+      if (FragmentSize < ShortestJumpSpan)
+        MayNeedRelaxation = false;
+
+    } while (MayNeedRelaxation);
+
+    LLVM_DEBUG({
+      if (NumIterations > 2) {
+        dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
+               << " of " << BF << " in " << NumIterations << " iterations\n";
+      }
+    });
+  }
+
+  // Add trampoline blocks from all fragments to the layout.
+  DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
+      Insertions;
+  for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
+       FunctionTrampolines) {
+    if (!Pair.second)
+      continue;
+    Insertions[Pair.first].emplace_back(std::move(Pair.second));
+  }
+
+  for (auto &Pair : Insertions) {
+    BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
+                         /*UpdateLayout*/ true, /*UpdateCFI*/ true,
+                         /*RecomputeLPs*/ false);
+  }
+}
+
 Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+
+  if (opts::CompactCodeModel) {
+    BC.outs()
+        << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
+
+    ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+      relaxLocalBranches(BF);
+    };
+
+    ParallelUtilities::PredicateTy SkipPredicate =
+        [&](const BinaryFunction &BF) {
+          return !BC.shouldEmit(BF) || !BF.isSimple();
+        };
+
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+        SkipPredicate, "RelaxLocalBranches");
+
+    return Error::success();
+  }
+
   BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
   std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
   bool Modified;
diff --git a/bolt/test/AArch64/compact-code-model.s b/bolt/test/AArch64/compact-code-model.s
new file mode 100644
index 00000000000000..c8d8ac9131b45c
--- /dev/null
+++ b/bolt/test/AArch64/compact-code-model.s
@@ -0,0 +1,48 @@
+## Check that llvm-bolt successfully relaxes branches for the compact (<128MB)
+## code model.
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN:   --keep-nops --compact-code-model
+# RUN: llvm-objdump -d --disassemble-symbols=_start %t.bolt | FileCheck %s
+# RUN: llvm-nm -n %t.bolt | FileCheck %s --check-prefix=CHECK-NM
+
+## _start will be split and its main fragment will be separated from other
+## fragments by large_function() which is over 1MB.
+
+# CHECK-NM: _start
+# CHECK-NM-NEXT: large_function
+# CHECK-NM-NEXT: _start.cold
+
+  .text
+  .globl _start
+  .type _start, %function
+_start:
+  .cfi_startproc
+  cmp  x1, 1
+  b.hi  .L1
+# CHECK: b.hi
+# CHECK-NEXT: b
+# CHECK-NEXT: b
+
+  bl large_function
+.L1:
+  ret  x30
+  .cfi_endproc
+.size _start, .-_start
+
+
+  .globl large_function
+  .type large_function, %function
+large_function:
+  .cfi_startproc
+  .rept 300000
+    nop
+  .endr
+  ret  x30
+  .cfi_endproc
+.size large_function, .-large_function
+
+## Force relocation mode.
+  .reloc 0, R_AARCH64_NONE


