[llvm] [BOLT][AArch64] Add support for compact code model (PR #112110)
Maksim Panchenko via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 12 15:39:00 PDT 2024
https://github.com/maksfb created https://github.com/llvm/llvm-project/pull/112110
Add a `--compact-code-model` option that runs an alternative branch relaxation under the assumption that the resulting binary has less than 128MB of code. The relaxation is done in `relaxLocalBranches()`, which operates at the function level and runs on multiple functions in parallel.
Running with the new option on an AArch64 Clang binary produces slightly smaller code, and the relaxation finishes in about 1/10th of the time.
Note that the new `.text` has to be smaller than 128MB, *and* `.plt` has to be within 128MB of `.text`.
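For reference, a minimal invocation sketch (the input and output binary names are placeholders; only the new flag and the plain `llvm-bolt <input> -o <output>` form are taken from this patch and its test):

    llvm-bolt clang -o clang.bolt --compact-code-model

The 128MB figure corresponds to the ±128MB (2^27 bytes) span reachable by a single AArch64 `B`/`BL` instruction, i.e. `LongestJumpSpan = 1ULL << (LongestJumpBits - 1)` with `LongestJumpBits = 28` as defined in the pass below.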
From 9c5b3401612f61cdb473a63ad66fb6fe4d67df71 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at fb.com>
Date: Fri, 13 Sep 2024 21:17:02 -0700
Subject: [PATCH] [BOLT][AArch64] Add support for compact code model
Add a `--compact-code-model` option that runs an alternative branch
relaxation under the assumption that the resulting binary has less than
128MB of code. The relaxation is done in `relaxLocalBranches()`, which
operates at the function level and runs on multiple functions in
parallel.
Running the new pass on an AArch64 Clang binary produces slightly
smaller code and finishes in about 1/10th of the time.
Note that the new .text has to be smaller than 128MB, *and* .plt has to
be within 128MB of the new code.
---
bolt/include/bolt/Core/BinaryBasicBlock.h | 3 +
bolt/include/bolt/Core/FunctionLayout.h | 3 +-
bolt/include/bolt/Passes/LongJmp.h | 13 +
bolt/lib/Core/FunctionLayout.cpp | 4 +-
bolt/lib/Passes/LongJmp.cpp | 278 +++++++++++++++++++++-
bolt/test/AArch64/compact-code-model.s | 48 ++++
6 files changed, 344 insertions(+), 5 deletions(-)
create mode 100644 bolt/test/AArch64/compact-code-model.s
diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index b4f31cf2bae6f6..25cccc4edecf68 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
return OutputAddressRange;
}
+ uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
+ uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }
+
bool hasLocSyms() const { return LocSyms != nullptr; }
/// Return mapping of input offsets to symbols in the output.
diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h
index 6a13cbec69fee7..ee4dd689b8dd64 100644
--- a/bolt/include/bolt/Core/FunctionLayout.h
+++ b/bolt/include/bolt/Core/FunctionLayout.h
@@ -123,7 +123,8 @@ class FunctionFragment {
const_iterator begin() const;
iterator end();
const_iterator end() const;
- const BinaryBasicBlock *front() const;
+ BinaryBasicBlock *front() const;
+ BinaryBasicBlock *back() const;
friend class FunctionLayout;
};
diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h
index 3d02d75ac4a277..df3ea9620918af 100644
--- a/bolt/include/bolt/Passes/LongJmp.h
+++ b/bolt/include/bolt/Passes/LongJmp.h
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
uint32_t NumColdStubs{0};
uint32_t NumSharedStubs{0};
+ /// The shortest distance for any branch instruction on AArch64.
+ static constexpr size_t ShortestJumpBits = 16;
+ static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);
+
+ /// The longest single-instruction branch.
+ static constexpr size_t LongestJumpBits = 28;
+ static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);
+
+ /// Relax all internal function branches including those between fragments.
+ /// Assume that fragments are placed in different sections but are within
+ /// 128MB of each other.
+ void relaxLocalBranches(BinaryFunction &BF);
+
/// -- Layout estimation methods --
/// Try to do layout before running the emitter, by looking at BinaryFunctions
/// and MCInsts -- this is an estimation. To be correct for longjmp inserter
diff --git a/bolt/lib/Core/FunctionLayout.cpp b/bolt/lib/Core/FunctionLayout.cpp
index 15e6127ad2e9e8..4498fc44da9548 100644
--- a/bolt/lib/Core/FunctionLayout.cpp
+++ b/bolt/lib/Core/FunctionLayout.cpp
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
return const_iterator(Layout->block_begin() + StartIndex + Size);
}
-const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+
+BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }
FunctionLayout::FunctionLayout() { addFragment(); }
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index c483f70a836ee1..4ce2322ab4352c 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -11,18 +11,26 @@
//===----------------------------------------------------------------------===//
#include "bolt/Passes/LongJmp.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "llvm/Support/MathExtras.h"
#define DEBUG_TYPE "longjmp"
using namespace llvm;
namespace opts {
+extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;
extern llvm::cl::opt<unsigned> AlignText;
extern cl::opt<unsigned> AlignFunctions;
extern cl::opt<bool> UseOldText;
extern cl::opt<bool> HotFunctionsAtEnd;
+static cl::opt<bool>
+ CompactCodeModel("compact-code-model",
+ cl::desc("generate code for binaries <128MB on AArch64"),
+ cl::init(false), cl::cat(BoltCategory));
+
static cl::opt<bool> GroupStubs("group-stubs",
cl::desc("share stubs across functions"),
cl::init(true), cl::cat(BoltOptCategory));
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
if (Next != E && (*Next)->isCold())
return *I;
}
- llvm_unreachable("No hot-colt split point found");
+ llvm_unreachable("No hot-cold split point found");
}
-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
!BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
}
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
if (BC.MIB->isPseudo(Inst))
continue;
- if (!shouldInsertStub(BC, Inst)) {
+ if (!mayNeedStub(BC, Inst)) {
DotAddress += InsnSize;
continue;
}
@@ -629,7 +637,271 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
return Error::success();
}
+void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
+ BinaryContext &BC = BF.getBinaryContext();
+ auto &MIB = BC.MIB;
+
+ if (!BF.isSimple())
+ return;
+
+ // Quick path.
+ if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
+ return;
+
+ auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
+ const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
+ return isIntN(Bits, Offset);
+ };
+
+ auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
+ const BinaryBasicBlock &BB) {
+ const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
+ return isBranchOffsetInRange(Inst, Offset);
+ };
+
+ // Keep track of *all* function trampolines that are going to be added to the
+ // function layout at the end of relaxation.
+ std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
+ FunctionTrampolines;
+
+ // Function fragments are relaxed independently.
+ for (FunctionFragment &FF : BF.getLayout().fragments()) {
+ // Fill out code size estimation for the fragment. Use output BB address
+ // ranges to store offsets from the start of the function.
+ uint64_t CodeSize = 0;
+ for (BinaryBasicBlock *BB : FF) {
+ BB->setOutputStartAddress(CodeSize);
+ CodeSize += BB->estimateSize();
+ BB->setOutputEndAddress(CodeSize);
+ }
+
+ // Dynamically-updated size of the fragment.
+ uint64_t FragmentSize = CodeSize;
+
+ // Size of the trampoline in bytes.
+ constexpr uint64_t TrampolineSize = 4;
+
+ // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
+ // NB: here we store only the first trampoline created for DestinationBB.
+ DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
+
+ // Create a trampoline code after \p BB or at the end of the fragment if BB
+ // is nullptr.
+ auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
+ BinaryBasicBlock *TargetBB, uint64_t Count,
+ bool UpdateOffsets = true) {
+ std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
+ MCInst Inst;
+ {
+ auto L = BC.scopeLock();
+ MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
+ }
+ TrampolineBB->addInstruction(Inst);
+ TrampolineBB->addSuccessor(TargetBB, Count);
+ TrampolineBB->setExecutionCount(Count);
+ const uint64_t TrampolineAddress =
+ BB ? BB->getOutputEndAddress() : FragmentSize;
+ TrampolineBB->setOutputStartAddress(TrampolineAddress);
+ TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
+ TrampolineBB->setFragmentNum(FF.getFragmentNum());
+
+ if (UpdateOffsets) {
+ FragmentSize += TrampolineSize;
+ for (BinaryBasicBlock *IBB : FF) {
+ if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+ IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+ TrampolineSize);
+ IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+ TrampolineSize);
+ }
+ }
+ for (auto &Pair : FunctionTrampolines) {
+ BinaryBasicBlock *IBB = Pair.second.get();
+ if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+ continue;
+ if (IBB == TrampolineBB.get())
+ continue;
+ if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+ IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+ TrampolineSize);
+ IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+ TrampolineSize);
+ }
+ }
+ }
+
+ if (!FragmentTrampolines.lookup(TargetBB))
+ FragmentTrampolines[TargetBB] = TrampolineBB.get();
+ FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+ std::move(TrampolineBB));
+
+ return FunctionTrampolines.back().second.get();
+ };
+
+ // Pre-populate trampolines by splitting unconditional branches from the
+ // containing basic block.
+ for (BinaryBasicBlock *BB : FF) {
+ MCInst *Inst = BB->getLastNonPseudoInstr();
+ if (!Inst || !MIB->isUnconditionalBranch(*Inst))
+ continue;
+
+ const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
+ BB->eraseInstruction(BB->findInstruction(Inst));
+ BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);
+
+ BinaryBasicBlock::BinaryBranchInfo BI;
+ BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);
+
+ BinaryBasicBlock *TrampolineBB =
+ addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
+ BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
+ }
+
+ /// Relax the branch \p Inst. Return true if basic block offsets need an
+ /// update after the trampoline insertion.
+ auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
+ uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
+ BinaryFunction *BF = BB->getParent();
+
+ // Use branch taken count for optimal relaxation.
+ const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
+ assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+ "Expected valid branch execution count");
+
+ // Try to reuse an existing trampoline without introducing any new code.
+ BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
+ if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
+ BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+ TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
+ Count);
+ auto L = BC.scopeLock();
+ MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+ return;
+ }
+
+ // For cold branches, check if we can introduce a trampoline at the end
+ // of the fragment that is within the branch reach. Note that such
+ // trampoline may change address later and become unreachable in which
+ // case we will need further relaxation.
+ const int64_t OffsetToEnd = FragmentSize - InstAddress;
+ if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
+ TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
+ BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+ auto L = BC.scopeLock();
+ MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+
+ return;
+ }
+
+ // Insert a new block after the current one and use it as a trampoline.
+ TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);
+
+ // If the other successor is a fall-through, invert the condition code.
+ const BinaryBasicBlock *const NextBB =
+ BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
+ if (BB->getConditionalSuccessor(false) == NextBB) {
+ BB->swapConditionalSuccessors();
+ auto L = BC.scopeLock();
+ MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
+ } else {
+ auto L = BC.scopeLock();
+ MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+ }
+ BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+ };
+
+ bool MayNeedRelaxation;
+ uint64_t NumIterations = 0;
+ do {
+ MayNeedRelaxation = false;
+ ++NumIterations;
+ for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
+ BinaryBasicBlock *BB = *BBI;
+ uint64_t NextInstOffset = BB->getOutputStartAddress();
+ for (MCInst &Inst : *BB) {
+ const size_t InstAddress = NextInstOffset;
+ if (!MIB->isPseudo(Inst))
+ NextInstOffset += 4;
+
+ if (!mayNeedStub(BF.getBinaryContext(), Inst))
+ continue;
+
+ const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);
+
+ // Span of +/-128MB.
+ if (BitsAvailable == LongestJumpBits)
+ continue;
+
+ const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
+ BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
+ assert(TargetBB &&
+ "Basic block target expected for conditional branch.");
+
+ // Check if the relaxation is needed.
+ if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
+ isBlockInRange(Inst, InstAddress, *TargetBB))
+ continue;
+
+ relaxBranch(BB, Inst, InstAddress, TargetBB);
+
+ MayNeedRelaxation = true;
+ }
+ }
+
+ // We may have added new instructions, but the whole fragment is less than
+ // the minimum branch span.
+ if (FragmentSize < ShortestJumpSpan)
+ MayNeedRelaxation = false;
+
+ } while (MayNeedRelaxation);
+
+ LLVM_DEBUG({
+ if (NumIterations > 2) {
+ dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
+ << " of " << BF << " in " << NumIterations << " iterations\n";
+ }
+ });
+ }
+
+ // Add trampoline blocks from all fragments to the layout.
+ DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
+ Insertions;
+ for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
+ FunctionTrampolines) {
+ if (!Pair.second)
+ continue;
+ Insertions[Pair.first].emplace_back(std::move(Pair.second));
+ }
+
+ for (auto &Pair : Insertions) {
+ BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
+ /*UpdateLayout*/ true, /*UpdateCFI*/ true,
+ /*RecomputeLPs*/ false);
+ }
+}
+
Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+
+ if (opts::CompactCodeModel) {
+ BC.outs()
+ << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
+
+ ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+ relaxLocalBranches(BF);
+ };
+
+ ParallelUtilities::PredicateTy SkipPredicate =
+ [&](const BinaryFunction &BF) {
+ return !BC.shouldEmit(BF) || !BF.isSimple();
+ };
+
+ ParallelUtilities::runOnEachFunction(
+ BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+ SkipPredicate, "RelaxLocalBranches");
+
+ return Error::success();
+ }
+
BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
bool Modified;
diff --git a/bolt/test/AArch64/compact-code-model.s b/bolt/test/AArch64/compact-code-model.s
new file mode 100644
index 00000000000000..c8d8ac9131b45c
--- /dev/null
+++ b/bolt/test/AArch64/compact-code-model.s
@@ -0,0 +1,48 @@
+## Check that llvm-bolt successfully relaxes branches for compact (<128MB) code
+## model.
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN: --keep-nops --compact-code-model
+# RUN: llvm-objdump -d --disassemble-symbols=_start %t.bolt | FileCheck %s
+# RUN: llvm-nm -n %t.bolt | FileCheck %s --check-prefix=CHECK-NM
+
+## _start will be split and its main fragment will be separated from other
+## fragments by large_function() which is over 1MB.
+
+# CHECK-NM: _start
+# CHECK-NM-NEXT: large_function
+# CHECK-NM-NEXT: _start.cold
+
+ .text
+ .globl _start
+ .type _start, %function
+_start:
+ .cfi_startproc
+ cmp x1, 1
+ b.hi .L1
+# CHECK: b.hi
+# CHECK-NEXT: b
+# CHECK-NEXT: b
+
+ bl large_function
+.L1:
+ ret x30
+ .cfi_endproc
+.size _start, .-_start
+
+
+ .globl large_function
+ .type large_function, %function
+large_function:
+ .cfi_startproc
+ .rept 300000
+ nop
+ .endr
+ ret x30
+ .cfi_endproc
+.size large_function, .-large_function
+
+## Force relocation mode.
+ .reloc 0, R_AARCH64_NONE