[llvm] [BOLT][AArch64] Add support for compact code model (PR #112110)
Maksim Panchenko via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 29 11:12:16 PDT 2024
https://github.com/maksfb updated https://github.com/llvm/llvm-project/pull/112110
>From 9c5b3401612f61cdb473a63ad66fb6fe4d67df71 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at fb.com>
Date: Fri, 13 Sep 2024 21:17:02 -0700
Subject: [PATCH 1/5] [BOLT][AArch64] Add support for compact code model
Add `--compact-code-model` option that executes an alternative branch
relaxation under the assumption that the resulting binary has less than
128MB of code. The relaxation is done in `relaxLocalBranches()`, which
operates on a function level and executes on multiple functions in
parallel.
Running the new pass on an AArch64 Clang binary produces slightly smaller
code and finishes in about 1/10th of the time.
Note that the new .text has to be smaller than 128MB, *and* .plt has to
be closer than 128MB to the new code.
---
bolt/include/bolt/Core/BinaryBasicBlock.h | 3 +
bolt/include/bolt/Core/FunctionLayout.h | 3 +-
bolt/include/bolt/Passes/LongJmp.h | 13 +
bolt/lib/Core/FunctionLayout.cpp | 4 +-
bolt/lib/Passes/LongJmp.cpp | 278 +++++++++++++++++++++-
bolt/test/AArch64/compact-code-model.s | 48 ++++
6 files changed, 344 insertions(+), 5 deletions(-)
create mode 100644 bolt/test/AArch64/compact-code-model.s
diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index b4f31cf2bae6f6..25cccc4edecf68 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
return OutputAddressRange;
}
+ uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
+ uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }
+
bool hasLocSyms() const { return LocSyms != nullptr; }
/// Return mapping of input offsets to symbols in the output.
diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h
index 6a13cbec69fee7..ee4dd689b8dd64 100644
--- a/bolt/include/bolt/Core/FunctionLayout.h
+++ b/bolt/include/bolt/Core/FunctionLayout.h
@@ -123,7 +123,8 @@ class FunctionFragment {
const_iterator begin() const;
iterator end();
const_iterator end() const;
- const BinaryBasicBlock *front() const;
+ BinaryBasicBlock *front() const;
+ BinaryBasicBlock *back() const;
friend class FunctionLayout;
};
diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h
index 3d02d75ac4a277..df3ea9620918af 100644
--- a/bolt/include/bolt/Passes/LongJmp.h
+++ b/bolt/include/bolt/Passes/LongJmp.h
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
uint32_t NumColdStubs{0};
uint32_t NumSharedStubs{0};
+ /// The shortest distance for any branch instruction on AArch64.
+ static constexpr size_t ShortestJumpBits = 16;
+ static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);
+
+ /// The longest single-instruction branch.
+ static constexpr size_t LongestJumpBits = 28;
+ static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);
+
+ /// Relax all internal function branches including those between fragments.
+ /// Assume that fragments are placed in different sections but are within
+ /// 128MB of each other.
+ void relaxLocalBranches(BinaryFunction &BF);
+
/// -- Layout estimation methods --
/// Try to do layout before running the emitter, by looking at BinaryFunctions
/// and MCInsts -- this is an estimation. To be correct for longjmp inserter
diff --git a/bolt/lib/Core/FunctionLayout.cpp b/bolt/lib/Core/FunctionLayout.cpp
index 15e6127ad2e9e8..4498fc44da9548 100644
--- a/bolt/lib/Core/FunctionLayout.cpp
+++ b/bolt/lib/Core/FunctionLayout.cpp
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
return const_iterator(Layout->block_begin() + StartIndex + Size);
}
-const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+
+BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }
FunctionLayout::FunctionLayout() { addFragment(); }
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index c483f70a836ee1..4ce2322ab4352c 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -11,18 +11,26 @@
//===----------------------------------------------------------------------===//
#include "bolt/Passes/LongJmp.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "llvm/Support/MathExtras.h"
#define DEBUG_TYPE "longjmp"
using namespace llvm;
namespace opts {
+extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;
extern llvm::cl::opt<unsigned> AlignText;
extern cl::opt<unsigned> AlignFunctions;
extern cl::opt<bool> UseOldText;
extern cl::opt<bool> HotFunctionsAtEnd;
+static cl::opt<bool>
+ CompactCodeModel("compact-code-model",
+ cl::desc("generate code for binaries <128MB on AArch64"),
+ cl::init(false), cl::cat(BoltCategory));
+
static cl::opt<bool> GroupStubs("group-stubs",
cl::desc("share stubs across functions"),
cl::init(true), cl::cat(BoltOptCategory));
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
if (Next != E && (*Next)->isCold())
return *I;
}
- llvm_unreachable("No hot-colt split point found");
+ llvm_unreachable("No hot-cold split point found");
}
-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
!BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
}
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
if (BC.MIB->isPseudo(Inst))
continue;
- if (!shouldInsertStub(BC, Inst)) {
+ if (!mayNeedStub(BC, Inst)) {
DotAddress += InsnSize;
continue;
}
@@ -629,7 +637,271 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
return Error::success();
}
+void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
+ BinaryContext &BC = BF.getBinaryContext();
+ auto &MIB = BC.MIB;
+
+ if (!BF.isSimple())
+ return;
+
+ // Quick path.
+ if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
+ return;
+
+ auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
+ const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
+ return isIntN(Bits, Offset);
+ };
+
+ auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
+ const BinaryBasicBlock &BB) {
+ const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
+ return isBranchOffsetInRange(Inst, Offset);
+ };
+
+ // Keep track of *all* function trampolines that are going to be added to the
+ // function layout at the end of relaxation.
+ std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
+ FunctionTrampolines;
+
+ // Function fragments are relaxed independently.
+ for (FunctionFragment &FF : BF.getLayout().fragments()) {
+ // Fill out code size estimation for the fragment. Use output BB address
+ // ranges to store offsets from the start of the function.
+ uint64_t CodeSize = 0;
+ for (BinaryBasicBlock *BB : FF) {
+ BB->setOutputStartAddress(CodeSize);
+ CodeSize += BB->estimateSize();
+ BB->setOutputEndAddress(CodeSize);
+ }
+
+ // Dynamically-updated size of the fragment.
+ uint64_t FragmentSize = CodeSize;
+
+ // Size of the trampoline in bytes.
+ constexpr uint64_t TrampolineSize = 4;
+
+ // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
+ // NB: here we store only the first trampoline created for DestinationBB.
+ DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
+
+ // Create a trampoline code after \p BB or at the end of the fragment if BB
+ // is nullptr.
+ auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
+ BinaryBasicBlock *TargetBB, uint64_t Count,
+ bool UpdateOffsets = true) {
+ std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
+ MCInst Inst;
+ {
+ auto L = BC.scopeLock();
+ MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
+ }
+ TrampolineBB->addInstruction(Inst);
+ TrampolineBB->addSuccessor(TargetBB, Count);
+ TrampolineBB->setExecutionCount(Count);
+ const uint64_t TrampolineAddress =
+ BB ? BB->getOutputEndAddress() : FragmentSize;
+ TrampolineBB->setOutputStartAddress(TrampolineAddress);
+ TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
+ TrampolineBB->setFragmentNum(FF.getFragmentNum());
+
+ if (UpdateOffsets) {
+ FragmentSize += TrampolineSize;
+ for (BinaryBasicBlock *IBB : FF) {
+ if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+ IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+ TrampolineSize);
+ IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+ TrampolineSize);
+ }
+ }
+ for (auto &Pair : FunctionTrampolines) {
+ BinaryBasicBlock *IBB = Pair.second.get();
+ if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+ continue;
+ if (IBB == TrampolineBB.get())
+ continue;
+ if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+ IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+ TrampolineSize);
+ IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+ TrampolineSize);
+ }
+ }
+ }
+
+ if (!FragmentTrampolines.lookup(TargetBB))
+ FragmentTrampolines[TargetBB] = TrampolineBB.get();
+ FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+ std::move(TrampolineBB));
+
+ return FunctionTrampolines.back().second.get();
+ };
+
+ // Pre-populate trampolines by splitting unconditional branches from the
+ // containing basic block.
+ for (BinaryBasicBlock *BB : FF) {
+ MCInst *Inst = BB->getLastNonPseudoInstr();
+ if (!Inst || !MIB->isUnconditionalBranch(*Inst))
+ continue;
+
+ const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
+ BB->eraseInstruction(BB->findInstruction(Inst));
+ BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);
+
+ BinaryBasicBlock::BinaryBranchInfo BI;
+ BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);
+
+ BinaryBasicBlock *TrampolineBB =
+ addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
+ BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
+ }
+
+ /// Relax the branch \p Inst. Return true if basic block offsets need an
+ /// update after the trampoline insertion.
+ auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
+ uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
+ BinaryFunction *BF = BB->getParent();
+
+ // Use branch taken count for optimal relaxation.
+ const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
+ assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+ "Expected valid branch execution count");
+
+ // Try to reuse an existing trampoline without introducing any new code.
+ BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
+ if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
+ BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+ TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
+ Count);
+ auto L = BC.scopeLock();
+ MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+ return;
+ }
+
+ // For cold branches, check if we can introduce a trampoline at the end
+ // of the fragment that is within the branch reach. Note that such
+ // trampoline may change address later and become unreachable in which
+ // case we will need further relaxation.
+ const int64_t OffsetToEnd = FragmentSize - InstAddress;
+ if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
+ TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
+ BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+ auto L = BC.scopeLock();
+ MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+
+ return;
+ }
+
+ // Insert a new block after the current one and use it as a trampoline.
+ TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);
+
+ // If the other successor is a fall-through, invert the condition code.
+ const BinaryBasicBlock *const NextBB =
+ BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
+ if (BB->getConditionalSuccessor(false) == NextBB) {
+ BB->swapConditionalSuccessors();
+ auto L = BC.scopeLock();
+ MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
+ } else {
+ auto L = BC.scopeLock();
+ MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+ }
+ BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+ };
+
+ bool MayNeedRelaxation;
+ uint64_t NumIterations = 0;
+ do {
+ MayNeedRelaxation = false;
+ ++NumIterations;
+ for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
+ BinaryBasicBlock *BB = *BBI;
+ uint64_t NextInstOffset = BB->getOutputStartAddress();
+ for (MCInst &Inst : *BB) {
+ const size_t InstAddress = NextInstOffset;
+ if (!MIB->isPseudo(Inst))
+ NextInstOffset += 4;
+
+ if (!mayNeedStub(BF.getBinaryContext(), Inst))
+ continue;
+
+ const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);
+
+ // Span of +/-128MB.
+ if (BitsAvailable == LongestJumpBits)
+ continue;
+
+ const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
+ BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
+ assert(TargetBB &&
+ "Basic block target expected for conditional branch.");
+
+ // Check if the relaxation is needed.
+ if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
+ isBlockInRange(Inst, InstAddress, *TargetBB))
+ continue;
+
+ relaxBranch(BB, Inst, InstAddress, TargetBB);
+
+ MayNeedRelaxation = true;
+ }
+ }
+
+ // We may have added new instructions, but the whole fragment is less than
+ // the minimum branch span.
+ if (FragmentSize < ShortestJumpSpan)
+ MayNeedRelaxation = false;
+
+ } while (MayNeedRelaxation);
+
+ LLVM_DEBUG({
+ if (NumIterations > 2) {
+ dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
+ << " of " << BF << " in " << NumIterations << " iterations\n";
+ }
+ });
+ }
+
+ // Add trampoline blocks from all fragments to the layout.
+ DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
+ Insertions;
+ for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
+ FunctionTrampolines) {
+ if (!Pair.second)
+ continue;
+ Insertions[Pair.first].emplace_back(std::move(Pair.second));
+ }
+
+ for (auto &Pair : Insertions) {
+ BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
+ /*UpdateLayout*/ true, /*UpdateCFI*/ true,
+ /*RecomputeLPs*/ false);
+ }
+}
+
Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+
+ if (opts::CompactCodeModel) {
+ BC.outs()
+ << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
+
+ ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+ relaxLocalBranches(BF);
+ };
+
+ ParallelUtilities::PredicateTy SkipPredicate =
+ [&](const BinaryFunction &BF) {
+ return !BC.shouldEmit(BF) || !BF.isSimple();
+ };
+
+ ParallelUtilities::runOnEachFunction(
+ BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+ SkipPredicate, "RelaxLocalBranches");
+
+ return Error::success();
+ }
+
BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
bool Modified;
diff --git a/bolt/test/AArch64/compact-code-model.s b/bolt/test/AArch64/compact-code-model.s
new file mode 100644
index 00000000000000..c8d8ac9131b45c
--- /dev/null
+++ b/bolt/test/AArch64/compact-code-model.s
@@ -0,0 +1,48 @@
+## Check that llvm-bolt successfully relaxes branches for compact (<128MB) code
+## model.
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN: --keep-nops --compact-code-model
+# RUN: llvm-objdump -d --disassemble-symbols=_start %t.bolt | FileCheck %s
+# RUN: llvm-nm -n %t.bolt | FileCheck %s --check-prefix=CHECK-NM
+
+## _start will be split and its main fragment will be separated from other
+## fragments by large_function() which is over 1MB.
+
+# CHECK-NM: _start
+# CHECK-NM-NEXT: large_function
+# CHECK-NM-NEXT: _start.cold
+
+ .text
+ .globl _start
+ .type _start, %function
+_start:
+ .cfi_startproc
+ cmp x1, 1
+ b.hi .L1
+# CHECK: b.hi
+# CHECK-NEXT: b
+# CHECK-NEXT: b
+
+ bl large_function
+.L1:
+ ret x30
+ .cfi_endproc
+.size _start, .-_start
+
+
+ .globl large_function
+ .type large_function, %function
+large_function:
+ .cfi_startproc
+ .rept 300000
+ nop
+ .endr
+ ret x30
+ .cfi_endproc
+.size large_function, .-large_function
+
+## Force relocation mode.
+ .reloc 0, R_AARCH64_NONE
>From 3cabeab3b4cc31b30f36d49df4eb9f8f7540184a Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at fb.com>
Date: Mon, 21 Oct 2024 12:34:49 -0700
Subject: [PATCH 2/5] fixup! [BOLT][AArch64] Add support for compact code model
---
bolt/lib/Passes/LongJmp.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 4ce2322ab4352c..279ff63faf11e0 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -641,9 +641,6 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
BinaryContext &BC = BF.getBinaryContext();
auto &MIB = BC.MIB;
- if (!BF.isSimple())
- return;
-
// Quick path.
if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
return;
@@ -667,7 +664,7 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
// Function fragments are relaxed independently.
for (FunctionFragment &FF : BF.getLayout().fragments()) {
// Fill out code size estimation for the fragment. Use output BB address
- // ranges to store offsets from the start of the function.
+ // ranges to store offsets from the start of the function fragment.
uint64_t CodeSize = 0;
for (BinaryBasicBlock *BB : FF) {
BB->setOutputStartAddress(CodeSize);
@@ -757,8 +754,9 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
}
- /// Relax the branch \p Inst. Return true if basic block offsets need an
- /// update after the trampoline insertion.
+ /// Relax the branch \p Inst in basic block \p BB that targets \p TargetBB.
+ /// \p InstAddress contains offset of the branch from the start of the
+ /// containing function fragment.
auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
BinaryFunction *BF = BB->getParent();
>From 4dc0221490c6ce812c85b31f148e1e3ad83171a2 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at fb.com>
Date: Tue, 22 Oct 2024 15:24:27 -0700
Subject: [PATCH 3/5] fixup! fixup! [BOLT][AArch64] Add support for compact
code model
---
bolt/test/AArch64/compact-code-model.s | 66 +++++++++++++++++++++-----
1 file changed, 55 insertions(+), 11 deletions(-)
diff --git a/bolt/test/AArch64/compact-code-model.s b/bolt/test/AArch64/compact-code-model.s
index c8d8ac9131b45c..0805302a885981 100644
--- a/bolt/test/AArch64/compact-code-model.s
+++ b/bolt/test/AArch64/compact-code-model.s
@@ -2,40 +2,84 @@
## model.
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
-# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN: llvm-bolt %t.exe -o %t.bolt --data %t.fdata --split-functions \
# RUN: --keep-nops --compact-code-model
-# RUN: llvm-objdump -d --disassemble-symbols=_start %t.bolt | FileCheck %s
-# RUN: llvm-nm -n %t.bolt | FileCheck %s --check-prefix=CHECK-NM
+# RUN: llvm-objdump -d \
+# RUN: --disassemble-symbols=_start,_start.cold.0,foo,foo.cold.0 %t.bolt \
+# RUN: | FileCheck %s
+# RUN: llvm-nm -nS %t.bolt | FileCheck %s --check-prefix=CHECK-NM
-## _start will be split and its main fragment will be separated from other
-## fragments by large_function() which is over 1MB.
+## Fragments of _start and foo will be separated by large_function, which is
+## over 1MB in size -- more than any conditional branch can span -- so branch
+## relaxation is required.
# CHECK-NM: _start
-# CHECK-NM-NEXT: large_function
-# CHECK-NM-NEXT: _start.cold
+# CHECK-NM: foo
+# CHECK-NM: 0000000000124f84 T large_function
+# CHECK-NM: _start.cold.0
+# CHECK-NM: foo.cold.0
.text
.globl _start
.type _start, %function
_start:
+# CHECK: <_start>:
+# FDATA: 0 [unknown] 0 1 _start 0 0 100
.cfi_startproc
- cmp x1, 1
- b.hi .L1
-# CHECK: b.hi
+ cmp x0, 1
+ b.eq .L0
+# CHECK: b.eq
# CHECK-NEXT: b
# CHECK-NEXT: b
bl large_function
-.L1:
+.L0:
ret x30
.cfi_endproc
.size _start, .-_start
+## Check that the long branch in foo() is reused during relaxation, i.e. we
+## should see just one branch to the cold fragment.
+
+ .globl foo
+ .type foo, %function
+foo:
+# CHECK: <foo>:
+# FDATA: 0 [unknown] 0 1 foo 0 0 100
+ .cfi_startproc
+ cmp x0, 0
+.T0:
+ b.eq .ERROR
+# CHECK: b {{.*}} <foo.cold.0>
+# CHECK-NOT: b {{.*}} <foo.cold.0>
+# FDATA: 1 foo #.T0# 1 foo #.T1# 0 100
+.T1:
+ bl large_function
+ cmp x0, 1
+.T2:
+ b.eq .ERROR
+# FDATA: 1 foo #.T2# 1 foo #.T3# 0 100
+.T3:
+ mov x1, x0
+ mov x0, 0
+ ret x30
+
+# CHECK: <foo.cold.0>:
+# CHECK-NEXT: mov x0, #0x1
+# CHECK-NEXT: ret
+.ERROR:
+ mov x0, 1
+ ret x30
+ .cfi_endproc
+.size foo, .-foo
.globl large_function
.type large_function, %function
large_function:
+# FDATA: 0 [unknown] 0 1 large_function 0 0 100
.cfi_startproc
.rept 300000
nop
>From 6641d9f967073a3e6dd0d0d54392f749414a7629 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at fb.com>
Date: Wed, 23 Oct 2024 15:25:45 -0700
Subject: [PATCH 4/5] fixup! fixup! fixup! [BOLT][AArch64] Add support for
compact code model
---
bolt/lib/Passes/LongJmp.cpp | 67 ++++++++++++++++++++++---------------
1 file changed, 40 insertions(+), 27 deletions(-)
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 279ff63faf11e0..274b33b54373be 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -683,11 +683,15 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
// Create a trampoline code after \p BB or at the end of the fragment if BB
- // is nullptr.
+ // is nullptr. If /p UpdateOffsets is true, update FragmentSize and offsets
+ // for basic blocks affected by the insertion of the trampoline.
auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
BinaryBasicBlock *TargetBB, uint64_t Count,
bool UpdateOffsets = true) {
- std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
+ FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+ BF.createBasicBlock());
+ BinaryBasicBlock *TrampolineBB = FunctionTrampolines.back().second.get();
+
MCInst Inst;
{
auto L = BC.scopeLock();
@@ -702,37 +706,46 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
TrampolineBB->setFragmentNum(FF.getFragmentNum());
- if (UpdateOffsets) {
- FragmentSize += TrampolineSize;
- for (BinaryBasicBlock *IBB : FF) {
- if (IBB->getOutputStartAddress() >= TrampolineAddress) {
- IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
- TrampolineSize);
- IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+ if (!FragmentTrampolines.lookup(TargetBB))
+ FragmentTrampolines[TargetBB] = TrampolineBB;
+
+ if (!UpdateOffsets)
+ return TrampolineBB;
+
+ FragmentSize += TrampolineSize;
+
+ // If the trampoline was added at the end of the fragment, offsets of
+ // other blocks in the fragment should stay intact.
+ if (!BB)
+ return TrampolineBB;
+
+ // Update offsets for blocks after BB.
+ for (BinaryBasicBlock *IBB : FF) {
+ if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+ IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
TrampolineSize);
- }
+ IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
}
- for (auto &Pair : FunctionTrampolines) {
- BinaryBasicBlock *IBB = Pair.second.get();
- if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
- continue;
- if (IBB == TrampolineBB.get())
- continue;
- if (IBB->getOutputStartAddress() >= TrampolineAddress) {
- IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
- TrampolineSize);
- IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+ }
+
+ // Update offsets for trampolines in this fragment that are placed after
+ // the new trampoline. Note that trampoline blocks are not part of the
+ // function/fragment layout until we add them right before the return
+ // from relaxLocalBranches().
+ for (auto &Pair : FunctionTrampolines) {
+ BinaryBasicBlock *IBB = Pair.second.get();
+ if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+ continue;
+ if (IBB == TrampolineBB)
+ continue;
+ if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+ IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
TrampolineSize);
- }
+ IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
}
}
- if (!FragmentTrampolines.lookup(TargetBB))
- FragmentTrampolines[TargetBB] = TrampolineBB.get();
- FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
- std::move(TrampolineBB));
-
- return FunctionTrampolines.back().second.get();
+ return TrampolineBB;
};
// Pre-populate trampolines by splitting unconditional branches from the
>From 96b858b3b3544ac318595a599ac13f8ff344aac3 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at fb.com>
Date: Tue, 29 Oct 2024 11:11:59 -0700
Subject: [PATCH 5/5] fixup! fixup! fixup! fixup! [BOLT][AArch64] Add support
for compact code model
---
bolt/lib/Passes/LongJmp.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 274b33b54373be..f54afc28d0da81 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -683,7 +683,7 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
// Create a trampoline code after \p BB or at the end of the fragment if BB
- // is nullptr. If /p UpdateOffsets is true, update FragmentSize and offsets
+ // is nullptr. If \p UpdateOffsets is true, update FragmentSize and offsets
// for basic blocks affected by the insertion of the trampoline.
auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
BinaryBasicBlock *TargetBB, uint64_t Count,
More information about the llvm-commits
mailing list