[lld] 491b82a - ELF: Add branch-to-branch optimization.
Peter Collingbourne via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 20 13:16:35 PDT 2025
Author: Peter Collingbourne
Date: 2025-06-20T13:16:24-07:00
New Revision: 491b82a5ec1add78d2c93370580a2f1897b6a364
URL: https://github.com/llvm/llvm-project/commit/491b82a5ec1add78d2c93370580a2f1897b6a364
DIFF: https://github.com/llvm/llvm-project/commit/491b82a5ec1add78d2c93370580a2f1897b6a364.diff
LOG: ELF: Add branch-to-branch optimization.
When code calls a function which then immediately tail calls another
function there is no need to go via the intermediate function. By
branching directly to the target function we reduce the program's working
set for a slight increase in runtime performance.
Normally it is relatively uncommon to have functions that just tail call
another function, but with LLVM control flow integrity we have jump tables
that replace the function itself as the canonical address. As a result,
when a function address is taken and called directly, for example after
a compiler optimization resolves the indirect call, or if code built
without control flow integrity calls the function, the call will go via
the jump table.
The impact of this optimization was measured using a large internal
Google benchmark. The results were as follows:
CFI enabled: +0.1% ± 0.05% queries per second
CFI disabled: +0.01% queries per second [not statistically significant]
The optimization is enabled by default at -O2 but may also be enabled
or disabled individually with --{,no-}branch-to-branch.
This optimization is implemented for AArch64 and X86_64 only.
lld's runtime performance (real execution time) after adding this
optimization was measured using firefox-x64 from lld-speed-test [1]
with ldflags "-O2 -S" on an Apple M2 Ultra. The results are as follows:
```
N Min Max Median Avg Stddev
x 512 1.2264546 1.3481076 1.2970261 1.2965788 0.018620888
+ 512 1.2561196 1.3839965 1.3214632 1.3209327 0.019443971
Difference at 95.0% confidence
0.0243538 +/- 0.00233202
1.87831% +/- 0.179859%
(Student's t, pooled s = 0.0190369)
```
[1] https://discourse.llvm.org/t/improving-the-reproducibility-of-linker-benchmarking/86057
Pull Request: https://github.com/llvm/llvm-project/pull/138366
Added:
lld/ELF/Arch/TargetImpl.h
lld/test/ELF/aarch64-branch-to-branch.s
lld/test/ELF/x86-64-branch-to-branch.s
Modified:
lld/ELF/Arch/AArch64.cpp
lld/ELF/Arch/X86_64.cpp
lld/ELF/Config.h
lld/ELF/Driver.cpp
lld/ELF/InputSection.cpp
lld/ELF/Options.td
lld/ELF/Relocations.cpp
lld/ELF/Target.h
lld/docs/ReleaseNotes.rst
lld/docs/ld.lld.1
Removed:
################################################################################
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 8a225ed103eef..f00c91b5886f3 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -11,6 +11,7 @@
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Target.h"
+#include "TargetImpl.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/Endian.h"
@@ -82,6 +83,7 @@ class AArch64 : public TargetInfo {
uint64_t val) const override;
RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override;
void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
+ void applyBranchToBranchOpt() const override;
private:
void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
@@ -974,6 +976,63 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
}
}
+static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
+ Relocation &r) {
+ // Identify a control transfer relocation for the branch-to-branch
+ // optimization. A "control transfer relocation" means a B or BL
+ // target but it also includes relative vtable relocations for example.
+ //
+ // We require the relocation type to be JUMP26, CALL26 or PLT32. With a
+ // relocation type of PLT32 the value may be assumed to be used for branching
+ // directly to the symbol and the addend is only used to produce the relocated
+ // value (hence the effective addend is always 0). This is because if a PLT is
+ // needed the addend will be added to the address of the PLT, and it doesn't
+ // make sense to branch into the middle of a PLT. For example, relative vtable
+ // relocations use PLT32 and 0 or a positive value as the addend but still are
+ // used to branch to the symbol.
+ //
+ // With JUMP26 or CALL26 the only reasonable interpretation of a non-zero
+ // addend is that we are branching to symbol+addend so that becomes the
+ // effective addend.
+ if (r.type == R_AARCH64_PLT32)
+ return 0;
+ if (r.type == R_AARCH64_JUMP26 || r.type == R_AARCH64_CALL26)
+ return r.addend;
+ return std::nullopt;
+}
+
+static std::pair<Relocation *, uint64_t>
+getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
+ auto *i =
+ std::partition_point(is.relocations.begin(), is.relocations.end(),
+ [&](Relocation &r) { return r.offset < offset; });
+ if (i != is.relocations.end() && i->offset == offset &&
+ i->type == R_AARCH64_JUMP26) {
+ return {i, i->addend};
+ }
+ return {nullptr, 0};
+}
+
+static void redirectControlTransferRelocations(Relocation &r1,
+ const Relocation &r2) {
+ r1.expr = r2.expr;
+ r1.sym = r2.sym;
+ // With PLT32 we must respect the original addend as that affects the value's
+ // interpretation. With the other relocation types the original addend is
+ // irrelevant because it referred to an offset within the original target
+ // section so we overwrite it.
+ if (r1.type == R_AARCH64_PLT32)
+ r1.addend += r2.addend;
+ else
+ r1.addend = r2.addend;
+}
+
+void AArch64::applyBranchToBranchOpt() const {
+ applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
+ getBranchInfoAtTarget,
+ redirectControlTransferRelocations);
+}
+
// AArch64 may use security features in variant PLT sequences. These are:
// Pointer Authentication (PAC), introduced in armv8.3-a and Branch Target
// Indicator (BTI) introduced in armv8.5-a. The additional instructions used
diff --git a/lld/ELF/Arch/TargetImpl.h b/lld/ELF/Arch/TargetImpl.h
new file mode 100644
index 0000000000000..f1206570d3e37
--- /dev/null
+++ b/lld/ELF/Arch/TargetImpl.h
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_ARCH_TARGETIMPL_H
+#define LLD_ELF_ARCH_TARGETIMPL_H
+
+#include "InputFiles.h"
+#include "InputSection.h"
+#include "Relocations.h"
+#include "Symbols.h"
+#include "llvm/BinaryFormat/ELF.h"
+
+namespace lld::elf {
+
+// getControlTransferAddend: If this relocation is used for control transfer
+// instructions (e.g. branch, branch-link or call) or code references (e.g.
+// virtual function pointers) and indicates an address-insignificant reference,
+// return the effective addend for the relocation, otherwise return
+// std::nullopt. The effective addend for a relocation is the addend that is
+// used to determine its branch destination.
+//
+// getBranchInfoAtTarget: If a control transfer relocation referring to
+// is+offset directly transfers control to a relocated branch instruction in the
+// specified section, return the relocation for the branch target as well as its
+// effective addend (see above). Otherwise return {nullptr, 0}.
+//
+// redirectControlTransferRelocations: Given r1, a relocation for which
+// getControlTransferAddend() returned a value, and r2, a relocation returned by
+// getBranchInfoAtTarget(), modify r1 so that it branches directly to the
+// target of r2.
+template <typename GetControlTransferAddend, typename GetBranchInfoAtTarget,
+ typename RedirectControlTransferRelocations>
+inline void applyBranchToBranchOptImpl(
+ Ctx &ctx, GetControlTransferAddend getControlTransferAddend,
+ GetBranchInfoAtTarget getBranchInfoAtTarget,
+ RedirectControlTransferRelocations redirectControlTransferRelocations) {
+ // Needs to run serially because it writes to the relocations array as well as
+ // reading relocations of other sections.
+ for (ELFFileBase *f : ctx.objectFiles) {
+ auto getRelocBranchInfo =
+ [&getBranchInfoAtTarget](
+ Relocation &r,
+ uint64_t addend) -> std::pair<Relocation *, uint64_t> {
+ auto *target = dyn_cast_or_null<Defined>(r.sym);
+ // We don't allow preemptible symbols or ifuncs (may go somewhere else),
+ // absolute symbols (runtime behavior unknown), non-executable or writable
+ // memory (ditto) or non-regular sections (no section data).
+ if (!target || target->isPreemptible || target->isGnuIFunc() ||
+ !target->section ||
+ !(target->section->flags & llvm::ELF::SHF_EXECINSTR) ||
+ (target->section->flags & llvm::ELF::SHF_WRITE) ||
+ target->section->kind() != SectionBase::Regular)
+ return {nullptr, 0};
+ return getBranchInfoAtTarget(*cast<InputSection>(target->section),
+ target->value + addend);
+ };
+ for (InputSectionBase *s : f->getSections()) {
+ if (!s)
+ continue;
+ for (Relocation &r : s->relocations) {
+ std::optional<uint64_t> addend =
+ getControlTransferAddend(*cast<InputSection>(s), r);
+ if (!addend)
+ continue;
+ std::pair<Relocation *, uint64_t> targetAndAddend =
+ getRelocBranchInfo(r, *addend);
+ if (!targetAndAddend.first)
+ continue;
+ // Avoid getting stuck in an infinite loop if we encounter a branch
+ // that (possibly indirectly) branches to itself. It is unlikely
+ // that more than 5 iterations will ever be needed in practice.
+ size_t iterations = 5;
+ while (iterations--) {
+ std::pair<Relocation *, uint64_t> nextTargetAndAddend =
+ getRelocBranchInfo(*targetAndAddend.first,
+ targetAndAddend.second);
+ if (!nextTargetAndAddend.first)
+ break;
+ targetAndAddend = nextTargetAndAddend;
+ }
+ redirectControlTransferRelocations(r, *targetAndAddend.first);
+ }
+ }
+ }
+}
+
+} // namespace lld::elf
+
+#endif
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index 974da4d96320a..b991b6f905b96 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -11,6 +11,7 @@
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Target.h"
+#include "TargetImpl.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MathExtras.h"
@@ -49,6 +50,7 @@ class X86_64 : public TargetInfo {
bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
InputSection *nextIS) const override;
bool relaxOnce(int pass) const override;
+ void applyBranchToBranchOpt() const override;
private:
void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
@@ -1161,6 +1163,73 @@ void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
}
}
+static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
+ Relocation &r) {
+ // Identify a control transfer relocation for the branch-to-branch
+ // optimization. A "control transfer relocation" usually means a CALL or JMP
+ // target but it also includes relative vtable relocations for example.
+ //
+ // We require the relocation type to be PLT32. With a relocation type of PLT32
+ // the value may be assumed to be used for branching directly to the symbol
+ // and the addend is only used to produce the relocated value (hence the
+ // effective addend is always 0). This is because if a PLT is needed the
+ // addend will be added to the address of the PLT, and it doesn't make sense
+ // to branch into the middle of a PLT. For example, relative vtable
+ // relocations use PLT32 and 0 or a positive value as the addend but still are
+ // used to branch to the symbol.
+ //
+ // STT_SECTION symbols are a special case on x86 because the LLVM assembler
+ // uses them for branches to local symbols which are assembled as referring to
+ // the section symbol with the addend equal to the symbol value - 4.
+ if (r.type == R_X86_64_PLT32) {
+ if (r.sym->isSection())
+ return r.addend + 4;
+ return 0;
+ }
+ return std::nullopt;
+}
+
+static std::pair<Relocation *, uint64_t>
+getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
+ auto content = is.contentMaybeDecompress();
+ if (content.size() > offset && content[offset] == 0xe9) { // JMP immediate
+ auto *i = std::partition_point(
+ is.relocations.begin(), is.relocations.end(),
+ [&](Relocation &r) { return r.offset < offset + 1; });
+ // Unlike with getControlTransferAddend() it is valid to accept a PC32
+ // relocation here because we know that this is actually a JMP and not some
+ // other reference, so the interpretation is that we add 4 to the addend and
+ // use that as the effective addend.
+ if (i != is.relocations.end() && i->offset == offset + 1 &&
+ (i->type == R_X86_64_PC32 || i->type == R_X86_64_PLT32)) {
+ return {i, i->addend + 4};
+ }
+ }
+ return {nullptr, 0};
+}
+
+static void redirectControlTransferRelocations(Relocation &r1,
+ const Relocation &r2) {
+ // The isSection() check handles the STT_SECTION case described above.
+ // In that case the original addend is irrelevant because it referred to an
+ // offset within the original target section so we overwrite it.
+ //
+ // The +4 is here to compensate for r2.addend which will likely be -4,
+ // but may also be addend-4 in case of a PC32 branch to symbol+addend.
+ if (r1.sym->isSection())
+ r1.addend = r2.addend;
+ else
+ r1.addend += r2.addend + 4;
+ r1.expr = r2.expr;
+ r1.sym = r2.sym;
+}
+
+void X86_64::applyBranchToBranchOpt() const {
+ applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
+ getBranchInfoAtTarget,
+ redirectControlTransferRelocations);
+}
+
// If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
// entries containing endbr64 instructions. A PLT entry will be split into two
// parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 2b72d54ba410d..88bda41d36487 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -302,6 +302,7 @@ struct Config {
bool bpFunctionOrderForCompression = false;
bool bpDataOrderForCompression = false;
bool bpVerboseSectionOrderer = false;
+ bool branchToBranch = false;
bool checkSections;
bool checkDynamicRelocs;
std::optional<llvm::DebugCompressionType> compressDebugSections;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7e132a387a04d..1e0b5988343a6 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1644,6 +1644,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
ctx.arg.zWxneeded = hasZOption(args, "wxneeded");
setUnresolvedSymbolPolicy(ctx, args);
ctx.arg.power10Stubs = args.getLastArgValue(OPT_power10_stubs_eq) != "no";
+ ctx.arg.branchToBranch = args.hasFlag(
+ OPT_branch_to_branch, OPT_no_branch_to_branch, ctx.arg.optimize >= 2);
if (opt::Arg *arg = args.getLastArg(OPT_eb, OPT_el)) {
if (arg->getOption().matches(OPT_eb))
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 0ce0f08d03874..f8786265029e8 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -430,8 +430,9 @@ InputSectionBase *InputSection::getRelocatedSection() const {
template <class ELFT, class RelTy>
void InputSection::copyRelocations(Ctx &ctx, uint8_t *buf) {
- if (ctx.arg.relax && !ctx.arg.relocatable &&
- (ctx.arg.emachine == EM_RISCV || ctx.arg.emachine == EM_LOONGARCH)) {
+ bool linkerRelax =
+ ctx.arg.relax && is_contained({EM_RISCV, EM_LOONGARCH}, ctx.arg.emachine);
+ if (!ctx.arg.relocatable && (linkerRelax || ctx.arg.branchToBranch)) {
// On LoongArch and RISC-V, relaxation might change relocations: copy
// from internal ones that are updated by relaxation.
InputSectionBase *sec = getRelocatedSection();
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index c795147eb9662..d7e3313167007 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -59,6 +59,10 @@ def build_id: J<"build-id=">, HelpText<"Generate build ID note">,
MetaVarName<"[fast,md5,sha1,uuid,0x<hexstring>]">;
def : F<"build-id">, Alias<build_id>, AliasArgs<["sha1"]>, HelpText<"Alias for --build-id=sha1">;
+defm branch_to_branch: BB<"branch-to-branch",
+ "Enable branch-to-branch optimization (default at -O2)",
+ "Disable branch-to-branch optimization (default at -O0 and -O1)">;
+
defm check_sections: B<"check-sections",
"Check section addresses for overlaps (default)",
"Do not check section addresses for overlaps">;
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 6c4209a2b81ed..43f19186f0981 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1665,9 +1665,10 @@ void RelocationScanner::scan(Relocs<RelTy> rels) {
}
// Sort relocations by offset for more efficient searching for
- // R_RISCV_PCREL_HI20 and R_PPC64_ADDR64.
+ // R_RISCV_PCREL_HI20, R_PPC64_ADDR64 and the branch-to-branch optimization.
if (ctx.arg.emachine == EM_RISCV ||
- (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc"))
+ (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc") ||
+ ctx.arg.branchToBranch)
llvm::stable_sort(sec->relocs(),
[](const Relocation &lhs, const Relocation &rhs) {
return lhs.offset < rhs.offset;
@@ -1958,6 +1959,9 @@ void elf::postScanRelocations(Ctx &ctx) {
for (ELFFileBase *file : ctx.objectFiles)
for (Symbol *sym : file->getLocalSymbols())
fn(*sym);
+
+ if (ctx.arg.branchToBranch)
+ ctx.target->applyBranchToBranchOpt();
}
static bool mergeCmp(const InputSection *a, const InputSection *b) {
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index fd1e5d33c438a..6dd20b2f0cbaa 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -101,6 +101,7 @@ class TargetInfo {
virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type,
JumpModType val) const {}
+ virtual void applyBranchToBranchOpt() const {}
virtual ~TargetInfo();
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 064ed0828c31f..dabfc961dd5ba 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -62,6 +62,10 @@ ELF Improvements
on executable sections.
(`#128883 <https://github.com/llvm/llvm-project/pull/128883>`_)
+* For AArch64 and X86_64, added ``--branch-to-branch``, which rewrites branches
+ that point to another branch instruction to instead branch directly to the
+ target of the second instruction. Enabled by default at ``-O2``.
+
Breaking changes
----------------
* Executable-only and readable-executable sections are now allowed to be placed
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index cfacdb081a807..7edc522b4f6a4 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -93,6 +93,11 @@ Bind default visibility defined STB_GLOBAL function symbols locally for
.Fl shared.
.It Fl -be8
Write a Big Endian ELF File using BE8 format(AArch32 only)
+.It Fl -branch-to-branch
+Enable the branch-to-branch optimizations: a branch whose target is
+another branch instruction is rewritten to point to the latter branch
+target (AArch64 and X86_64 only). Enabled by default at
+.Fl O2 Ns .
.It Fl -build-id Ns = Ns Ar value
Generate a build ID note.
.Ar value
@@ -414,7 +419,7 @@ If not specified,
.Dv a.out
is used as a default.
.It Fl O Ns Ar value
-Optimize output file size.
+Optimize output file.
.Ar value
may be:
.Pp
@@ -424,7 +429,7 @@ Disable string merging.
.It Cm 1
Enable string merging.
.It Cm 2
-Enable string tail merging.
+Enable string tail merging and branch-to-branch optimization.
.El
.Pp
.Fl O Ns Cm 1
diff --git a/lld/test/ELF/aarch64-branch-to-branch.s b/lld/test/ELF/aarch64-branch-to-branch.s
new file mode 100644
index 0000000000000..7dc485aef853b
--- /dev/null
+++ b/lld/test/ELF/aarch64-branch-to-branch.s
@@ -0,0 +1,82 @@
+# REQUIRES: aarch64
+
+## Test that the branch-to-branch optimization follows the links
+## from f1 -> f2 -> f3 and updates all references to point to f3.
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-pc-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t --branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+
+## Test that branch-to-branch is disabled by default.
+
+# RUN: ld.lld %t.o -o %t --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --no-branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+## Test that branch-to-branch is disabled for preemptible symbols.
+
+# RUN: ld.lld %t.o -o %t --branch-to-branch -shared --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+.section .rodata.vtable,"a"
+.globl vtable
+vtable:
+# B2B: Contents of section .rodata:
+# RELOC: RELOCATION RECORDS FOR [.rodata]:
+# RELOC-NEXT: OFFSET
+# B2B-NEXT: [[VF:[0-9a-f]{8}]]
+# B2B-RELOC-NEXT: R_AARCH64_PLT32 f3
+# NOB2B-RELOC-NEXT: R_AARCH64_PLT32 f1
.4byte f1@PLT - vtable
+# B2B-SAME: [[VF]]
+# B2B-RELOC-NEXT: R_AARCH64_PLT32 f3+0x4
+# NOB2B-RELOC-NEXT: R_AARCH64_PLT32 f2+0x4
.4byte f2@PLT - vtable
+# B2B-SAME: [[VF]]
+# RELOC-NEXT: R_AARCH64_PLT32 f3+0x8
.4byte f3@PLT - vtable
+
+.section .text._start,"ax"
+.globl _start
+# CHECK: <_start>:
+# RELOC: RELOCATION RECORDS FOR [.text]:
+# RELOC-NEXT: OFFSET
+_start:
+# B2B: bl {{.*}} <f3>
+# B2B-RELOC-NEXT: R_AARCH64_CALL26 f3
+# NOB2B: bl {{.*}} <f1{{.*}}>
+# NOB2B-RELOC-NEXT: R_AARCH64_CALL26 f1
+bl f1
+# B2B: b {{.*}} <f3>
+# B2B-RELOC-NEXT: R_AARCH64_JUMP26 f3
+# NOB2B: b {{.*}} <f2{{.*}}>
+# NOB2B-RELOC-NEXT: R_AARCH64_JUMP26 f2
+b f2
+
+.section .text.f1,"ax"
+.globl f1
+f1:
+# B2B-RELOC-NEXT: R_AARCH64_JUMP26 f3
+# NOB2B-RELOC-NEXT: R_AARCH64_JUMP26 f2
+b f2
+
+.section .text.f2,"ax"
+.globl f2
+# CHECK: <f2>:
+f2:
+# CHECK-NEXT: b {{.*}} <f3{{.*}}>
+# RELOC-NEXT: R_AARCH64_JUMP26 f3
+b f3
+
+.section .text.f3,"ax"
+.globl f3
+f3:
+ret
diff --git a/lld/test/ELF/x86-64-branch-to-branch.s b/lld/test/ELF/x86-64-branch-to-branch.s
new file mode 100644
index 0000000000000..dabf5be571ecc
--- /dev/null
+++ b/lld/test/ELF/x86-64-branch-to-branch.s
@@ -0,0 +1,133 @@
+# REQUIRES: x86
+
+## Test that the branch-to-branch optimization follows the links
+## from f1 -> f2 -> f3 and updates all references to point to f3.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t --branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+
+## Test that branch-to-branch is disabled by default.
+
+# RUN: ld.lld %t.o -o %t --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --no-branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+## Test that branch-to-branch is disabled for preemptible symbols.
+
+# RUN: ld.lld %t.o -o %t --branch-to-branch -shared --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+.section .rodata.vtable,"a"
+.globl vtable
+vtable:
+# B2B: Contents of section .rodata:
+# RELOC: RELOCATION RECORDS FOR [.rodata]:
+# RELOC-NEXT: OFFSET
+# B2B-NEXT: [[VF:[0-9a-f]{8}]]
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1
.4byte f1@PLT - vtable
+# B2B-SAME: [[VF]]
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3+0x4
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2+0x4
.4byte f2@PLT - vtable
+# B2B-SAME: [[VF]]
+# RELOC-NEXT: R_X86_64_PLT32 f3+0x8
.4byte f3@PLT - vtable
+
+# For .rodata.f6
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+
+.section .text._start,"ax"
+.globl _start
+# CHECK: <_start>:
+# RELOC: RELOCATION RECORDS FOR [.text]:
+# RELOC-NEXT: OFFSET
+_start:
+# B2B-NEXT: jmp {{.*}} <f3>
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+# NOB2B-NEXT: jmp {{.*}} <f1{{.*}}>
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1-0x4
+jmp f1
+# B2B-NEXT: jmp {{.*}} <f3>
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+# NOB2B-NEXT: jmp {{.*}} <f2{{.*}}>
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
+jmp f2
+# This will assemble to a relocation pointing to an STT_SECTION for .text.f4
+# with an addend, which looks similar to the relative vtable cases above but
+# requires different handling of the addend so that we don't think this is
+# branching to the `jmp f3` at the start of the target section.
+# CHECK-NEXT: jmp {{.*}} <f4{{.*}}>
+# RELOC-NEXT: R_X86_64_PLT32 .text+0x2e
+jmp f4
+# B2B-NEXT: jmp 0x[[IPLT:[0-9a-f]*]]
+# RELOC-NEXT: R_X86_64_PLT32 f5-0x4
+jmp f5
+# B2B-NEXT: jmp {{.*}} <f6>
+# RELOC-NEXT: R_X86_64_PLT32 f6-0x4
+jmp f6
+# B2B-NEXT: jmp {{.*}} <f7>
+# RELOC-NEXT: R_X86_64_PLT32 f7-0x4
+jmp f7
+
+.section .text.f1,"ax"
+.globl f1
+f1:
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
+jmp f2
+
+.section .text.f2,"ax"
+.globl f2
+# CHECK: <f2>:
+f2:
+# CHECK-NEXT: jmp {{.*}} <f3{{.*}}>
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+.section .text.f3,"ax"
+.globl f3
+f3:
+# Test that a self-branch doesn't trigger an infinite loop.
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+.section .text.f4,"ax"
+jmp f3
+f4:
+ret
+
+.section .text.f5,"ax"
+.type f5, @gnu_indirect_function
+.globl f5
+f5:
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+.section .rodata.f6,"a"
+.globl f6
+f6:
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+# RELOC: RELOCATION RECORDS FOR [.wtext.f7]:
+# RELOC-NEXT: OFFSET
+
+.section .wtext.f7,"awx"
+.globl f7
+f7:
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+# B2B: <.iplt>:
+# B2B-NEXT: [[IPLT]]:
More information about the llvm-commits
mailing list