[lld] [lld][ELF] Add range extension thunks for x86-64 (PR #180266)

Farid Zakaria via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 10 21:16:59 PST 2026


https://github.com/fzakaria updated https://github.com/llvm/llvm-project/pull/180266

>From b980fac1c46c27a86af634b18a31ff17120fe453 Mon Sep 17 00:00:00 2001
From: Farid Zakaria <fmzakari at fb.com>
Date: Tue, 10 Feb 2026 21:12:32 -0800
Subject: [PATCH] [lld][ELF] Add range extension thunks for x86-64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
This patch adds support for range extension thunks on x86-64, enabling LLD to link very large binaries where the .text section exceeds 2 GiB.

On x86-64, branch instructions (call/jmp) use RIP-relative addressing with a 32-bit signed displacement, limiting their range to ±2 GiB. When a branch target is beyond this range, the linker now inserts a thunk that the branch can reach, and the thunk then jumps to the final destination.

Two thunk types are implemented:
- **Short thunk** (5 bytes): Uses `jmp rel32` when the thunk is within 2 GiB of the target
- **Long thunk** (23 bytes): Uses a position-independent sequence (`movabs $offset, %r11; lea (%rip), %r10; add %r10, %r11; jmp *%r11`) to reach any 64-bit address

The implementation:
1. Sets `needsThunks = true` for x86-64 in the target configuration
2. Implements `needsThunk()` to detect R_X86_64_PLT32 relocations that overflow
3. Implements `inBranchRange()` to check 32-bit signed displacement limits
4. Sets `getThunkSectionSpacing()` to 1 GiB, ensuring thunk sections are placed at regular intervals so any call site can reach a thunk
5. Adds `X86_64LongThunk` class that generates the appropriate thunk code

The long thunk uses r11 and r10 as scratch registers since they are caller-saved and not used for parameter passing in the SysV ABI.

Co-authored-by: Grigory Pastukhov <gpastukhov at meta.com>

Test Plan:
Added `lld/test/ELF/x86-64-thunks-*.s` tests, which use linker scripts to place code at specific addresses and cover:
1. **No thunk**: Direct call to a nearby function (within 2 GiB range)
2. **Short thunk**: An out-of-range caller reaching a thunk that is itself within 2 GiB of the target, so the thunk uses `jmp rel32`
3. **Long thunk**: Caller and target more than 2 GiB apart from any shared thunk location, so the thunk uses the `movabs` + `lea` + `add` + `jmp *%r11` sequence
Additional tests cover jmp (tail-call) relocations, local (STT_SECTION) symbols, PIC/-pie output, preemptible symbols (thunk targets the PLT), `-z retpolineplt` interaction, ICF interaction, and weak symbols.

```
ninja lld && ./bin/llvm-lit -v lld/test/ELF/x86-64-thunks-*.s
```
---
 lld/ELF/Arch/X86_64.cpp                  |  85 ++++++++---
 lld/ELF/Relocations.cpp                  |  13 +-
 lld/ELF/Thunks.cpp                       | 174 +++++++++++++++++++++--
 lld/ELF/Writer.cpp                       |   7 +-
 lld/test/ELF/x86-64-thunks-icf.s         |  73 ++++++++++
 lld/test/ELF/x86-64-thunks-jmp.s         |  35 +++++
 lld/test/ELF/x86-64-thunks-local.s       |  72 ++++++++++
 lld/test/ELF/x86-64-thunks-long.s        |  44 ++++++
 lld/test/ELF/x86-64-thunks-pic.s         |  49 +++++++
 lld/test/ELF/x86-64-thunks-preemptible.s |  48 +++++++
 lld/test/ELF/x86-64-thunks-retpoline.s   |  47 ++++++
 lld/test/ELF/x86-64-thunks-short.s       |  42 ++++++
 lld/test/ELF/x86-64-thunks-weak.s        |  31 ++++
 13 files changed, 688 insertions(+), 32 deletions(-)
 create mode 100644 lld/test/ELF/x86-64-thunks-icf.s
 create mode 100644 lld/test/ELF/x86-64-thunks-jmp.s
 create mode 100644 lld/test/ELF/x86-64-thunks-local.s
 create mode 100644 lld/test/ELF/x86-64-thunks-long.s
 create mode 100644 lld/test/ELF/x86-64-thunks-pic.s
 create mode 100644 lld/test/ELF/x86-64-thunks-preemptible.s
 create mode 100644 lld/test/ELF/x86-64-thunks-retpoline.s
 create mode 100644 lld/test/ELF/x86-64-thunks-short.s
 create mode 100644 lld/test/ELF/x86-64-thunks-weak.s

diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index 9083b5b9ff250..eea6cc0aa4718 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -51,6 +51,11 @@ class X86_64 : public TargetInfo {
                              InputSection *nextIS) const override;
   bool relaxOnce(int pass) const override;
   void applyBranchToBranchOpt() const override;
+  bool needsThunk(RelExpr expr, RelType type, const InputFile *file,
+                  uint64_t branchAddr, const Symbol &s,
+                  int64_t a) const override;
+  bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override;
+  uint32_t getThunkSectionSpacing() const override;
 
 private:
   void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
@@ -93,6 +98,11 @@ X86_64::X86_64(Ctx &ctx) : TargetInfo(ctx) {
   trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3
   nopInstrs = nopInstructions;
 
+  // Enable thunks for x86-64 to support binaries where .text exceeds 2GiB.
+  // This is needed when RIP-relative branches (R_X86_64_PLT32 or R_X86_64_PC32)
+  // overflow.
+  needsThunks = true;
+
   // Align to the large page size (known as a superpage or huge page).
   // FreeBSD automatically promotes large, superpage-aligned allocations.
   defaultImageBase = 0x200000;
@@ -443,9 +453,9 @@ void X86_64::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
 
 void X86_64::writePltHeader(uint8_t *buf) const {
   const uint8_t pltData[] = {
-      0xff, 0x35, 0, 0, 0, 0, // pushq GOTPLT+8(%rip)
-      0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip)
-      0x0f, 0x1f, 0x40, 0x00, // nop
+      0xff, 0x35, 0,    0,    0, 0, // pushq GOTPLT+8(%rip)
+      0xff, 0x25, 0,    0,    0, 0, // jmp *GOTPLT+16(%rip)
+      0x0f, 0x1f, 0x40, 0x00,       // nop
   };
   memcpy(buf, pltData, sizeof(pltData));
   uint64_t gotPlt = ctx.in.gotPlt->getVA();
@@ -458,8 +468,8 @@ void X86_64::writePlt(uint8_t *buf, const Symbol &sym,
                       uint64_t pltEntryAddr) const {
   const uint8_t inst[] = {
       0xff, 0x25, 0, 0, 0, 0, // jmpq *got(%rip)
-      0x68, 0, 0, 0, 0,       // pushq <relocation index>
-      0xe9, 0, 0, 0, 0,       // jmpq plt[0]
+      0x68, 0,    0, 0, 0,    // pushq <relocation index>
+      0xe9, 0,    0, 0, 0,    // jmpq plt[0]
   };
   memcpy(buf, inst, sizeof(inst));
 
@@ -1320,12 +1330,12 @@ void Retpoline::writeGotPlt(uint8_t *buf, const Symbol &s) const {
 
 void Retpoline::writePltHeader(uint8_t *buf) const {
   const uint8_t insn[] = {
-      0xff, 0x35, 0,    0,    0,    0,          // 0:    pushq GOTPLT+8(%rip)
-      0x4c, 0x8b, 0x1d, 0,    0,    0,    0,    // 6:    mov GOTPLT+16(%rip), %r11
-      0xe8, 0x0e, 0x00, 0x00, 0x00,             // d:    callq next
-      0xf3, 0x90,                               // 12: loop: pause
-      0x0f, 0xae, 0xe8,                         // 14:   lfence
-      0xeb, 0xf9,                               // 17:   jmp loop
+      0xff, 0x35, 0,    0,    0,    0,       // 0:    pushq GOTPLT+8(%rip)
+      0x4c, 0x8b, 0x1d, 0,    0,    0,    0, // 6:    mov GOTPLT+16(%rip), %r11
+      0xe8, 0x0e, 0x00, 0x00, 0x00,          // d:    callq next
+      0xf3, 0x90,                            // 12: loop: pause
+      0x0f, 0xae, 0xe8,                      // 14:   lfence
+      0xeb, 0xf9,                            // 17:   jmp loop
       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19:   int3; .align 16
       0x4c, 0x89, 0x1c, 0x24,                   // 20: next: mov %r11, (%rsp)
       0xc3,                                     // 24:   ret
@@ -1343,12 +1353,12 @@ void Retpoline::writePltHeader(uint8_t *buf) const {
 void Retpoline::writePlt(uint8_t *buf, const Symbol &sym,
                          uint64_t pltEntryAddr) const {
   const uint8_t insn[] = {
-      0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 0:  mov foo at GOTPLT(%rip), %r11
-      0xe8, 0,    0,    0,    0,    // 7:  callq plt+0x20
-      0xe9, 0,    0,    0,    0,    // c:  jmp plt+0x12
-      0x68, 0,    0,    0,    0,    // 11: pushq <relocation index>
-      0xe9, 0,    0,    0,    0,    // 16: jmp plt+0
-      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1b: int3; padding
+      0x4c, 0x8b, 0x1d, 0,    0,    0, 0, // 0:  mov foo at GOTPLT(%rip), %r11
+      0xe8, 0,    0,    0,    0,          // 7:  callq plt+0x20
+      0xe9, 0,    0,    0,    0,          // c:  jmp plt+0x12
+      0x68, 0,    0,    0,    0,          // 11: pushq <relocation index>
+      0xe9, 0,    0,    0,    0,          // 16: jmp plt+0
+      0xcc, 0xcc, 0xcc, 0xcc, 0xcc,       // 1b: int3; padding
   };
   memcpy(buf, insn, sizeof(insn));
 
@@ -1396,6 +1406,47 @@ void RetpolineZNow::writePlt(uint8_t *buf, const Symbol &sym,
   write32le(buf + 8, ctx.in.plt->getVA() - pltEntryAddr - 12);
 }
 
+// For x86-64, thunks are needed when the displacement between the branch
+// instruction and its target exceeds the 32-bit signed range (2GiB).
+// This can happen in very large binaries where .text exceeds 2GiB.
+bool X86_64::needsThunk(RelExpr expr, RelType type, const InputFile *file,
+                        uint64_t branchAddr, const Symbol &s, int64_t a) const {
+  // Only PLT32 branch relocations need thunks.
+  // R_X86_64_PLT32 is used for call/jmp by modern compilers.
+  if (type != R_X86_64_PLT32)
+    return false;
+
+  // If the target requires a PLT entry, check if we can reach the PLT
+  if (s.isInPlt(ctx)) {
+    uint64_t dst = s.getPltVA(ctx) + a;
+    return !inBranchRange(type, branchAddr, dst);
+  }
+
+  // For direct calls/jumps, check if we can reach the destination
+  uint64_t dst = s.getVA(ctx, a);
+  return !inBranchRange(type, branchAddr, dst);
+}
+
+// Check if a branch from src to dst is within the 32-bit signed range.
+bool X86_64::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
+  // x86-64 RIP-relative branches use a 32-bit signed displacement.
+  // The displacement is relative to the address after the instruction,
+  // which is typically 4-5 bytes after the relocation location.
+  // We use a conservative range check here.
+  int64_t offset = dst - src;
+  return llvm::isInt<32>(offset);
+}
+
+// Return the spacing for thunk sections. We want thunks to be placed
+// at intervals such that all branches can reach either the target or
+// a thunk. With a 2GiB range, we place thunks every ~1GiB to allow
+// branches to reach in either direction.
+uint32_t X86_64::getThunkSectionSpacing() const {
+  // 1GiB spacing - gives us 1GiB forward and 1GiB backward range
+  // from any point, which covers the 2GiB total range.
+  return 0x40000000;
+}
+
 void elf::setX86_64TargetInfo(Ctx &ctx) {
   if (ctx.arg.zRetpolineplt) {
     if (ctx.arg.zNow)
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 9ea5758eea8c2..eb917dae1b1d2 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1845,6 +1845,11 @@ static int64_t getPCBias(Ctx &ctx, const InputSection &isec,
   }
   if (ctx.arg.emachine == EM_HEXAGON)
     return -getHexagonPacketOffset(isec, rel);
+  // x86-64 R_X86_64_PLT32 encodes a -4 addend for the 4-byte displacement
+  // field. Report this as a PC bias so that after thunk redirection the
+  // call-to-thunk relocation carries the correct addend (-4).
+  if (ctx.arg.emachine == EM_X86_64)
+    return 4;
   return 0;
 }
 
@@ -2179,9 +2184,11 @@ bool ThunkCreator::createThunks(uint32_t pass,
             rel.sym = t->getThunkTargetSym();
             rel.expr = fromPlt(rel.expr);
 
-            // On AArch64 and PPC, a jump/call relocation may be encoded as
-            // STT_SECTION + non-zero addend, clear the addend after
-            // redirection.
+            // A jump/call relocation may be encoded as STT_SECTION +
+            // non-zero addend. After redirecting to a thunk, reset the addend
+            // to just the PC bias (negated) so the call-to-thunk relocation
+            // is correct. getPCBias() returns the architecture-specific bias:
+            //   ARM: 4 or 8, Hexagon: packet offset, x86-64: 4.
             if (ctx.arg.emachine != EM_MIPS)
               rel.addend = -getPCBias(ctx, *isec, rel);
           }
diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp
index 65d0f094c43c3..c37eea1a06726 100644
--- a/lld/ELF/Thunks.cpp
+++ b/lld/ELF/Thunks.cpp
@@ -13,10 +13,10 @@
 // such as MIPS PIC and non-PIC or ARM non-Thumb and Thumb functions.
 //
 // If a jump target is too far and its address doesn't fit to a
-// short jump instruction, we need to create a thunk too, but we
-// haven't supported it yet.
+// short jump instruction, we need to create a thunk too.
 //
-// i386 and x86-64 don't need thunks.
+// For x86-64, thunks are needed when the .text section exceeds 2GiB
+// and RIP-relative branches (R_X86_64_PLT32, R_X86_64_PC32) overflow.
 //
 //===---------------------------------------------------------------------===//
 
@@ -41,6 +41,8 @@ using namespace llvm::object;
 using namespace llvm::ELF;
 using namespace lld;
 using namespace lld::elf;
+using llvm::support::endian::write32le;
+using llvm::support::endian::write64le;
 
 namespace {
 
@@ -474,7 +476,8 @@ class PPC32PltCallStub final : public Thunk {
   uint32_t size() override { return 16; }
   void writeTo(uint8_t *buf) override;
   void addSymbols(ThunkSection &isec) override;
-  bool isCompatibleWith(const InputSection &isec, const Relocation &rel) const override;
+  bool isCompatibleWith(const InputSection &isec,
+                        const Relocation &rel) const override;
 
 private:
   // Records the call site of the call stub.
@@ -613,6 +616,51 @@ class PPC64PDLongBranchThunk final : public PPC64LongBranchThunk {
   }
 };
 
+// Base class for x86-64 thunks.
+//
+// An x86-64 thunk may be either short or long. A short thunk is simply a
+// jmp rel32 instruction (5 bytes), and it may be used when the distance from
+// the thunk to the target is less than 2GiB. Long thunks can branch to any
+// 64-bit virtual address and are implemented in derived classes. This class
+// tries to create a short thunk if the target is in range, otherwise it
+// creates a long thunk.
+class X86_64Thunk : public Thunk {
+public:
+  X86_64Thunk(Ctx &ctx, Symbol &dest, int64_t addend)
+      : Thunk(ctx, dest, addend) {}
+  bool getMayUseShortThunk();
+  void writeTo(uint8_t *buf) override;
+
+private:
+  bool mayUseShortThunk = true;
+  virtual void writeLong(uint8_t *buf) = 0;
+};
+
+// x86-64 long range thunk using PC-relative offset.
+//
+// The long thunk uses a position-independent sequence:
+//   movabsq $offset, %r11  # 49 BB xx xx xx xx xx xx xx xx (10 bytes)
+//   leaq (%rip), %r10      # 4C 8D 15 00 00 00 00         (7 bytes)
+//   addq %r10, %r11        # 4D 01 D3                     (3 bytes)
+//   jmp *%r11              # 41 FF E3                     (3 bytes)
+//
+// Total size: 23 bytes. The offset is computed as:
+//   target - (thunk_address + 17)
+// where 17 = sizeof(movabsq) + sizeof(leaq) = 10 + 7, which is the
+// RIP value captured by leaq (the address of the addq instruction).
+// Using r11 and r10 as they are caller-saved registers that are not
+// used for parameter passing in the SysV ABI.
+class X86_64LongThunk final : public X86_64Thunk {
+public:
+  X86_64LongThunk(Ctx &ctx, Symbol &dest, int64_t addend)
+      : X86_64Thunk(ctx, dest, addend) {}
+  uint32_t size() override { return getMayUseShortThunk() ? 5 : 23; }
+  void addSymbols(ThunkSection &isec) override;
+
+private:
+  void writeLong(uint8_t *buf) override;
+};
+
 } // end anonymous namespace
 
 Defined *Thunk::addSymbol(StringRef name, uint8_t type, uint64_t value,
@@ -666,10 +714,10 @@ bool AArch64Thunk::needsSyntheticLandingPad() {
 // AArch64 long range Thunks.
 void AArch64ABSLongThunk::writeLong(uint8_t *buf) {
   const uint8_t data[] = {
-    0x50, 0x00, 0x00, 0x58, //     ldr x16, L0
-    0x00, 0x02, 0x1f, 0xd6, //     br  x16
-    0x00, 0x00, 0x00, 0x00, // L0: .xword S
-    0x00, 0x00, 0x00, 0x00,
+      0x50, 0x00, 0x00, 0x58, //     ldr x16, L0
+      0x00, 0x02, 0x1f, 0xd6, //     br  x16
+      0x00, 0x00, 0x00, 0x00, // L0: .xword S
+      0x00, 0x00, 0x00, 0x00,
   };
   // If mayNeedLandingPad is true then destination is an
   // AArch64BTILandingPadThunk that defines landingPad.
@@ -893,7 +941,8 @@ bool ThumbThunk::isCompatibleWith(const InputSection &isec,
     return false;
 
   // ARM branch relocations can't use BLX
-  return rel.type != R_ARM_JUMP24 && rel.type != R_ARM_PC24 && rel.type != R_ARM_PLT32;
+  return rel.type != R_ARM_JUMP24 && rel.type != R_ARM_PC24 &&
+         rel.type != R_ARM_PLT32;
 }
 
 void ARMV7ABSLongThunk::writeLong(uint8_t *buf) {
@@ -1234,6 +1283,97 @@ void ThumbV4PILongThunk::addLongMapSyms() {
   addSymbol("$d", STT_NOTYPE, 16, *tsec);
 }
 
+// x86-64 Thunk base class.
+// For x86-64, the thunk's addend comes from the original R_X86_64_PLT32
+// relocation. That addend includes a -4 PC-relative compensation (since the
+// CPU adds 4 for the displacement field size during PC-relative addressing).
+// When computing the thunk's jump target, we must add back this +4 to get
+// the actual destination address. For example, a call to a local symbol at
+// section+0x100 produces addend = 0x100 - 4 = 0xFC, so the thunk target
+// should be section.getVA(0xFC + 4) = section + 0x100.
+static uint64_t getX86_64ThunkDestVA(Ctx &ctx, const Symbol &s, int64_t a) {
+  if (s.isInPlt(ctx))
+    return s.getPltVA(ctx);
+  // Add 4 to undo the -4 PC-relative compensation in the addend.
+  return s.getVA(ctx, a + 4);
+}
+
+bool X86_64Thunk::getMayUseShortThunk() {
+  if (!mayUseShortThunk)
+    return false;
+  uint64_t s = getX86_64ThunkDestVA(ctx, destination, addend);
+  uint64_t p = getThunkTargetSym()->getVA(ctx);
+  // The jmp rel32 instruction is 5 bytes, so we check (target - (thunk + 5)).
+  mayUseShortThunk = llvm::isInt<32>(s - p - 5);
+  return mayUseShortThunk;
+}
+
+void X86_64Thunk::writeTo(uint8_t *buf) {
+  if (!getMayUseShortThunk()) {
+    writeLong(buf);
+    return;
+  }
+  // Short thunk: jmp rel32 (5 bytes)
+  uint64_t s = getX86_64ThunkDestVA(ctx, destination, addend);
+  uint64_t p = getThunkTargetSym()->getVA(ctx);
+  buf[0] = 0xe9; // jmp rel32
+  write32le(buf + 1, static_cast<uint32_t>(s - p - 5));
+}
+
+// x86-64 long range thunk implementation.
+// Uses a position-independent RIP-relative offset sequence:
+//   movabsq $offset, %r11  ; 49 BB xx xx xx xx xx xx xx xx  (10 bytes)
+//   leaq (%rip), %r10      ; 4C 8D 15 00 00 00 00          (7 bytes)
+//   addq %r10, %r11        ; 4D 01 D3                      (3 bytes)
+//   jmp *%r11              ; 41 FF E3                      (3 bytes)
+//
+// The leaq captures the RIP (address of the next instruction, i.e. addq),
+// so offset = target - (thunk_address + 10 + 7) = target - (thunk_address + 17).
+void X86_64LongThunk::writeLong(uint8_t *buf) {
+  // movabsq $offset, %r11
+  buf[0] = 0x49;
+  buf[1] = 0xbb;
+
+  uint64_t target = getX86_64ThunkDestVA(ctx, destination, addend);
+  uint64_t thunkAddr = getThunkTargetSym()->getVA(ctx);
+  // RIP after leaq points to the addq instruction at thunkAddr + 17.
+  uint64_t offset = target - (thunkAddr + 17);
+  write64le(buf + 2, offset);
+
+  // leaq (%rip), %r10  ; RIP-relative with 0 displacement
+  buf[10] = 0x4c; // REX.WR prefix (W=1, R=1 for r10)
+  buf[11] = 0x8d; // lea
+  buf[12] = 0x15; // ModRM: mod=00, reg=2 (r10), rm=5 (RIP-relative)
+  write32le(buf + 13, 0); // disp32 = 0
+
+  // addq %r10, %r11
+  buf[17] = 0x4d; // REX.WRB prefix (W=1, R=1 for r10, B=1 for r11)
+  buf[18] = 0x01; // add r/m64, r64
+  buf[19] = 0xd3; // ModRM: mod=11, reg=2 (r10), rm=3 (r11)
+
+  // jmp *%r11
+  buf[20] = 0x41; // REX.B prefix
+  buf[21] = 0xff; // jmp r/m64
+  buf[22] = 0xe3; // ModRM: mod=11, reg=4 (jmp), rm=3 (r11)
+}
+
+void X86_64LongThunk::addSymbols(ThunkSection &isec) {
+  StringRef name = destination.getName();
+  // When the destination is a STT_SECTION symbol (e.g. from a relocation
+  // against a local symbol), the name may be empty. Include the addend in
+  // the thunk name to disambiguate thunks targeting different offsets within
+  // the same section. We add 4 to display the actual offset (undoing the -4
+  // PC-relative compensation baked into x86-64 R_X86_64_PLT32 addends).
+  if (name.empty() || destination.isSection()) {
+    addSymbol(ctx.saver.save("__X86_64LongThunk_" + name + "_" +
+                             llvm::utohexstr(addend + 4)),
+              STT_FUNC, 0, isec);
+  } else {
+    addSymbol(ctx.saver.save("__X86_64LongThunk_" + name), STT_FUNC, 0,
+              isec);
+  }
+}
+
 // Use the long jump which covers a range up to 8MiB.
 void AVRThunk::writeTo(uint8_t *buf) {
   write32(ctx, buf, 0x940c); // jmp func
@@ -1810,6 +1950,16 @@ static std::unique_ptr<Thunk> addThunkPPC64(Ctx &ctx, RelType type, Symbol &s,
   return std::make_unique<PPC64PDLongBranchThunk>(ctx, s, a);
 }
 
+static std::unique_ptr<Thunk> addThunkX86_64(Ctx &ctx, const InputSection &isec,
+                                             RelType type, Symbol &s,
+                                             int64_t a) {
+  // For x86-64, thunks are needed when calls/jumps exceed the 2GiB range
+  // of RIP-relative addressing.
+  assert((type == R_X86_64_PLT32 || type == R_X86_64_PC32) &&
+         "unexpected relocation type for x86-64 thunk");
+  return std::make_unique<X86_64LongThunk>(ctx, s, a);
+}
+
 std::unique_ptr<Thunk> elf::addThunk(Ctx &ctx, const InputSection &isec,
                                      Relocation &rel) {
   Symbol &s = *rel.sym;
@@ -1830,9 +1980,11 @@ std::unique_ptr<Thunk> elf::addThunk(Ctx &ctx, const InputSection &isec,
     return addThunkPPC64(ctx, rel.type, s, a);
   case EM_HEXAGON:
     return addThunkHexagon(ctx, isec, rel, s);
+  case EM_X86_64:
+    return addThunkX86_64(ctx, isec, rel.type, s, a);
   default:
-    llvm_unreachable(
-        "add Thunk only supported for ARM, AVR, Hexagon, Mips and PowerPC");
+    llvm_unreachable("add Thunk only supported for ARM, AVR, Hexagon, Mips, "
+                     "PowerPC, and x86-64");
   }
 }
 
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 9220d73559b0b..4131ea1883d29 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1546,7 +1546,12 @@ template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
   while (!ctx.arg.relocatable) {
     bool changed = ctx.target->needsThunks
                        ? tc.createThunks(pass, ctx.outputSections)
-                       : ctx.target->relaxOnce(pass);
+                       : false;
+    // x86-64 needs both thunks (for range extension) and relaxOnce (for
+    // reverting GOT relaxations when addresses overflow 32 bits). Other
+    // architectures either use thunks only or relaxOnce only, and the
+    // default relaxOnce returns false.
+    changed |= ctx.target->relaxOnce(pass);
     bool spilled = ctx.script->spillSections();
     changed |= spilled;
     ++pass;
diff --git a/lld/test/ELF/x86-64-thunks-icf.s b/lld/test/ELF/x86-64-thunks-icf.s
new file mode 100644
index 0000000000000..2abb1faf83c7b
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-icf.s
@@ -0,0 +1,73 @@
+# REQUIRES: x86
+# Test that --icf=all interacts correctly with thunks. Two identical
+# functions (foo, bar) are in the same region and get folded by ICF.
+# A far-away caller (_start) references both; after folding, both
+# resolve to foo and the thunk to foo is reused. foo itself calls
+# target without needing a thunk (both are nearby).
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/main.s -o %t/main.o
+# RUN: ld.lld --icf=all --print-icf-sections -T %t/script.lds %t/main.o -o %t/out 2>&1 | FileCheck --check-prefix=ICF %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/out | FileCheck %s
+
+# ICF: selected section {{.*}}:(.text.foo)
+# ICF:   removing identical section {{.*}}:(.text.bar)
+
+## foo calls target directly (no thunk, both near 0x10000).
+# CHECK:      <foo>:
+# CHECK-NEXT:   callq {{.*}} <target>
+# CHECK-NEXT:   retq
+
+# CHECK:      <target>:
+# CHECK-NEXT:   retq
+
+## _start is far away; both calls resolve to foo after ICF folding.
+## A single thunk is reused for both calls.
+# CHECK:      <_start>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_foo>
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_foo>
+# CHECK-NEXT:   retq
+
+# CHECK:      <__X86_64LongThunk_foo>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+#--- main.s
+.section .text.foo,"ax", at progbits
+.globl foo
+.type foo, @function
+foo:
+  call target
+  ret
+
+.section .text.bar,"ax", at progbits
+.globl bar
+.type bar, @function
+bar:
+  call target
+  ret
+
+.section .text.target,"ax", at progbits
+.globl target
+.type target, @function
+target:
+  ret
+
+.section .text._start,"ax", at progbits
+.globl _start
+.type _start, @function
+_start:
+  call foo
+  call bar
+  ret
+
+#--- script.lds
+SECTIONS {
+    . = 0x10000;
+    .text : { *(.text.foo) *(.text.bar) *(.text.target) }
+
+    . = 0x200000000;
+    .text.far : { *(.text._start) }
+}
diff --git a/lld/test/ELF/x86-64-thunks-jmp.s b/lld/test/ELF/x86-64-thunks-jmp.s
new file mode 100644
index 0000000000000..facebabf6c562
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-jmp.s
@@ -0,0 +1,35 @@
+# REQUIRES: x86
+# Test that jmp instructions (tail calls) also get thunks when the target
+# is out of range. Both call and jmp use R_X86_64_PLT32 relocations, so
+# thunks must handle jmp identically to call.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: echo 'SECTIONS { \
+# RUN:       .text_low 0x10000: { *(.text_low) } \
+# RUN:       .text_high 0x200000000: { *(.text_high) } \
+# RUN:       }' > %t.script
+# RUN: ld.lld -T %t.script %t.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+## The jmp to high_target should go through a thunk, just like a call would.
+# CHECK:      <_start>:
+# CHECK-NEXT:   jmp {{.*}} <__X86_64LongThunk_high_target>
+
+# CHECK:      <__X86_64LongThunk_high_target>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+# CHECK:      <high_target>:
+# CHECK-NEXT:   retq
+
+.section .text_low,"ax", at progbits
+.globl _start
+_start:
+  jmp high_target
+
+.section .text_high,"ax", at progbits
+.globl high_target
+high_target:
+  ret
diff --git a/lld/test/ELF/x86-64-thunks-local.s b/lld/test/ELF/x86-64-thunks-local.s
new file mode 100644
index 0000000000000..c951f012da9c0
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-local.s
@@ -0,0 +1,72 @@
+# REQUIRES: x86
+# Test x86-64 range extension thunks for calls to local symbols.
+#
+# When a call targets a local symbol, the assembler generates a relocation
+# against the STT_SECTION symbol with an addend encoding the function offset
+# (plus the -4 PC bias for x86-64). This test verifies that after redirecting
+# through a thunk:
+# 1. The thunk correctly jumps to the local symbol's address (section + offset)
+# 2. The call relocation to the thunk has the correct addend (-4 for x86-64)
+# 3. The thunk name is properly disambiguated for section symbols
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/main.s -o %t/main.o
+# RUN: ld.lld %t/main.o -T %t/script.lds -o %t/out
+# RUN: llvm-objdump -d --no-show-raw-insn %t/out | FileCheck %s
+
+#--- main.s
+.text
+
+## _start calls a local symbol in a far-away section.
+## The assembler encodes this as R_X86_64_PLT32 against the STT_SECTION symbol
+## for .text.far with addend = offset_of_far_local - 4 = 0x100 - 4 = 0xFC.
+.globl _start
+.type _start, @function
+_start:
+    call far_local
+    ret
+
+## The call should reach the thunk entry point exactly (no +0xN offset).
+## The thunk name includes the actual destination offset (0x100) since the
+## destination is a nameless STT_SECTION symbol.
+# CHECK-LABEL: <_start>:
+# CHECK-NEXT:    callq {{.*}} <__X86_64LongThunk__100>
+# CHECK-NEXT:    retq
+
+## Padding to push _start out of range of the far section.
+.section .text.pad,"ax", at progbits
+nop
+
+## far_local is a local (non-global) function placed at 8GiB via linker script.
+## Because it is local, the assembler will generate a relocation against the
+## STT_SECTION symbol for .text.far with an addend.
+.section .text.far,"ax", at progbits
+## Add some padding before the local function so the addend is non-trivial.
+.space 0x100
+far_local:
+    ret
+
+## The thunk must jump to far_local's actual address (8GiB + 0x100),
+## NOT to the start of .text.far (8GiB + 0x0).
+# CHECK-LABEL: <__X86_64LongThunk__100>:
+# CHECK-NEXT:    movabsq
+# CHECK-NEXT:    leaq
+# CHECK-NEXT:    addq    %r10, %r11
+# CHECK-NEXT:    jmpq    *%r11
+
+## Verify that far_local is at the expected address.
+# CHECK-LABEL: <far_local>:
+# CHECK-NEXT:    retq
+
+#--- script.lds
+SECTIONS {
+    . = 0x10000;
+    .text : { *(.text) }
+
+    . = 0x80000000;
+    .text.pad : { *(.text.pad) }
+
+    ## Place far section at 8GiB
+    . = 0x200000000;
+    .text.far : { *(.text.far) }
+}
diff --git a/lld/test/ELF/x86-64-thunks-long.s b/lld/test/ELF/x86-64-thunks-long.s
new file mode 100644
index 0000000000000..a0caeb386504c
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-long.s
@@ -0,0 +1,44 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: echo 'SECTIONS { \
+# RUN:       .text_low 0x10000: { *(.text_low) } \
+# RUN:       .text_high 0x200000000: { *(.text_high) } \
+# RUN:       }' > %t.script
+# RUN: ld.lld -T %t.script %t.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+## Two calls to high_target reuse the same long thunk.
+# CHECK:      <_start>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_high_target>
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_high_target>
+# CHECK-NEXT:   retq
+
+# CHECK:      <__X86_64LongThunk_high_target>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+## high_target calls back to _start; needs a long thunk.
+# CHECK:      <high_target>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk__start>
+# CHECK-NEXT:   retq
+
+# CHECK:      <__X86_64LongThunk__start>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+.section .text_low,"ax", at progbits
+.globl _start
+_start:
+  call high_target
+  call high_target
+  ret
+
+.section .text_high,"ax", at progbits
+.globl high_target
+high_target:
+  call _start
+  ret
diff --git a/lld/test/ELF/x86-64-thunks-pic.s b/lld/test/ELF/x86-64-thunks-pic.s
new file mode 100644
index 0000000000000..f04c0a1996de1
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-pic.s
@@ -0,0 +1,49 @@
+# REQUIRES: x86
+# Test x86-64 range extension thunks with -pie (PIC mode).
+# Verify the same PC-relative thunks are emitted for position-independent
+# executables.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: echo 'SECTIONS { \
+# RUN:       .text_low 0x10000: { *(.text_low) } \
+# RUN:       .text_high 0x200000000: { *(.text_high) } \
+# RUN:       }' > %t.script
+# RUN: ld.lld -pie -T %t.script %t.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+## Two calls to high_target reuse the same long thunk.
+# CHECK:      <_start>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_high_target>
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_high_target>
+# CHECK-NEXT:   retq
+
+## PC-relative long thunk: movabsq + leaq + addq + jmpq sequence.
+# CHECK:      <__X86_64LongThunk_high_target>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+## high_target calls back to _start; needs a long thunk.
+# CHECK:      <high_target>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk__start>
+# CHECK-NEXT:   retq
+
+# CHECK:      <__X86_64LongThunk__start>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+.section .text_low,"ax", at progbits
+.globl _start
+_start:
+  call high_target
+  call high_target
+  ret
+
+.section .text_high,"ax", at progbits
+.globl high_target
+high_target:
+  call _start
+  ret
diff --git a/lld/test/ELF/x86-64-thunks-preemptible.s b/lld/test/ELF/x86-64-thunks-preemptible.s
new file mode 100644
index 0000000000000..39a0dac32b74e
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-preemptible.s
@@ -0,0 +1,48 @@
+# REQUIRES: x86
+# Test that thunks for preemptible symbols correctly target the PLT entry
+# rather than the symbol's definition address. In a shared library,
+# calls to preemptible symbols must go through the PLT so that symbol
+# interposition works at runtime.
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/main.s -o %t/main.o
+# RUN: ld.lld -shared -T %t/script.lds %t/main.o -o %t/out.so
+# RUN: llvm-objdump -d --no-show-raw-insn %t/out.so | FileCheck %s
+
+## In a shared library, preemptible is not hidden so calls go through PLT.
+## The PLT is near .text_low, but the caller is at 8GiB, so it needs a
+## thunk. The thunk should jump to the PLT entry, not the definition.
+# CHECK:      <caller>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_preemptible>
+# CHECK-NEXT:   retq
+
+## The thunk targets the PLT entry for the preemptible symbol.
+# CHECK:      <__X86_64LongThunk_preemptible>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+#--- main.s
+.section .text_low,"ax", at progbits
+.globl preemptible
+.type preemptible, @function
+preemptible:
+  ret
+
+.section .text_high,"ax", at progbits
+.globl caller
+.type caller, @function
+caller:
+  call preemptible
+  ret
+
+#--- script.lds
+SECTIONS {
+    . = 0x10000;
+    .text_low : { *(.text_low) }
+    .plt : { *(.plt) *(.plt.*) }
+
+    . = 0x200000000;
+    .text_high : { *(.text_high) }
+}
diff --git a/lld/test/ELF/x86-64-thunks-retpoline.s b/lld/test/ELF/x86-64-thunks-retpoline.s
new file mode 100644
index 0000000000000..17238877fa752
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-retpoline.s
@@ -0,0 +1,47 @@
+# REQUIRES: x86
+# Test that -z retpolineplt works together with range extension thunks.
+# Retpoline changes the PLT format but the thunk mechanism (which bypasses
+# PLT for non-preemptible symbols) should still function correctly.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: echo 'SECTIONS { \
+# RUN:       .text_low 0x10000: { *(.text_low) } \
+# RUN:       .text_high 0x200000000: { *(.text_high) } \
+# RUN:       }' > %t.script
+# RUN: ld.lld -z retpolineplt -T %t.script %t.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+## The call to high_target needs a thunk. Even with -z retpolineplt,
+## non-preemptible direct calls should get a regular thunk (not go
+## through the retpoline PLT).
+# CHECK:      <_start>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_high_target>
+# CHECK-NEXT:   retq
+
+# CHECK:      <__X86_64LongThunk_high_target>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+# CHECK:      <high_target>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk__start>
+# CHECK-NEXT:   retq
+
+# CHECK:      <__X86_64LongThunk__start>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+.section .text_low,"ax", at progbits
+.globl _start
+_start:
+  call high_target
+  ret
+
+.section .text_high,"ax", at progbits
+.globl high_target
+high_target:
+  call _start
+  ret
diff --git a/lld/test/ELF/x86-64-thunks-short.s b/lld/test/ELF/x86-64-thunks-short.s
new file mode 100644
index 0000000000000..3f746456ca708
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-short.s
@@ -0,0 +1,42 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: echo 'SECTIONS { \
+# RUN:       .text_low 0x10000: { *(.text_low) } \
+# RUN:       .text_high 0x80010000: { *(.text_high) } \
+# RUN:       }' > %t.script
+# RUN: ld.lld -T %t.script %t.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+## The thunk is placed just after _start's code. The call from _start
+## (at 0x10000) to high_target (at 0x8001000e) barely overflows signed
+## 32-bit range, but the thunk (at 0x1000b, a few bytes closer) can
+## still reach high_target with a 5-byte jmp rel32.
+
+## Both calls to high_target reuse the same short thunk.
+# CHECK:      <_start>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_high_target>
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_high_target>
+# CHECK-NEXT:   retq
+
+## Short thunk: a 5-byte jmp rel32, NOT the movabsq sequence (the symbol keeps its __X86_64LongThunk_ name even in short form).
+# CHECK:      <__X86_64LongThunk_high_target>:
+# CHECK-NEXT:   jmp{{.*}} <high_target>
+# CHECK-NOT:    movabsq
+
+# CHECK:      <high_target>:
+# CHECK-NEXT:   retq
+
+.section .text_low,"ax", at progbits
+.globl _start
+_start:
+  call high_target
+  call high_target
+  ret
+
+.section .text_high,"ax", at progbits
+## 14 bytes of padding so high_target is just past the 2GiB range of the
+## call in _start, but still within 2GiB of the thunk placed after _start.
+.space 14
+.globl high_target
+high_target:
+  ret
diff --git a/lld/test/ELF/x86-64-thunks-weak.s b/lld/test/ELF/x86-64-thunks-weak.s
new file mode 100644
index 0000000000000..2fef9cbe8b516
--- /dev/null
+++ b/lld/test/ELF/x86-64-thunks-weak.s
@@ -0,0 +1,31 @@
+# REQUIRES: x86
+# Test that calls to undefined weak symbols get thunks when the caller is
+# placed far from address 0. An undefined weak symbol resolves to VA 0, so
+# the 32-bit branch displacement from a caller above 2 GiB overflows.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: echo 'SECTIONS { \
+# RUN:       .text 0x200000000: { *(.text) } \
+# RUN:       }' > %t.script
+# RUN: ld.lld -T %t.script %t.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+## _start is at 8GiB, weak_sym resolves to 0. The displacement overflows
+## 32-bit range, so a thunk is needed.
+# CHECK:      <_start>:
+# CHECK-NEXT:   callq {{.*}} <__X86_64LongThunk_weak_sym>
+# CHECK-NEXT:   retq
+
+# CHECK:      <__X86_64LongThunk_weak_sym>:
+# CHECK-NEXT:   movabsq
+# CHECK-NEXT:   leaq
+# CHECK-NEXT:   addq    %r10, %r11
+# CHECK-NEXT:   jmpq    *%r11
+
+.text
+.globl _start
+_start:
+  call weak_sym
+  ret
+
+.weak weak_sym



More information about the llvm-commits mailing list