[llvm] [bolt] Simplify rodata/literal load for X86_64 & AArch64 (PR #179474)

Alexey Moksyakov via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 10 00:04:46 PST 2026


https://github.com/yavtuk updated https://github.com/llvm/llvm-project/pull/179474

>From 55f41db7a48294d94656d829fef3db080db91a22 Mon Sep 17 00:00:00 2001
From: yavtuk <yavtuk at ya.ru>
Date: Tue, 3 Feb 2026 11:49:08 +0300
Subject: [PATCH] [bolt] Simplify rodata/literal load for X86_64 & AArch64

This patch enables the simplify-rodata-loads pass by default for X86_64 and
AArch64. For AArch64, this fixes an issue with the ldr (literal) instruction:
when the constant island is moved to the end of a function, the target address
can fall outside the available range of +/- 1MB. For AArch64, the supported
data sizes are 4 and 8 bytes; for X86_64, the supported sizes are 2, 4 and 8 bytes.
Note: SimplifyRODataLoads is disabled in non-relocation mode.
---
 bolt/include/bolt/Core/MCPlusBuilder.h        |  9 ++
 bolt/lib/Passes/BinaryPasses.cpp              | 55 ++++++++----
 bolt/lib/Rewrite/BinaryPassManager.cpp        | 12 +--
 bolt/lib/Rewrite/RewriteInstance.cpp          |  9 ++
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 47 ++++++++++
 bolt/lib/Target/X86/X86MCPlusBuilder.cpp      | 18 ++++
 bolt/test/AArch64/materialize-constant.s      | 85 +++++++++++++++++++
 7 files changed, 213 insertions(+), 22 deletions(-)
 create mode 100644 bolt/test/AArch64/materialize-constant.s

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index e571e91d85135..9337a886cff8f 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1920,6 +1920,15 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Materialize the \p ConstantData value in the target register of \p Inst.
+  virtual InstructionListType materializeConstant(BinaryContext &BC,
+                                                  const MCInst &Inst,
+                                                  StringRef ConstantData,
+                                                  uint64_t Offset) const {
+    llvm_unreachable("not implemented");
+    return {};
+  }
+
   /// Creates a new unconditional branch instruction in Inst and set its operand
   /// to TBB.
   virtual void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 480d0cef58f43..65db19ad46644 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1222,7 +1222,8 @@ bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
   uint64_t NumDynamicLocalLoadsFound = 0;
 
   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
-    for (MCInst &Inst : *BB) {
+    for (auto It = BB->begin(); It != BB->end(); ++It) {
+      const MCInst &Inst = *It;
       unsigned Opcode = Inst.getOpcode();
       const MCInstrDesc &Desc = BC.MII->get(Opcode);
 
@@ -1235,7 +1236,7 @@ bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
 
       if (MIB->hasPCRelOperand(Inst)) {
         // Try to find the symbol that corresponds to the PC-relative operand.
-        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
+        MCOperand *DispOpI = MIB->getMemOperandDisp(const_cast<MCInst &>(Inst));
         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
         assert(DispOpI->isExpr() &&
                "found PC-relative with non-symbolic displacement");
@@ -1261,28 +1262,49 @@ bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
       }
 
       // Get the contents of the section containing the target address of the
-      // memory operand. We are only interested in read-only sections.
+      // memory operand. For X86 we are only interested in read-only sections;
+      // for AArch64 the sections can be read-only or executable.
       ErrorOr<BinarySection &> DataSection =
           BC.getSectionForAddress(TargetAddress);
       if (!DataSection || DataSection->isWritable())
         continue;
 
+      if (DataSection->isText()) {
+        // If data is not part of a function, check if it is part of a global
+        // constant island (CI). Do not proceed if there are no CI data markers.
+        BinaryFunction *TargetBF =
+            BC.getBinaryFunctionContainingAddress(TargetAddress,
+                                                  /*CheckPastEnd*/ false,
+                                                  /*UseMaxSize*/ true);
+        const bool IsInsideFunc =
+            TargetBF && TargetBF->isInConstantIsland(TargetAddress);
+
+        auto CIEndIter = BC.AddressToConstantIslandMap.end();
+        auto CIIter = BC.AddressToConstantIslandMap.find(TargetAddress);
+        if (!IsInsideFunc && CIIter == CIEndIter)
+          continue;
+      }
+
       if (BC.getRelocationAt(TargetAddress) ||
           BC.getDynamicRelocationAt(TargetAddress))
         continue;
 
-      uint32_t Offset = TargetAddress - DataSection->getAddress();
-      StringRef ConstantData = DataSection->getContents();
-
       ++NumLocalLoadsFound;
       if (BB->hasProfile())
         NumDynamicLocalLoadsFound += BB->getExecutionCount();
 
-      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
-        ++NumLocalLoadsSimplified;
-        if (BB->hasProfile())
-          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
-      }
+      uint32_t Offset = TargetAddress - DataSection->getAddress();
+      StringRef ConstantData = DataSection->getContents();
+      const InstructionListType Instrs =
+          MIB->materializeConstant(BC, Inst, ConstantData, Offset);
+      if (Instrs.empty())
+        continue;
+
+      It = std::next(BB->replaceInstruction(It, Instrs), Instrs.size() - 1);
+
+      ++NumLocalLoadsSimplified;
+      if (BB->hasProfile())
+        NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
     }
   }
 
@@ -1301,12 +1323,11 @@ Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
       Modified.insert(&Function);
   }
 
-  BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
-            << NumLoadsFound << " loads from a statically computed address.\n"
-            << "BOLT-INFO: dynamic loads simplified: "
-            << NumDynamicLoadsSimplified << "\n"
-            << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
-            << "\n";
+  if (opts::Verbosity > 0 || NumLoadsSimplified)
+    BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
+              << NumLoadsFound << " loads from statically computed addresses\n"
+              << "BOLT-INFO: simplified " << NumDynamicLoadsSimplified
+              << " out of " << NumDynamicLoadsFound << " dynamic loads\n";
   return Error::success();
 }
 
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 58d24e15cde01..b4b364874a65c 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -241,11 +241,11 @@ static cl::opt<bool> SimplifyConditionalTailCalls(
     cl::desc("simplify conditional tail calls by removing unnecessary jumps"),
     cl::init(true), cl::cat(BoltOptCategory));
 
-static cl::opt<bool> SimplifyRODataLoads(
+cl::opt<bool> SimplifyRODataLoads(
     "simplify-rodata-loads",
     cl::desc("simplify loads from read-only sections by replacing the memory "
              "operand with the constant found in the corresponding section"),
-    cl::cat(BoltOptCategory));
+    cl::init(true), cl::cat(BoltOptCategory));
 
 static cl::list<std::string>
 SpecializeMemcpy1("memcpy1-spec",
@@ -442,9 +442,11 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
       std::make_unique<JTFootprintReduction>(PrintJTFootprintReduction),
       opts::JTFootprintReductionFlag);
 
-  Manager.registerPass(
-      std::make_unique<SimplifyRODataLoads>(PrintSimplifyROLoads),
-      opts::SimplifyRODataLoads);
+  if (!BC.isRISCV()) {
+    Manager.registerPass(
+        std::make_unique<SimplifyRODataLoads>(PrintSimplifyROLoads),
+        opts::SimplifyRODataLoads);
+  }
 
   Manager.registerPass(std::make_unique<RegReAssign>(PrintRegReAssign),
                        opts::RegReAssign);
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index b475c6e137909..887512874fc29 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -87,6 +87,7 @@ extern cl::list<std::string> PrintOnly;
 extern cl::opt<std::string> PrintOnlyFile;
 extern cl::list<std::string> ReorderData;
 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
+extern cl::opt<bool> SimplifyRODataLoads;
 extern cl::opt<bool> TerminalHLT;
 extern cl::opt<bool> TerminalTrap;
 extern cl::opt<bool> TimeBuild;
@@ -2419,6 +2420,14 @@ void RewriteInstance::adjustCommandLineOptions() {
     if (!opts::TerminalTrap.getNumOccurrences())
       opts::TerminalTrap = false;
   }
+
+  if (BC->isRISCV() || (BC->isAArch64() && !BC->HasRelocations)) {
+    // TODO: For RISCV, the optimization is not implemented yet.
+    // For AArch64, it is disabled to avoid increasing
+    // the output function size in non-relocation mode.
+    opts::SimplifyRODataLoads = false;
+    BC->outs() << "BOLT-INFO: simplify rodata loads pass is disabled\n";
+  }
 }
 
 namespace {
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index aa5cf3c671cdc..a005ee73191fa 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2967,6 +2967,53 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return Insts;
   }
 
+  InstructionListType materializeConstant(BinaryContext &BC, const MCInst &Inst,
+                                          StringRef ConstantData,
+                                          uint64_t Offset) const override {
+    struct InstInfo {
+      // Size in bytes that Inst loads from memory.
+      uint8_t DataSize;
+      // Number of instructions needed to materialize the constant.
+      uint8_t NumInstrs;
+      // Opcode to use for materializing the constant.
+      unsigned Opcode;
+    };
+
+    InstInfo II;
+    InstructionListType Insts(0);
+    switch (Inst.getOpcode()) {
+    case AArch64::LDRWl:
+      II = {4, 2, AArch64::MOVKWi};
+      break;
+    case AArch64::LDRXl:
+      II = {8, 4, AArch64::MOVKXi};
+      break;
+    default:
+      return Insts;
+    }
+
+    if (ConstantData.size() - Offset < II.DataSize)
+      return Insts;
+
+    DataExtractor DE(ConstantData, BC.AsmInfo->isLittleEndian(),
+                     BC.AsmInfo->getCodePointerSize());
+    const uint64_t ImmVal = DE.getUnsigned(&Offset, II.DataSize);
+
+    Insts.resize(II.NumInstrs);
+    unsigned Shift = (Insts.size() - 1) * 16;
+    MCPhysReg Reg = Inst.getOperand(0).getReg();
+    for (unsigned I = 0; I < Insts.size(); ++I, Shift -= 16) {
+      Insts[I].setOpcode(II.Opcode);
+      Insts[I].clear();
+      Insts[I].addOperand(MCOperand::createReg(Reg));
+      Insts[I].addOperand(MCOperand::createReg(Reg));
+      Insts[I].addOperand(MCOperand::createImm((ImmVal >> Shift) & 0xFFFF));
+      Insts[I].addOperand(MCOperand::createImm(Shift));
+    }
+
+    return Insts;
+  }
+
   std::optional<Relocation>
   createRelocation(const MCFixup &Fixup,
                    const MCAsmBackend &MAB) const override {
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 7c24c2ce136fa..2dc2d25aeeedd 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1477,6 +1477,24 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
+  InstructionListType materializeConstant(BinaryContext &BC, const MCInst &Inst,
+                                          StringRef ConstantData,
+                                          uint64_t Offset) const override {
+    InstructionListType Instrs;
+    MCInst InstCopy = Inst;
+
+    if (!replaceMemOperandWithImm(InstCopy, ConstantData, Offset))
+      return Instrs;
+
+    Instrs.emplace_back();
+    Instrs.back().setOpcode(InstCopy.getOpcode());
+    Instrs.back().clear();
+    for (unsigned I = 0; I < InstCopy.getNumOperands(); ++I)
+      Instrs.back().addOperand(InstCopy.getOperand(I));
+
+    return Instrs;
+  }
+
   /// TODO: this implementation currently works for the most common opcodes that
   /// load from memory. It can be extended to work with memory store opcodes as
   /// well as more memory load opcodes.
diff --git a/bolt/test/AArch64/materialize-constant.s b/bolt/test/AArch64/materialize-constant.s
new file mode 100644
index 0000000000000..81b61317d392b
--- /dev/null
+++ b/bolt/test/AArch64/materialize-constant.s
@@ -0,0 +1,85 @@
+// This test checks that load-literal instructions are replaced with movk.
+
+# REQUIRES: system-linux
+
+# RUN: %clang %cflags %s -Wa,--defsym,CIBIGFUNC=1 -Wl,-q -o %t.exe
+# RUN: llvm-bolt %t.exe -o %t.bolt --lite=0 \
+# RUN:    --keep-nops --eliminate-unreachable=false \
+# RUN:    | FileCheck %s --check-prefix=CHECK-LOGS
+# RUN: llvm-objdump -d --disassemble-symbols=foo %t.bolt \
+# RUN:    | FileCheck %s --check-prefix=CHECK-FOO
+
+# RUN: %clang %cflags %s -Wa,--defsym,CIOUTSIDEFUNC=1 -Wl,-q -o %t.exe
+# RUN: llvm-bolt %t.exe -o %t.bolt --lite=0 \
+# RUN:    --keep-nops --eliminate-unreachable=false \
+# RUN:    | FileCheck %s --check-prefix=CHECK-LOGS
+# RUN: llvm-objdump -d --disassemble-symbols=foo %t.bolt \
+# RUN:    | FileCheck %s --check-prefix=CHECK-FOO
+
+# CHECK-LOGS: simplified 2 out of 2 loads
+
+# CHECK-FOO: movk w23, #0x0, lsl #16
+# CHECK-FOO-NEXT: movk w23, #0x64
+# CHECK-FOO-NEXT: movk x24, #0x0, lsl #48
+# CHECK-FOO-NEXT: movk x24, #0x0, lsl #32
+# CHECK-FOO-NEXT: movk x24, #0x0, lsl #16
+# CHECK-FOO-NEXT: movk x24, #0x3
+
+  .text
+  .align 4
+  .local foo
+  .type foo, %function
+foo:
+    stp x29, x30, [sp, #-32]!
+    stp x19, x20, [sp, #16]
+    mov x29, sp
+
+    mov w19, #0 // counter = 0
+    mov w22, #0 // result = 0
+
+    ldr w23, .Llimit
+    ldr x24, .LStep
+
+.ifdef CIBIGFUNC
+    b .LStub
+.LConstants:
+  .Llimit: .word 100
+  .LStep:  .xword 3
+.LStub:
+// On AArch64, emitting a constant island at the end of a function can
+// put the target of an ldr literal instruction out of the available
+// address range when the function size is ~1MB.
+// The lines below are commented out because of the unit test execution
+// time (~1 minute).
+// .rep 0x100000
+//    nop
+// .endr
+    b .Lreturn_point
+.endif
+
+.Lreturn_point:
+    ldp x19, x20, [sp, #16]
+    ldp x29, x30, [sp], #32
+    ret
+.size foo, .-foo
+
+.ifdef CIOUTSIDEFUNC
+.LConstants:
+  .Llimit: .word 100
+  .LStep:  .xword 3
+.endif
+
+
+  .global main
+  .type main, %function
+main:
+# Dummy relocation to force relocation mode
+.reloc 0, R_AARCH64_NONE
+  mov x0, #0
+  bl foo
+  mov x0, 0
+  mov w8, #93
+  svc #0
+
+.size main, .-main
+



More information about the llvm-commits mailing list