[llvm] [bolt][aarch64] simplify rodata/literal load for X86 & AArch64 (PR #165723)
Alexey Moksyakov via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 5 13:30:05 PST 2025
https://github.com/yavtuk updated https://github.com/llvm/llvm-project/pull/165723
>From 85cc465dd52cd3f139aa274c69deeb86c6efa564 Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Thu, 30 Oct 2025 17:23:18 +0300
Subject: [PATCH 1/2] [bolt][aarch64] test to reproduce the issue with ldr reg,
literal
The `ldr reg, literal` instruction is limited to a +/-1MB range;
emitCI places the constants at the end of the function, where they are out of the addressable range.
---
bolt/test/AArch64/materialize-constant.s | 74 ++++++++++++++++++++++++
1 file changed, 74 insertions(+)
create mode 100644 bolt/test/AArch64/materialize-constant.s
diff --git a/bolt/test/AArch64/materialize-constant.s b/bolt/test/AArch64/materialize-constant.s
new file mode 100644
index 0000000000000..1c15626b09594
--- /dev/null
+++ b/bolt/test/AArch64/materialize-constant.s
@@ -0,0 +1,74 @@
+// this test checks a load literal instructions changed to movk
+
+// REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: %clang %cflags -pie %t.o -o %t.exe -Wl,-q -Wl,-z,relro -Wl,-z,now
+# RUN: llvm-bolt %t.exe -o %t.bolt -data %t.fdata \
+# RUN: --keep-nops --eliminate-unreachable=false
+# RUN: llvm-objdump --disassemble-symbols=foo %t.bolt | FileCheck %s
+
+# CHECK: mov{{.*}} w19, #0
+# CHECK-NEXT: mov{{.*}} w22, #0
+# CHECK-NEXT: movk{{.*}} w23, #0, lsl #16
+# CHECK-NEXT: movk{{.*}} w23, #100
+# CHECK-NEXT: movk{{.*}} w24, #0, lsl #16
+# CHECK-NEXT: movk{{.*}} w24, #3
+
+ .text
+ .align 4
+ .local foo
+ .type foo, %function
+foo: // loop kernel whose two literal loads BOLT is expected to turn into movk pairs
+# FDATA: 1 main 0 1 foo 0 0 10
+ stp x29, x30, [sp, #-32]! // push fp/lr and reserve a 32-byte frame
+ stp x19, x20, [sp, #16] // spill x19/x20; NOTE(review): w22-w24 are callee-saved too (AAPCS64) but are not spilled
+ mov x29, sp // set up the frame pointer
+
+ mov w19, #0 // counter = 0
+ mov w22, #0 // result = 0
+
+ ldr w23, .Llimit // PC-relative literal load: w23 = limit (100)
+ ldr w24, .LStep // PC-relative literal load: w24 = increment (3)
+ b .LStub // branch over the inline constants
+
+.LConstants:
+ .Llimit: .word 100
+ .LStep: .word 3
+
+.LStub: // 0x100000 nops (4 bytes each, 4 MiB) make the function larger than the +/-1MB reach of ldr (literal)
+.rep 0x100000
+ nop
+.endr
+ b .Lmain_loop
+
+.Lmain_loop:
+ madd w22, w19, w24, w22 // result += counter * increment
+
+ add w19, w19, #1 // counter += 1
+ cmp w19, w23
+ b.lt .Lmain_loop // repeat while counter < limit (signed compare)
+
+ mov w0, w22 // return value = result
+
+ b .Lreturn_point
+
+.Lreturn_point:
+ ldp x19, x20, [sp, #16] // restore x19/x20
+ ldp x29, x30, [sp], #32 // restore fp/lr and release the frame
+ ret
+.size foo, .-foo
+
+
+ .global main
+ .type main, %function
+main:
+ mov x0, #0
+ bl foo // run the function under test; its result in w0 is discarded below
+ mov x0, 0 // exit status 0
+ mov w8, #93 // syscall number 93 (exit on AArch64 Linux)
+ svc #0 // issue the syscall; main does not return through lr
+
+ .size main, .-main
>From d6bafa22354cff9a483a6c5d68a4cc4ee2d4f311 Mon Sep 17 00:00:00 2001
From: Moksyakov Alexey <moksyakov.alexey at huawei.com>
Date: Fri, 21 Nov 2025 08:16:39 +0000
Subject: [PATCH 2/2] [bolt] simplify constant loads for X86 & AArch64
This patch fixes the load-literal issue on AArch64
(bolt/test/AArch64/materialize-constant.s):
the address range of a literal load is limited to +/-1MB,
but emitCI places the constants at the end of the function,
where they are out of the addressable range.
SimplifyRODataLoads is now enabled by default for X86 & AArch64.
Signed-off-by: Moksyakov Alexey <moksyakov.alexey at huawei.com>
---
bolt/include/bolt/Core/MCPlusBuilder.h | 8 +++
bolt/lib/Passes/BinaryPasses.cpp | 44 +++++++++++----
bolt/lib/Rewrite/BinaryPassManager.cpp | 10 ++--
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 49 ++++++++++++++++
bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 19 +++++++
bolt/test/AArch64/materialize-constant.s | 56 ++++++++++---------
6 files changed, 145 insertions(+), 41 deletions(-)
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 69ae4fb8ddcc9..7f1c989d6c291 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1872,6 +1872,14 @@ class MCPlusBuilder {
return {};
}
+ virtual InstructionListType materializeConstant(BinaryContext &BC,
+ const MCInst &Inst, // load instruction the returned sequence replaces
+ StringRef ConstantData, // bytes the constant is read from
+ uint64_t Offset) const { // byte position of the constant inside ConstantData
+ llvm_unreachable("not implemented"); // default: a target must override this before the hook is used
+ return {}; // empty list; only reached if llvm_unreachable falls through
+ }
+
/// Creates a new unconditional branch instruction in Inst and set its operand
/// to TBB.
virtual void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 1d187de11c35e..9ae109843fc97 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1187,7 +1187,8 @@ bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
uint64_t NumDynamicLocalLoadsFound = 0;
for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
- for (MCInst &Inst : *BB) {
+ for (auto It = BB->begin(); It != BB->end(); ++It) {
+ const MCInst &Inst = *It;
unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = BC.MII->get(Opcode);
@@ -1200,7 +1201,7 @@ bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
if (MIB->hasPCRelOperand(Inst)) {
// Try to find the symbol that corresponds to the PC-relative operand.
- MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
+ MCOperand *DispOpI = MIB->getMemOperandDisp(const_cast<MCInst &>(Inst));
assert(DispOpI != Inst.end() && "expected PC-relative displacement");
assert(DispOpI->isExpr() &&
"found PC-relative with non-symbolic displacement");
@@ -1226,28 +1227,49 @@ bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
}
// Get the contents of the section containing the target address of the
- // memory operand. We are only interested in read-only sections.
+ // memory operand. We are only interested in read-only sections for X86,
+ // for aarch64 the sections can be read-only or executable.
ErrorOr<BinarySection &> DataSection =
BC.getSectionForAddress(TargetAddress);
if (!DataSection || DataSection->isWritable())
continue;
+ if (DataSection->isText()) {
+ // If data is not part of a function, check if it is part of a global CI
+ // Do not proceed if there aren't data markers for CIs
+ BinaryFunction *BFTgt =
+ BC.getBinaryFunctionContainingAddress(TargetAddress,
+ /*CheckPastEnd*/ false,
+ /*UseMaxSize*/ true);
+ const bool IsInsideFunc =
+ BFTgt && BFTgt->isInConstantIsland(TargetAddress);
+
+ auto CIEndIter = BC.AddressToConstantIslandMap.end();
+ auto CIIter = BC.AddressToConstantIslandMap.find(TargetAddress);
+ if (!IsInsideFunc && CIIter == CIEndIter)
+ continue;
+ }
+
if (BC.getRelocationAt(TargetAddress) ||
BC.getDynamicRelocationAt(TargetAddress))
continue;
- uint32_t Offset = TargetAddress - DataSection->getAddress();
- StringRef ConstantData = DataSection->getContents();
-
++NumLocalLoadsFound;
if (BB->hasProfile())
NumDynamicLocalLoadsFound += BB->getExecutionCount();
- if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
- ++NumLocalLoadsSimplified;
- if (BB->hasProfile())
- NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
- }
+ uint32_t Offset = TargetAddress - DataSection->getAddress();
+ StringRef ConstantData = DataSection->getContents();
+ const InstructionListType Instrs =
+ MIB->materializeConstant(BC, Inst, ConstantData, Offset);
+ if (Instrs.empty())
+ continue;
+
+ It = std::next(BB->replaceInstruction(It, Instrs), Instrs.size() - 1);
+
+ ++NumLocalLoadsSimplified;
+ if (BB->hasProfile())
+ NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
}
}
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 1a0f6d75d63e8..17c280dedc76a 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -236,7 +236,7 @@ static cl::opt<bool> SimplifyRODataLoads(
"simplify-rodata-loads",
cl::desc("simplify loads from read-only sections by replacing the memory "
"operand with the constant found in the corresponding section"),
- cl::cat(BoltOptCategory));
+ cl::init(true), cl::cat(BoltOptCategory));
static cl::list<std::string>
SpecializeMemcpy1("memcpy1-spec",
@@ -432,9 +432,11 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
std::make_unique<JTFootprintReduction>(PrintJTFootprintReduction),
opts::JTFootprintReductionFlag);
- Manager.registerPass(
- std::make_unique<SimplifyRODataLoads>(PrintSimplifyROLoads),
- opts::SimplifyRODataLoads);
+ if (!BC.isRISCV()) {
+ Manager.registerPass(
+ std::make_unique<SimplifyRODataLoads>(PrintSimplifyROLoads),
+ opts::SimplifyRODataLoads);
+ }
Manager.registerPass(std::make_unique<RegReAssign>(PrintRegReAssign),
opts::RegReAssign);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index db3989d6b0b5f..f6435a0885df0 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2770,6 +2770,55 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
return Insts;
}
+ /// Replace a PC-relative literal load (LDRWl / LDRXl) with a sequence of
+ /// MOVK instructions that writes the loaded constant straight into the
+ /// destination register.
+ ///
+ /// \p ConstantData holds the bytes the literal lives in and \p Offset is the
+ /// literal's byte offset inside it. \p BC supplies endianness and pointer
+ /// size for decoding.
+ ///
+ /// \returns one MOVK per 16-bit chunk (2 for a W register, 4 for an X
+ /// register), or an empty list when \p Inst is not a supported literal load
+ /// or the literal does not lie completely inside \p ConstantData.
+ InstructionListType materializeConstant(BinaryContext &BC,
+                                         const MCInst &Inst,
+                                         StringRef ConstantData,
+                                         uint64_t Offset) const override {
+   struct InstInfo {
+     // Size in bytes that Inst loads from memory.
+     uint8_t DataSize;
+     // Number of instructions needed to materialize the constant.
+     uint8_t numInstrs;
+     // Opcode to use for materializing the constant.
+     unsigned Opcode;
+   };
+
+   InstInfo I;
+   InstructionListType Insts(0);
+   switch (Inst.getOpcode()) {
+   case AArch64::LDRWl:
+     I = {4, 2, AArch64::MOVKWi};
+     break;
+   case AArch64::LDRXl:
+     I = {8, 4, AArch64::MOVKXi};
+     break;
+   default:
+     return Insts;
+   }
+
+   // The whole literal must lie inside ConstantData. Offset is compared
+   // first so that the unsigned subtraction below cannot wrap around.
+   if (Offset > ConstantData.size() ||
+       ConstantData.size() - Offset < I.DataSize)
+     return Insts;
+
+   // Decode the literal at the caller-provided Offset. getUnsigned() advances
+   // the offset it is handed, so read through a local cursor instead of
+   // redeclaring (and thereby overriding) the Offset parameter.
+   uint64_t Cursor = Offset;
+   DataExtractor DE(ConstantData, BC.AsmInfo->isLittleEndian(),
+                    BC.AsmInfo->getCodePointerSize());
+   const uint64_t ImmVal = DE.getUnsigned(&Cursor, I.DataSize);
+
+   // Emit the 16-bit chunks from most to least significant. Together the
+   // MOVKs overwrite every bit of the destination register, so its previous
+   // contents do not survive.
+   const MCPhysReg Reg = Inst.getOperand(0).getReg();
+   const unsigned NumInsts = I.numInstrs;
+   Insts.resize(NumInsts);
+   for (unsigned Idx = 0; Idx < NumInsts; ++Idx) {
+     const unsigned Shift = (NumInsts - 1 - Idx) * 16;
+     MCInst &MovK = Insts[Idx];
+     MovK.setOpcode(I.Opcode);
+     MovK.clear();
+     MovK.addOperand(MCOperand::createReg(Reg));
+     MovK.addOperand(MCOperand::createReg(Reg));
+     MovK.addOperand(MCOperand::createImm((ImmVal >> Shift) & 0xFFFF));
+     MovK.addOperand(MCOperand::createImm(Shift));
+   }
+
+   return Insts;
+ }
+
std::optional<Relocation>
createRelocation(const MCFixup &Fixup,
const MCAsmBackend &MAB) const override {
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 7c24c2ce136fa..9b2f091ca354a 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1477,6 +1477,25 @@ class X86MCPlusBuilder : public MCPlusBuilder {
return true;
}
+ /// X86 implementation: fold the constant found at \p Offset in
+ /// \p ConstantData into a copy of \p Inst via replaceMemOperandWithImm().
+ ///
+ /// \returns a single instruction carrying the rewritten opcode and operands,
+ /// or an empty list when the memory operand cannot be replaced.
+ InstructionListType materializeConstant(BinaryContext &BC,
+                                         const MCInst &Inst,
+                                         StringRef ConstantData,
+                                         uint64_t Offset) const override {
+   // Rewrite a scratch copy so that Inst itself is never modified.
+   MCInst Folded = Inst;
+   if (!replaceMemOperandWithImm(Folded, ConstantData, Offset))
+     return {};
+
+   // Carry over only the opcode and the operands of the rewritten copy.
+   MCInst Result;
+   Result.setOpcode(Folded.getOpcode());
+   for (const MCOperand &Op : Folded)
+     Result.addOperand(Op);
+
+   InstructionListType Replacement;
+   Replacement.push_back(Result);
+   return Replacement;
+ }
+
/// TODO: this implementation currently works for the most common opcodes that
/// load from memory. It can be extended to work with memory store opcodes as
/// well as more memory load opcodes.
diff --git a/bolt/test/AArch64/materialize-constant.s b/bolt/test/AArch64/materialize-constant.s
index 1c15626b09594..f4d0c3cafbb8d 100644
--- a/bolt/test/AArch64/materialize-constant.s
+++ b/bolt/test/AArch64/materialize-constant.s
@@ -1,28 +1,28 @@
// this test checks a load literal instructions changed to movk
-// REQUIRES: system-linux
+# REQUIRES: system-linux
-# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+# RUN: --defsym CIBIGFUNC=1 %s -o %t.o
+# RUN: %clang %cflags %t.o -Wl,-q -o %t.exe
+# RUN: llvm-bolt %t.exe -o %t.bolt --lite=0 \
+# RUN: --keep-nops --eliminate-unreachable=false \
+# RUN: | FileCheck %s --check-prefix=CHECK-LOGS
-# RUN: link_fdata %s %t.o %t.fdata
-# RUN: %clang %cflags -pie %t.o -o %t.exe -Wl,-q -Wl,-z,relro -Wl,-z,now
-# RUN: llvm-bolt %t.exe -o %t.bolt -data %t.fdata \
-# RUN: --keep-nops --eliminate-unreachable=false
-# RUN: llvm-objdump --disassemble-symbols=foo %t.bolt | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+# RUN: --defsym CIOUTSIDEFUNC=1 %s -o %t.o
+# RUN: %clang %cflags %t.o -Wl,-q -o %t.exe
+# RUN: llvm-bolt %t.exe -o %t.bolt --lite=0 \
+# RUN: --keep-nops --eliminate-unreachable=false \
+# RUN: | FileCheck %s --check-prefix=CHECK-LOGS
-# CHECK: mov{{.*}} w19, #0
-# CHECK-NEXT: mov{{.*}} w22, #0
-# CHECK-NEXT: movk{{.*}} w23, #0, lsl #16
-# CHECK-NEXT: movk{{.*}} w23, #100
-# CHECK-NEXT: movk{{.*}} w24, #0, lsl #16
-# CHECK-NEXT: movk{{.*}} w24, #3
+# CHECK-LOGS: simplified 2 out of 2 loads
.text
.align 4
.local foo
.type foo, %function
foo:
-# FDATA: 1 main 0 1 foo 0 0 10
stp x29, x30, [sp, #-32]!
stp x19, x20, [sp, #16]
mov x29, sp
@@ -31,44 +31,48 @@ foo:
mov w22, #0 // result = 0
ldr w23, .Llimit
- ldr w24, .LStep
- b .LStub
+ ldr x24, .LStep
+.ifdef CIBIGFUNC
+ b .LStub
.LConstants:
.Llimit: .word 100
- .LStep: .word 3
-
+ .LStep: .xword 3
.LStub:
.rep 0x100000
nop
.endr
b .Lmain_loop
+.endif
.Lmain_loop:
madd w22, w19, w24, w22 // result += counter * increment
-
add w19, w19, #1
cmp w19, w23
b.lt .Lmain_loop
-
mov w0, w22
-
b .Lreturn_point
-
.Lreturn_point:
ldp x19, x20, [sp, #16]
ldp x29, x30, [sp], #32
ret
.size foo, .-foo
+.ifdef CIOUTSIDEFUNC
+.LConstants:
+ .Llimit: .word 100
+ .LStep: .xword 3
+.endif
+
.global main
.type main, %function
main:
mov x0, #0
bl foo
- mov x0, 0
- mov w8, #93
- svc #0
+ mov x0, 0
+ mov w8, #93
+ svc #0
+
+.size main, .-main
- .size main, .-main
More information about the llvm-commits
mailing list