[llvm] [BOLT][AArch64] Add indirect call promotion support (PR #184733)
Haibo Jiang via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 22:22:08 PST 2026
https://github.com/Jianghibo updated https://github.com/llvm/llvm-project/pull/184733
>From 3e5162f5d90064f136d237608e7b8bd6e496b489 Mon Sep 17 00:00:00 2001
From: jianghaibo <jianghaibo9 at huawei.com>
Date: Thu, 5 Mar 2026 11:32:58 +0800
Subject: [PATCH] [BOLT][AArch64] Add indirect call promotion support
- extends MCPlusBuilder with comparison between registers.
- updates the ICP pass to request a scavenged register for AArch64
callsites.
- wires ICP code generation on AArch64.
---
bolt/include/bolt/Core/MCPlusBuilder.h | 22 +-
bolt/lib/Passes/IndirectCallPromotion.cpp | 17 +-
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 219 ++++++++++++++++++
bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 2 +-
bolt/test/AArch64/icp-inline.s | 80 +++++++
5 files changed, 335 insertions(+), 5 deletions(-)
create mode 100644 bolt/test/AArch64/icp-inline.s
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index a672d7a456896..e9b5ab661843c 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -2071,6 +2071,26 @@ class MCPlusBuilder {
return {};
}
+ /// Create a sequence of instructions to compare contents of a register
+ /// \p Reg1 to a register \p Reg2 and jump to \p Target if they are equal.
+ virtual InstructionListType createCmpJEWithReg(MCPhysReg Reg1,
+ MCPhysReg Reg2,
+ const MCSymbol *Target,
+ MCContext *Ctx) const {
+ llvm_unreachable("not implemented");
+ return {};
+ }
+
+ /// Create a sequence of instructions to compare contents of a register
+ /// \p Reg1 to a register \p Reg2 and jump to \p Target if they are different.
+ virtual InstructionListType createCmpJNEWithReg(MCPhysReg Reg1,
+ MCPhysReg Reg2,
+ const MCSymbol *Target,
+ MCContext *Ctx) const {
+ llvm_unreachable("not implemented");
+ return {};
+ }
+
/// Find memcpy size in bytes by using preceding instructions.
/// Returns std::nullopt if size cannot be determined (no-op for most
/// targets).
@@ -2495,7 +2515,7 @@ class MCPlusBuilder {
};
virtual BlocksVectorTy indirectCallPromotion(
- const MCInst &CallInst,
+ const MCInst &CallInst, MCPhysReg Reg,
const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
const std::vector<MCInst *> &MethodFetchInsns,
diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp
index 8a01cb974c5da..169d357bc3c17 100644
--- a/bolt/lib/Passes/IndirectCallPromotion.cpp
+++ b/bolt/lib/Passes/IndirectCallPromotion.cpp
@@ -1151,7 +1151,7 @@ Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
std::unique_ptr<RegAnalysis> RA;
std::unique_ptr<BinaryFunctionCallGraph> CG;
- if (OptimizeJumpTables) {
+ if (OptimizeJumpTables || BC.isAArch64()) {
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
}
@@ -1365,14 +1365,25 @@ Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
MethodInfo.second.push_back(TargetFetchInst);
}
+ MCPhysReg Reg = 0;
+ if (BC.isAArch64()) {
+ Reg = Info.getLivenessAnalysis().scavengeRegAfter(&Inst);
+ LLVM_DEBUG(dbgs()
+ << "BOLT-DEBUG: ICP "
+ << (Reg ? "found the free register "
+ : "could not find a free register")
+ << BC.MRI->getName(Reg) << " to save function address.\n");
+ }
+
// Generate new promoted call code for this callsite.
MCPlusBuilder::BlocksVectorTy ICPcode =
(IsJumpTable && !opts::ICPJumpTablesByTarget)
? BC.MIB->jumpTablePromotion(Inst, SymTargets,
MethodInfo.second, BC.Ctx.get())
: BC.MIB->indirectCallPromotion(
- Inst, SymTargets, MethodInfo.first, MethodInfo.second,
- opts::ICPOldCodeSequence, BC.Ctx.get());
+ Inst, Reg, SymTargets, MethodInfo.first,
+ MethodInfo.second, opts::ICPOldCodeSequence,
+ BC.Ctx.get());
if (ICPcode.empty()) {
if (opts::Verbosity >= 1)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 3955ff378be41..de6a557726ddc 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2111,6 +2111,40 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
return Code;
}
+ // This helper function creates the snippet of code that compares a register
+ // Reg1 with a register Reg2, and jumps to Target if they are equal.
+ InstructionListType createCmpJEWithReg(MCPhysReg Reg1, MCPhysReg Reg2,
+ const MCSymbol *Target,
+ MCContext *Ctx) const override {
+ InstructionListType Code;
+ Code.emplace_back(MCInstBuilder(AArch64::SUBSXrs)
+ .addReg(AArch64::XZR)
+ .addReg(Reg1)
+ .addReg(Reg2)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::EQ)
+ .addExpr(MCSymbolRefExpr::create(Target, *Ctx)));
+ return Code;
+ }
+
+ // This helper function creates the snippet of code that compares a register
+ // Reg1 with a register Reg2, and jumps to Target if they are not equal.
+ InstructionListType createCmpJNEWithReg(MCPhysReg Reg1, MCPhysReg Reg2,
+ const MCSymbol *Target,
+ MCContext *Ctx) const override {
+ InstructionListType Code;
+ Code.emplace_back(MCInstBuilder(AArch64::SUBSXrs)
+ .addReg(AArch64::XZR)
+ .addReg(Reg1)
+ .addReg(Reg2)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::NE)
+ .addExpr(MCSymbolRefExpr::create(Target, *Ctx)));
+ return Code;
+ }
+
void createTailCall(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) override {
return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
@@ -3135,6 +3169,191 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
return Insts;
}
+ BlocksVectorTy indirectCallPromotion(
+ const MCInst &CallInst, MCPhysReg Reg,
+ const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
+ const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
+ const std::vector<MCInst *> &MethodFetchInsns,
+ const bool MinimizeCodeSize, MCContext *Ctx) override {
+ const bool IsTailCall = isTailCall(CallInst);
+ const bool IsJumpTable = getJumpTable(CallInst) != 0;
+ BlocksVectorTy Results;
+ if (!Reg)
+ return Results;
+
+ // Label for the current code block.
+ MCSymbol *NextTarget = nullptr;
+
+ // The join block which contains all the instructions following CallInst.
+ // MergeBlock remains null if CallInst is a tail call.
+ MCSymbol *MergeBlock = nullptr;
+
+ MCPhysReg FuncAddrReg = Reg;
+
+ const bool LoadElim = !VtableSyms.empty();
+ assert((!LoadElim || VtableSyms.size() == Targets.size()) &&
+ "There must be a vtable entry for every method "
+ "in the targets vector.");
+
+ // TODO: No real-world case was found for MinimizeCodeSize, so there is
+ // no implementation like x86.
+ if (MinimizeCodeSize && !LoadElim)
+ return Results;
+
+ const auto jumpToMergeBlock = [&](InstructionListType &NewCall) {
+ assert(MergeBlock);
+ NewCall.push_back(CallInst);
+ MCInst &Merge = NewCall.back();
+ Merge.clear();
+ createUncondBranch(Merge, MergeBlock, Ctx);
+ };
+
+ for (unsigned int i = 0; i < Targets.size(); ++i) {
+ Results.emplace_back(NextTarget, InstructionListType());
+ InstructionListType *NewCall = &Results.back().second;
+
+ // Compare current call target to a specific address.
+ assert(Targets[i].first && "All ICP targets must be to known symbols");
+ const MCSymbol *Sym = LoadElim ? VtableSyms[i].first : Targets[i].first;
+ const uint64_t Addend = LoadElim ? VtableSyms[i].second : 0;
+
+ // Adr target
+ InstructionListType Adr =
+ materializeAddress(Sym, Ctx, FuncAddrReg, Addend);
+ NewCall->insert(NewCall->end(), Adr.begin(), Adr.end());
+
+ // Original call address.
+ assert(CallInst.getOperand(0).isReg() &&
+ "No register was found for indirect-call.");
+ MCPhysReg TargetReg = CallInst.getOperand(0).getReg();
+
+ // Generate label for the next block
+ NextTarget = Ctx->createNamedTempSymbol();
+
+ if (IsJumpTable) {
+ // Jump to target if target address match.
+ InstructionListType CmpJmp =
+ createCmpJEWithReg(TargetReg, FuncAddrReg, Targets[i].first, Ctx);
+ NewCall->insert(NewCall->end(), CmpJmp.begin(), CmpJmp.end());
+ } else {
+ // Jump to next compare if target address doesn't match.
+ InstructionListType CmpJmp =
+ createCmpJNEWithReg(TargetReg, FuncAddrReg, NextTarget, Ctx);
+ NewCall->insert(NewCall->end(), CmpJmp.begin(), CmpJmp.end());
+
+ // Call specific target directly
+ Results.emplace_back(Ctx->createNamedTempSymbol(),
+ InstructionListType());
+ NewCall = &Results.back().second;
+ NewCall->push_back(CallInst);
+ MCInst &CallOrJmp = NewCall->back();
+
+ CallOrJmp.clear();
+ CallOrJmp.setOpcode(IsTailCall ? AArch64::B : AArch64::BL);
+ CallOrJmp.addOperand(MCOperand::createExpr(getTargetExprFor(
+ CallOrJmp, MCSymbolRefExpr::create(Targets[i].first, *Ctx), *Ctx,
+ 0)));
+
+ if (IsTailCall) {
+ setTailCall(CallOrJmp);
+ } else {
+ if (std::optional<uint32_t> Offset = getOffset(CallInst))
+ // Annotated as duplicated call
+ setOffset(CallOrJmp, *Offset);
+ }
+
+ if (isInvoke(CallInst) && !isInvoke(CallOrJmp)) {
+ // Copy over any EH or GNU args size information from the original
+ // call.
+ std::optional<MCPlus::MCLandingPad> EHInfo = getEHInfo(CallInst);
+ if (EHInfo)
+ addEHInfo(CallOrJmp, *EHInfo);
+ int64_t GnuArgsSize = getGnuArgsSize(CallInst);
+ if (GnuArgsSize >= 0)
+ addGnuArgsSize(CallOrJmp, GnuArgsSize);
+ }
+
+ if (!IsTailCall) {
+ // The fallthrough block for the most common target should be the
+ // merge block.
+ if (i == 0) {
+ // Fallthrough to merge block. The CFG will be fixed in the ICP
+ // pass.
+ MergeBlock = Ctx->createNamedTempSymbol();
+ } else {
+ // Insert jump to the merge block if we are not doing a fallthrough.
+ jumpToMergeBlock(*NewCall);
+ }
+ }
+ }
+ }
+
+ // Cold call block
+ Results.emplace_back(NextTarget, InstructionListType());
+ InstructionListType &NewCall = Results.back().second;
+ for (const MCInst *Inst : MethodFetchInsns)
+ if (Inst != &CallInst)
+ NewCall.push_back(*Inst);
+ NewCall.push_back(CallInst);
+
+ // Jump to merge block from cold call block
+ if (!IsTailCall && !IsJumpTable) {
+ jumpToMergeBlock(NewCall);
+
+ // Record merge block
+ Results.emplace_back(MergeBlock, InstructionListType());
+ }
+
+ return Results;
+ }
+
+ // Note: currently, JTIndexReg is always NoRegister on AArch64. This function
+ // will only work once AArch64 supports parsing JTIndexReg.
+ BlocksVectorTy jumpTablePromotion(
+ const MCInst &IJmpInst,
+ const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
+ const std::vector<MCInst *> &TargetFetchInsns,
+ MCContext *Ctx) const override {
+ assert(getJumpTable(IJmpInst) != 0);
+ uint16_t IndexReg = getAnnotationAs<uint16_t>(IJmpInst, "JTIndexReg");
+ if (IndexReg == 0)
+ return BlocksVectorTy();
+
+ BlocksVectorTy Results;
+
+ // Label for the current code block.
+ MCSymbol *NextTarget = nullptr;
+
+ for (unsigned int i = 0; i < Targets.size(); ++i) {
+ Results.emplace_back(NextTarget, InstructionListType());
+ InstructionListType *CurBB = &Results.back().second;
+
+ // Compare current index to a specific index
+ const uint64_t CaseIdx = Targets[i].second;
+ // Immediate indice is out of 12b bit range
+ if (!isUInt<12>(CaseIdx))
+ return BlocksVectorTy();
+
+ InstructionListType CmpJmp =
+ createCmpJE(IndexReg, CaseIdx, Targets[i].first, Ctx);
+ CurBB->insert(CurBB->end(), CmpJmp.begin(), CmpJmp.end());
+
+ // Jump to next target compare
+ NextTarget = Ctx->createNamedTempSymbol();
+ }
+
+ // Cold call block.
+ Results.emplace_back(NextTarget, InstructionListType());
+ InstructionListType &CurBB = Results.back().second;
+ for (const MCInst *Inst : TargetFetchInsns)
+ if (Inst != &IJmpInst)
+ CurBB.push_back(*Inst);
+
+ CurBB.push_back(IJmpInst);
+
+ return Results;
+ }
+
void createBTI(MCInst &Inst, BTIKind BTI) const override {
Inst.setOpcode(AArch64::HINT);
Inst.clear();
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 51e7d27f18a0b..795ce8905c31a 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -3278,7 +3278,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
}
BlocksVectorTy indirectCallPromotion(
- const MCInst &CallInst,
+ const MCInst &CallInst, MCPhysReg Reg,
const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
const std::vector<MCInst *> &MethodFetchInsns,
diff --git a/bolt/test/AArch64/icp-inline.s b/bolt/test/AArch64/icp-inline.s
new file mode 100644
index 0000000000000..4e6199d4d0ae7
--- /dev/null
+++ b/bolt/test/AArch64/icp-inline.s
@@ -0,0 +1,80 @@
+## This test verifies the effect of ICP on AArch64, and inlining after ICP.
+
+## The assembly was produced from C code compiled with clang -O1 -S:
+
+# int foo(int x) { return x + 1; }
+# int bar(int x) { return x*100 + 42; }
+# typedef int (*const fn)(int);
+# fn funcs[] = { foo, bar };
+#
+# int main(int argc, char *argv[]) {
+# fn func = funcs[argc];
+# return func(0);
+# }
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib -pie
+
+# Indirect call promotion without inline
+# RUN: llvm-bolt %t.exe --icp=calls --icp-calls-topn=1 \
+# RUN: -o %t.null --lite=0 --assume-abi --print-icp --data %t.fdata \
+# RUN: | FileCheck %s --check-prefix=CHECK-ICP-NO-INLINE
+# CHECK-ICP-NO-INLINE: Binary Function "main" after indirect-call-promotion
+# CHECK-ICP-NO-INLINE: b bar
+# CHECK-ICP-NO-INLINE: End of Function "main"
+
+# Indirect call promotion with inline
+# RUN: llvm-bolt %t.exe --icp=calls --icp-calls-topn=1 --inline-small-functions \
+# RUN: -o %t.null --lite=0 --assume-abi --inline-small-functions-bytes=12 \
+# RUN: --print-inline --data %t.fdata \
+# RUN: | FileCheck %s --check-prefix=CHECK-ICP-WITH-INLINE
+# CHECK-ICP-WITH-INLINE: Binary Function "main" after indirect-call-promotion
+# CHECK-ICP-WITH-INLINE: br x1
+# CHECK-ICP-WITH-INLINE-NOT: b bar
+# CHECK-ICP-WITH-INLINE: End of Function "main"
+ .globl foo
+ .type foo, at function
+foo:
+ .cfi_startproc
+ add w0, w0, #1
+ ret
+ .Lfunc_end0:
+ .size foo, .Lfunc_end0-foo
+ .cfi_endproc
+
+ .globl bar
+ .type bar, at function
+bar:
+ .cfi_startproc
+ mov w8, #100
+ mov x9, #42
+ madd w0, w0, w8, w9
+ ret
+.Lfunc_end1:
+ .size bar, .Lfunc_end1-bar
+ .cfi_endproc
+
+ .globl main
+ .type main, at function
+main:
+ .cfi_startproc
+ adrp x8, funcs
+ add x8, x8, :lo12:funcs
+ ldr x1, [x8, w0, sxtw #3]
+ mov w0, wzr
+ br x1
+# FDATA: 1 main 10 1 foo 0 0 1
+# FDATA: 1 main 10 1 bar 0 0 2
+.Lfunc_end2:
+ .size main, .Lfunc_end2-main
+ .cfi_endproc
+
+ .type funcs, at object
+ .section .data.rel.ro,"aw", at progbits
+ .globl funcs
+funcs:
+ .xword foo
+ .xword bar
+ .size funcs, 16
More information about the llvm-commits
mailing list