[llvm] [BOLT][AArch64] Add indirect call promotion support (PR #184733)

Haibo Jiang via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 22:22:08 PST 2026


https://github.com/Jianghibo updated https://github.com/llvm/llvm-project/pull/184733

>From 3e5162f5d90064f136d237608e7b8bd6e496b489 Mon Sep 17 00:00:00 2001
From: jianghaibo <jianghaibo9 at huawei.com>
Date: Thu, 5 Mar 2026 11:32:58 +0800
Subject: [PATCH] [BOLT][AArch64] Add indirect call promotion support

- extends MCPlusBuilder with comparison between registers.
- updates the ICP pass to request a scavenged register for AArch64
  callsites.
- wires ICP code generation on AArch64.
---
 bolt/include/bolt/Core/MCPlusBuilder.h        |  22 +-
 bolt/lib/Passes/IndirectCallPromotion.cpp     |  17 +-
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 219 ++++++++++++++++++
 bolt/lib/Target/X86/X86MCPlusBuilder.cpp      |   2 +-
 bolt/test/AArch64/icp-inline.s                |  80 +++++++
 5 files changed, 335 insertions(+), 5 deletions(-)
 create mode 100644 bolt/test/AArch64/icp-inline.s

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index a672d7a456896..e9b5ab661843c 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -2071,6 +2071,26 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Create a sequence of instructions to compare contents of a register
+  /// \p Reg1 to a register \p Reg2 and jump to \p Target if they are equal.
+  virtual InstructionListType createCmpJEWithReg(MCPhysReg Reg1,
+                                                 MCPhysReg Reg2,
+                                                 const MCSymbol *Target,
+                                                 MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
+    return {};
+  }
+
+  /// Create a sequence of instructions to compare contents of a register
+  /// \p Reg1 to a register \p Reg2 and jump to \p Target if they are different.
+  virtual InstructionListType createCmpJNEWithReg(MCPhysReg Reg1,
+                                                  MCPhysReg Reg2,
+                                                  const MCSymbol *Target,
+                                                  MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
+    return {};
+  }
+
   /// Find memcpy size in bytes by using preceding instructions.
   /// Returns std::nullopt if size cannot be determined (no-op for most
   /// targets).
@@ -2495,7 +2515,7 @@ class MCPlusBuilder {
   };
 
   virtual BlocksVectorTy indirectCallPromotion(
-      const MCInst &CallInst,
+      const MCInst &CallInst, MCPhysReg Reg,
       const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
       const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
       const std::vector<MCInst *> &MethodFetchInsns,
diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp
index 8a01cb974c5da..169d357bc3c17 100644
--- a/bolt/lib/Passes/IndirectCallPromotion.cpp
+++ b/bolt/lib/Passes/IndirectCallPromotion.cpp
@@ -1151,7 +1151,7 @@ Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
 
   std::unique_ptr<RegAnalysis> RA;
   std::unique_ptr<BinaryFunctionCallGraph> CG;
-  if (OptimizeJumpTables) {
+  if (OptimizeJumpTables || BC.isAArch64()) {
     CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
     RA.reset(new RegAnalysis(BC, &BFs, &*CG));
   }
@@ -1365,14 +1365,25 @@ Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
           MethodInfo.second.push_back(TargetFetchInst);
         }
 
+        MCPhysReg Reg = 0;
+        if (BC.isAArch64()) {
+          Reg = Info.getLivenessAnalysis().scavengeRegAfter(&Inst);
+          LLVM_DEBUG(dbgs()
+                     << "BOLT-DEBUG: ICP "
+                     << (Reg ? "found the free register "
+                             : "could not find a free register")
+                     << BC.MRI->getName(Reg) << " to save function address.\n");
+        }
+
         // Generate new promoted call code for this callsite.
         MCPlusBuilder::BlocksVectorTy ICPcode =
             (IsJumpTable && !opts::ICPJumpTablesByTarget)
                 ? BC.MIB->jumpTablePromotion(Inst, SymTargets,
                                              MethodInfo.second, BC.Ctx.get())
                 : BC.MIB->indirectCallPromotion(
-                      Inst, SymTargets, MethodInfo.first, MethodInfo.second,
-                      opts::ICPOldCodeSequence, BC.Ctx.get());
+                      Inst, Reg, SymTargets, MethodInfo.first,
+                      MethodInfo.second, opts::ICPOldCodeSequence,
+                      BC.Ctx.get());
 
         if (ICPcode.empty()) {
           if (opts::Verbosity >= 1)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 3955ff378be41..de6a557726ddc 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2111,6 +2111,40 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return Code;
   }
 
+  // This helper function creates the snippet of code that compares a register
+  // Reg1 with a register Reg2, and jumps to Target if they are equal.
+  InstructionListType createCmpJEWithReg(MCPhysReg Reg1, MCPhysReg Reg2,
+                                         const MCSymbol *Target,
+                                         MCContext *Ctx) const override {
+    InstructionListType Code;
+    Code.emplace_back(MCInstBuilder(AArch64::SUBSXrs)
+                          .addReg(AArch64::XZR)
+                          .addReg(Reg1)
+                          .addReg(Reg2)
+                          .addImm(0));
+    Code.emplace_back(MCInstBuilder(AArch64::Bcc)
+                          .addImm(AArch64CC::EQ)
+                          .addExpr(MCSymbolRefExpr::create(Target, *Ctx)));
+    return Code;
+  }
+
+  // This helper function creates the snippet of code that compares a register
+  // Reg1 with a register Reg2, and jumps to Target if they are not equal.
+  InstructionListType createCmpJNEWithReg(MCPhysReg Reg1, MCPhysReg Reg2,
+                                          const MCSymbol *Target,
+                                          MCContext *Ctx) const override {
+    InstructionListType Code;
+    Code.emplace_back(MCInstBuilder(AArch64::SUBSXrs)
+                          .addReg(AArch64::XZR)
+                          .addReg(Reg1)
+                          .addReg(Reg2)
+                          .addImm(0));
+    Code.emplace_back(MCInstBuilder(AArch64::Bcc)
+                          .addImm(AArch64CC::NE)
+                          .addExpr(MCSymbolRefExpr::create(Target, *Ctx)));
+    return Code;
+  }
+
   void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
@@ -3135,6 +3169,191 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return Insts;
   }
 
+  BlocksVectorTy indirectCallPromotion(
+      const MCInst &CallInst, MCPhysReg Reg,
+      const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
+      const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
+      const std::vector<MCInst *> &MethodFetchInsns,
+      const bool MinimizeCodeSize, MCContext *Ctx) override {
+    const bool IsTailCall = isTailCall(CallInst);
+    const bool IsJumpTable = getJumpTable(CallInst) != 0;
+    BlocksVectorTy Results;
+    if (!Reg)
+      return Results;
+
+    // Label for the current code block.
+    MCSymbol *NextTarget = nullptr;
+
+    // The join block which contains all the instructions following CallInst.
+    // MergeBlock remains null if CallInst is a tail call.
+    MCSymbol *MergeBlock = nullptr;
+
+    MCPhysReg FuncAddrReg = Reg;
+
+    const bool LoadElim = !VtableSyms.empty();
+    assert((!LoadElim || VtableSyms.size() == Targets.size()) &&
+           "There must be a vtable entry for every method "
+           "in the targets vector.");
+
+    // TODO: No real-world case was found for MinimizeCodeSize, so there is
+    // no implementation like x86.
+    if (MinimizeCodeSize && !LoadElim)
+      return Results;
+
+    const auto jumpToMergeBlock = [&](InstructionListType &NewCall) {
+      assert(MergeBlock);
+      NewCall.push_back(CallInst);
+      MCInst &Merge = NewCall.back();
+      Merge.clear();
+      createUncondBranch(Merge, MergeBlock, Ctx);
+    };
+
+    for (unsigned int i = 0; i < Targets.size(); ++i) {
+      Results.emplace_back(NextTarget, InstructionListType());
+      InstructionListType *NewCall = &Results.back().second;
+
+      // Compare current call target to a specific address.
+      assert(Targets[i].first && "All ICP targets must be to known symbols");
+      const MCSymbol *Sym = LoadElim ? VtableSyms[i].first : Targets[i].first;
+      const uint64_t Addend = LoadElim ? VtableSyms[i].second : 0;
+
+      // Materialize the address of the target symbol into FuncAddrReg.
+      InstructionListType Adr =
+          materializeAddress(Sym, Ctx, FuncAddrReg, Addend);
+      NewCall->insert(NewCall->end(), Adr.begin(), Adr.end());
+
+      // Original call address.
+      assert(CallInst.getOperand(0).isReg() &&
+             "No register was found for indirect-call.");
+      MCPhysReg TargetReg = CallInst.getOperand(0).getReg();
+
+      // Generate label for the next block
+      NextTarget = Ctx->createNamedTempSymbol();
+
+      if (IsJumpTable) {
+        // Jump to the target if the target address matches.
+        InstructionListType CmpJmp =
+            createCmpJEWithReg(TargetReg, FuncAddrReg, Targets[i].first, Ctx);
+        NewCall->insert(NewCall->end(), CmpJmp.begin(), CmpJmp.end());
+      } else {
+        // Jump to the next compare if the target address doesn't match.
+        InstructionListType CmpJmp =
+            createCmpJNEWithReg(TargetReg, FuncAddrReg, NextTarget, Ctx);
+        NewCall->insert(NewCall->end(), CmpJmp.begin(), CmpJmp.end());
+
+        // Call specific target directly
+        Results.emplace_back(Ctx->createNamedTempSymbol(),
+                             InstructionListType());
+        NewCall = &Results.back().second;
+        NewCall->push_back(CallInst);
+        MCInst &CallOrJmp = NewCall->back();
+
+        CallOrJmp.clear();
+        CallOrJmp.setOpcode(IsTailCall ? AArch64::B : AArch64::BL);
+        CallOrJmp.addOperand(MCOperand::createExpr(getTargetExprFor(
+            CallOrJmp, MCSymbolRefExpr::create(Targets[i].first, *Ctx), *Ctx,
+            0)));
+
+        if (IsTailCall) {
+          setTailCall(CallOrJmp);
+        } else {
+          if (std::optional<uint32_t> Offset = getOffset(CallInst))
+            // Annotate as a duplicated call.
+            setOffset(CallOrJmp, *Offset);
+        }
+
+        if (isInvoke(CallInst) && !isInvoke(CallOrJmp)) {
+          // Copy over any EH or GNU args size information from the original
+          // call.
+          std::optional<MCPlus::MCLandingPad> EHInfo = getEHInfo(CallInst);
+          if (EHInfo)
+            addEHInfo(CallOrJmp, *EHInfo);
+          int64_t GnuArgsSize = getGnuArgsSize(CallInst);
+          if (GnuArgsSize >= 0)
+            addGnuArgsSize(CallOrJmp, GnuArgsSize);
+        }
+
+        if (!IsTailCall) {
+          // The fallthrough block for the most common target should be the
+          // merge block.
+          if (i == 0) {
+            // Fallthrough to merge block. The CFG will be fixed in the ICP
+            // pass.
+            MergeBlock = Ctx->createNamedTempSymbol();
+          } else {
+            // Insert jump to the merge block if we are not doing a fallthrough.
+            jumpToMergeBlock(*NewCall);
+          }
+        }
+      }
+    }
+
+    // Cold call block
+    Results.emplace_back(NextTarget, InstructionListType());
+    InstructionListType &NewCall = Results.back().second;
+    for (const MCInst *Inst : MethodFetchInsns)
+      if (Inst != &CallInst)
+        NewCall.push_back(*Inst);
+    NewCall.push_back(CallInst);
+
+    // Jump to merge block from cold call block
+    if (!IsTailCall && !IsJumpTable) {
+      jumpToMergeBlock(NewCall);
+
+      // Record merge block
+      Results.emplace_back(MergeBlock, InstructionListType());
+    }
+
+    return Results;
+  }
+
+  // Note: JTIndexReg is currently always NoRegister on AArch64. This function
+  // will only work once AArch64 supports parsing JTIndexReg.
+  BlocksVectorTy jumpTablePromotion(
+      const MCInst &IJmpInst,
+      const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
+      const std::vector<MCInst *> &TargetFetchInsns,
+      MCContext *Ctx) const override {
+    assert(getJumpTable(IJmpInst) != 0);
+    uint16_t IndexReg = getAnnotationAs<uint16_t>(IJmpInst, "JTIndexReg");
+    if (IndexReg == 0)
+      return BlocksVectorTy();
+
+    BlocksVectorTy Results;
+
+    // Label for the current code block.
+    MCSymbol *NextTarget = nullptr;
+
+    for (unsigned int i = 0; i < Targets.size(); ++i) {
+      Results.emplace_back(NextTarget, InstructionListType());
+      InstructionListType *CurBB = &Results.back().second;
+
+      // Compare current index to a specific index
+      const uint64_t CaseIdx = Targets[i].second;
+      // Bail out if the immediate index is out of the 12-bit range.
+      if (!isUInt<12>(CaseIdx))
+        return BlocksVectorTy();
+
+      InstructionListType CmpJmp =
+          createCmpJE(IndexReg, CaseIdx, Targets[i].first, Ctx);
+      CurBB->insert(CurBB->end(), CmpJmp.begin(), CmpJmp.end());
+
+      // Jump to next target compare
+      NextTarget = Ctx->createNamedTempSymbol();
+    }
+
+    // Cold call block.
+    Results.emplace_back(NextTarget, InstructionListType());
+    InstructionListType &CurBB = Results.back().second;
+    for (const MCInst *Inst : TargetFetchInsns)
+      if (Inst != &IJmpInst)
+        CurBB.push_back(*Inst);
+
+    CurBB.push_back(IJmpInst);
+
+    return Results;
+  }
+
   void createBTI(MCInst &Inst, BTIKind BTI) const override {
     Inst.setOpcode(AArch64::HINT);
     Inst.clear();
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 51e7d27f18a0b..795ce8905c31a 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -3278,7 +3278,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
   }
 
   BlocksVectorTy indirectCallPromotion(
-      const MCInst &CallInst,
+      const MCInst &CallInst, MCPhysReg Reg,
       const std::vector<std::pair<MCSymbol *, uint64_t>> &Targets,
       const std::vector<std::pair<MCSymbol *, uint64_t>> &VtableSyms,
       const std::vector<MCInst *> &MethodFetchInsns,
diff --git a/bolt/test/AArch64/icp-inline.s b/bolt/test/AArch64/icp-inline.s
new file mode 100644
index 0000000000000..4e6199d4d0ae7
--- /dev/null
+++ b/bolt/test/AArch64/icp-inline.s
@@ -0,0 +1,80 @@
+## This test verifies the effect of ICP on AArch64, and inlining after ICP.
+
+## The assembly was produced from C code compiled with clang -O1 -S:
+
+# int foo(int x) { return x + 1; }
+# int bar(int x) { return x*100 + 42; }
+# typedef int (*const fn)(int);
+# fn funcs[] = { foo, bar };
+#
+# int main(int argc, char *argv[]) {
+#   fn func = funcs[argc];
+#   return func(0);
+# }
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib -pie
+
+# Indirect call promotion without inline
+# RUN: llvm-bolt %t.exe --icp=calls --icp-calls-topn=1 \
+# RUN:   -o %t.null --lite=0 --assume-abi --print-icp --data %t.fdata \
+# RUN:   | FileCheck %s --check-prefix=CHECK-ICP-NO-INLINE
+# CHECK-ICP-NO-INLINE: Binary Function "main" after indirect-call-promotion
+# CHECK-ICP-NO-INLINE: b    bar
+# CHECK-ICP-NO-INLINE: End of Function "main"
+
+# Indirect call promotion with inline
+# RUN: llvm-bolt %t.exe --icp=calls --icp-calls-topn=1 --inline-small-functions \
+# RUN:   -o %t.null --lite=0 --assume-abi --inline-small-functions-bytes=12 \
+# RUN:   --print-inline --data %t.fdata \
+# RUN:   | FileCheck %s --check-prefix=CHECK-ICP-WITH-INLINE
+# CHECK-ICP-WITH-INLINE:     Binary Function "main" after indirect-call-promotion
+# CHECK-ICP-WITH-INLINE:     br    x1
+# CHECK-ICP-WITH-INLINE-NOT: b    bar
+# CHECK-ICP-WITH-INLINE:     End of Function "main"
+    .globl  foo
+    .type   foo, at function
+foo:
+    .cfi_startproc
+    add     w0, w0, #1
+    ret
+    .Lfunc_end0:
+    .size   foo, .Lfunc_end0-foo
+    .cfi_endproc
+
+    .globl  bar
+    .type   bar, at function
+bar:
+    .cfi_startproc
+    mov     w8, #100
+    mov     x9, #42
+    madd    w0, w0, w8, w9
+    ret
+.Lfunc_end1:
+    .size   bar, .Lfunc_end1-bar
+    .cfi_endproc
+
+    .globl  main
+    .type   main, at function
+main:
+    .cfi_startproc
+    adrp    x8, funcs
+    add     x8, x8, :lo12:funcs
+    ldr     x1, [x8, w0, sxtw #3]
+    mov     w0, wzr
+    br     x1
+# FDATA: 1 main 10 1 foo 0 0 1
+# FDATA: 1 main 10 1 bar 0 0 2
+.Lfunc_end2:
+    .size   main, .Lfunc_end2-main
+    .cfi_endproc
+
+    .type   funcs, at object
+    .section    .data.rel.ro,"aw", at progbits
+    .globl  funcs
+funcs:
+    .xword  foo
+    .xword  bar
+    .size   funcs, 16



More information about the llvm-commits mailing list