[llvm] [BOLT][AArch64] Fixed indirect call instrumentation snippet (PR #141918)

Alexey Moksyakov via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 18 03:37:17 PST 2025


https://github.com/yavtuk updated https://github.com/llvm/llvm-project/pull/141918

>From 0a63443fd53bfd0fe7d54ca4b59ecb1f71aebddf Mon Sep 17 00:00:00 2001
From: Moksyakov Alexey <moksyakov.alexey at huawei.com>
Date: Thu, 29 May 2025 07:28:13 +0000
Subject: [PATCH 01/10] [bolt][aarch64] Fixed indirect call instrumentation
 snippet

Indirect call instrumentation snippet uses x16 register in exit
handler to go to destination target

    __bolt_instr_ind_call_handler:
            msr  nzcv, x1
            ldp  x0, x1, [sp], #16
            ldr  x16, [sp], #16
            ldp  x0, x1, [sp], #16
            br   x16	<-----

Depending on the compiler, the x16 register may be used to store something
across function calls. This patch makes the instrumentation snippet call the
instrumentation runtime library through an indirect call instruction and adds
a wrapper that stores/loads the target value and the register used by the
original indirect call instruction.

Example:
            mov x16, foo

    indirectCall:
            adrp x8, Label
            add  x8, x8, #:lo12:Label
            blr x8

Before:

    Instrumented indirect call:
            stp     x0, x1, [sp, #-16]!
            mov     x0, x8
            movk    x1, #0x0, lsl #48
            movk    x1, #0x0, lsl #32
            movk    x1, #0x0, lsl #16
            movk    x1, #0x0
            stp     x0, x1, [sp, #-16]!
            adrp    x0, __bolt_instr_ind_call_handler_func
            add     x0, x0, #:lo12:__bolt_instr_ind_call_handler_func
            blr     x0

    __bolt_instr_ind_call_handler:  (exit snippet)
            msr     nzcv, x1
            ldp     x0, x1, [sp], #16
            ldr     x16, [sp], #16
            ldp     x0, x1, [sp], #16
            br      x16    <- overwrites the original value in X16

    __bolt_instr_ind_call_handler_func:  (entry snippet)
            stp     x0, x1, [sp, #-16]!
            mrs     x1, nzcv
            adrp    x0, __bolt_instr_ind_call_handler
            add     x0, x0, #:lo12:__bolt_instr_ind_call_handler
            ldr     x0, [x0]
            cmp     x0, #0x0
            b.eq    __bolt_instr_ind_call_handler
            str     x30, [sp, #-16]!
            blr     x0     <--- runtime lib store/load all regs
            ldr     x30, [sp], #16
            b       __bolt_instr_ind_call_handler

_________________________________________________________________________

After:

            mov     x16, foo
    indirectCall:
            adrp    x8, Label
            add     x8, x8, #:lo12:Label
            blr     x8

    Instrumented indirect call:
            stp     x0, x1, [sp, #-16]!
            mov     x0, x8
            movk    x1, #0x0, lsl #48
            movk    x1, #0x0, lsl #32
            movk    x1, #0x0, lsl #16
            movk    x1, #0x0
            stp     x0, x1, [sp, #-16]!
            adrp    x8, __bolt_instr_ind_call_handler_func
            add     x8, x8, #:lo12:__bolt_instr_ind_call_handler_func
            str     x30, [sp, #-16]!
            blr     x8       <--- call trampoline instr lib
            ldr     x30, [sp], #16
            ldp     x0, x1, [sp], #16
            mov     x8, x0   <---- restore original target
            ldp     x0, x1, [sp], #16
            blr     x8       <--- original indirect call instruction

    // don't touch regs besides x0, x1
    __bolt_instr_ind_call_handler:  (exit snippet)
            ldr     x1, [sp, #16]
            msr     nzcv, x1
            ldp     x0, x1, [sp], #16
            ret     <---- return to original function with indirect call

    __bolt_instr_ind_call_handler_func: (entry snippet)
            stp     x0, x1, [sp, #-16]!
            mrs     x1, nzcv
            str     x1, [sp, #-16]!
            adrp    x0, __bolt_instr_ind_call_handler
            add     x0, x0, #:lo12:__bolt_instr_ind_call_handler
            ldr     x0, [x0]
            cmp     x0, #0x0
            b.eq    __bolt_instr_ind_call_handler
            str     x30, [sp, #-16]!
            blr     x0     <--- runtime lib store/load all regs
            ldr     x30, [sp], #16
            b       __bolt_instr_ind_call_handler
---
 bolt/include/bolt/Core/MCPlusBuilder.h        |   5 +
 bolt/lib/Passes/Instrumentation.cpp           |  18 ++-
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 126 +++++++++++++-----
 bolt/runtime/instr.cpp                        |   4 +-
 4 files changed, 111 insertions(+), 42 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index b233452985502..a8a3a58dba836 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -511,6 +511,11 @@ class MCPlusBuilder {
     llvm_unreachable("not implemented");
   }
 
+  virtual void createDirectBranch(MCInst &Inst, const MCSymbol *Target,
+                                  MCContext *Ctx) {
+    llvm_unreachable("not implemented");
+  }
+
   virtual MCPhysReg getX86R11() const { llvm_unreachable("not implemented"); }
 
   virtual unsigned getShortBranchOpcode(unsigned Opcode) const {
diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp
index fbf889279f1c0..e84acd00da369 100644
--- a/bolt/lib/Passes/Instrumentation.cpp
+++ b/bolt/lib/Passes/Instrumentation.cpp
@@ -293,9 +293,12 @@ void Instrumentation::instrumentIndirectTarget(BinaryBasicBlock &BB,
                                                BinaryBasicBlock::iterator &Iter,
                                                BinaryFunction &FromFunction,
                                                uint32_t From) {
-  auto L = FromFunction.getBinaryContext().scopeLock();
-  const size_t IndCallSiteID = Summary->IndCallDescriptions.size();
-  createIndCallDescription(FromFunction, From);
+  size_t IndCallSiteID;
+  {
+    auto L = FromFunction.getBinaryContext().scopeLock();
+    IndCallSiteID = Summary->IndCallDescriptions.size();
+    createIndCallDescription(FromFunction, From);
+  }
 
   BinaryContext &BC = FromFunction.getBinaryContext();
   bool IsTailCall = BC.MIB->isTailCall(*Iter);
@@ -305,9 +308,12 @@ void Instrumentation::instrumentIndirectTarget(BinaryBasicBlock &BB,
                  : IndCallHandlerExitBBFunction->getSymbol(),
       IndCallSiteID, &*BC.Ctx);
 
-  Iter = BB.eraseInstruction(Iter);
-  Iter = insertInstructions(CounterInstrs, BB, Iter);
-  --Iter;
+  if (!BC.isAArch64()) {
+    Iter = BB.eraseInstruction(Iter);
+    Iter = insertInstructions(CounterInstrs, BB, Iter);
+    --Iter;
+  } else
+    Iter = insertInstructions(CounterInstrs, BB, Iter);
 }
 
 bool Instrumentation::instrumentOneTarget(
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9d5a578cfbdff..4895df33bec81 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -1966,6 +1966,15 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
       convertJmpToTailCall(Inst);
   }
 
+  void createDirectBranch(MCInst &Inst, const MCSymbol *Target,
+                          MCContext *Ctx) override {
+    Inst.setOpcode(AArch64::B);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
+        Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
+        *Ctx, 0)));
+  }
+
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
                      const MCSymbol *&TBB, const MCSymbol *&FBB,
                      MCInst *&CondBranch,
@@ -2328,21 +2337,26 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   }
 
   InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
-    InstructionListType Insts(5);
     // Code sequence for instrumented indirect call handler:
+    //   ldr  x1, [sp, #16]
     //   msr  nzcv, x1
     //   ldp  x0, x1, [sp], #16
-    //   ldr  x16, [sp], #16
-    //   ldp  x0, x1, [sp], #16
-    //   br   x16
-    setSystemFlag(Insts[0], AArch64::X1);
-    createPopRegisters(Insts[1], AArch64::X0, AArch64::X1);
-    // Here we load address of the next function which should be called in the
-    // original binary to X16 register. Writing to X16 is permitted without
-    // needing to restore.
-    loadReg(Insts[2], AArch64::X16, AArch64::SP);
-    createPopRegisters(Insts[3], AArch64::X0, AArch64::X1);
-    createIndirectBranch(Insts[4], AArch64::X16, 0);
+    //   ret
+
+    InstructionListType Insts;
+
+    Insts.emplace_back();
+    loadReg(Insts.back(), AArch64::X1, AArch64::SP);
+
+    Insts.emplace_back();
+    setSystemFlag(Insts.back(), AArch64::X1);
+
+    Insts.emplace_back();
+    createPopRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+
+    Insts.emplace_back();
+    createReturn(Insts.back());
+
     return Insts;
   }
 
@@ -2418,39 +2432,69 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
                                                      MCSymbol *HandlerFuncAddr,
                                                      int CallSiteID,
                                                      MCContext *Ctx) override {
-    InstructionListType Insts;
     // Code sequence used to enter indirect call instrumentation helper:
-    //   stp x0, x1, [sp, #-16]! createPushRegisters
+    //   stp x0, x1, [sp, #-16]! createPushRegisters  (1)
     //   mov target x0  convertIndirectCallToLoad -> orr x0 target xzr
     //   mov x1 CallSiteID createLoadImmediate ->
     //   movk    x1, #0x0, lsl #48
     //   movk    x1, #0x0, lsl #32
     //   movk    x1, #0x0, lsl #16
     //   movk    x1, #0x0
-    //   stp x0, x1, [sp, #-16]!
-    //   bl *HandlerFuncAddr createIndirectCall ->
+    //   stp x0, x1, [sp, #-16]!    (2)
     //   adr x0 *HandlerFuncAddr -> adrp + add
-    //   blr x0
+    //   str x30, [sp, #-16]!  (3)
+    //   blr x0   (__bolt_instr_ind_call_handler_func)
+    //   ldr x30, sp, #16      (3)
+    //   ldp x0, x1, [sp], #16   (2)
+    //   mov x0, x0  ; move target address to used register
+    //   ldp x0, x1, [sp], #16   (1)
+
+    InstructionListType Insts;
     Insts.emplace_back();
-    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+    createPushRegisters(Insts.back(), getIntArgRegister(0),
+                        getIntArgRegister(1));
     Insts.emplace_back(CallInst);
-    convertIndirectCallToLoad(Insts.back(), AArch64::X0);
+    convertIndirectCallToLoad(Insts.back(), getIntArgRegister(0));
     InstructionListType LoadImm =
         createLoadImmediate(getIntArgRegister(1), CallSiteID);
     Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
     Insts.emplace_back();
-    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+    createPushRegisters(Insts.back(), getIntArgRegister(0),
+                        getIntArgRegister(1));
     Insts.resize(Insts.size() + 2);
-    InstructionListType Addr =
-        materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0);
+    InstructionListType Addr = materializeAddress(
+        HandlerFuncAddr, Ctx, CallInst.getOperand(0).getReg());
     assert(Addr.size() == 2 && "Invalid Addr size");
     std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
+
+    Insts.emplace_back();
+    storeReg(Insts.back(), AArch64::LR, getSpRegister(/*Size*/ 8));
+
+    Insts.emplace_back();
+    createIndirectCallInst(Insts.back(), false,
+                           CallInst.getOperand(0).getReg());
+
     Insts.emplace_back();
-    createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0);
+    loadReg(Insts.back(), AArch64::LR, getSpRegister(/*Size*/ 8));
 
-    // Carry over metadata including tail call marker if present.
-    stripAnnotations(Insts.back());
-    moveAnnotations(std::move(CallInst), Insts.back());
+    Insts.emplace_back();
+    createPopRegisters(Insts.back(), getIntArgRegister(0),
+                       getIntArgRegister(1));
+
+    // move x0 to indirect call register
+    Insts.emplace_back();
+    Insts.back().setOpcode(AArch64::ORRXrs);
+    Insts.back().insert(Insts.back().begin(),
+                        MCOperand::createReg(CallInst.getOperand(0).getReg()));
+    Insts.back().insert(Insts.back().begin() + 1,
+                        MCOperand::createReg(AArch64::XZR));
+    Insts.back().insert(Insts.back().begin() + 2,
+                        MCOperand::createReg(getIntArgRegister(0)));
+    Insts.back().insert(Insts.back().begin() + 3, MCOperand::createImm(0));
+
+    Insts.emplace_back();
+    createPopRegisters(Insts.back(), getIntArgRegister(0),
+                       getIntArgRegister(1));
 
     return Insts;
   }
@@ -2472,30 +2516,44 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     //   ldr     x30, [sp], #16
     //   b       IndCallHandler
     InstructionListType Insts;
+
     Insts.emplace_back();
-    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+    createPushRegisters(Insts.back(), getIntArgRegister(0),
+                        getIntArgRegister(1));
+
     Insts.emplace_back();
     getSystemFlag(Insts.back(), getIntArgRegister(1));
+
+    Insts.emplace_back();
+    storeReg(Insts.back(), getIntArgRegister(1), getSpRegister(/*Size*/ 8));
+
     Insts.emplace_back();
     Insts.emplace_back();
     InstructionListType Addr =
-        materializeAddress(InstrTrampoline, Ctx, AArch64::X0);
+        materializeAddress(InstrTrampoline, Ctx, getIntArgRegister(0));
     std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
     assert(Addr.size() == 2 && "Invalid Addr size");
+
     Insts.emplace_back();
-    loadReg(Insts.back(), AArch64::X0, AArch64::X0);
+    loadReg(Insts.back(), getIntArgRegister(0), getIntArgRegister(0));
+
     InstructionListType cmpJmp =
-        createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx);
+        createCmpJE(getIntArgRegister(0), 0, IndCallHandler, Ctx);
     Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
+
     Insts.emplace_back();
-    storeReg(Insts.back(), AArch64::LR, AArch64::SP);
+    storeReg(Insts.back(), AArch64::LR, getSpRegister(/*Size*/ 8));
+
     Insts.emplace_back();
     Insts.back().setOpcode(AArch64::BLR);
-    Insts.back().addOperand(MCOperand::createReg(AArch64::X0));
+    Insts.back().addOperand(MCOperand::createReg(getIntArgRegister(0)));
+
     Insts.emplace_back();
-    loadReg(Insts.back(), AArch64::LR, AArch64::SP);
+    loadReg(Insts.back(), AArch64::LR, getSpRegister(/*Size*/ 8));
+
     Insts.emplace_back();
-    createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
+    createDirectBranch(Insts.back(), IndCallHandler, Ctx);
+
     return Insts;
   }
 
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index ae356e71cbe41..a174b982cbb84 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -1668,7 +1668,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
 #if defined(__aarch64__)
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
-                       "ldp x0, x1, [sp, #288]\n"
+                       "ldp x0, x1, [sp, #320]\n"
                        "bl instrumentIndirectCall\n"
                        RESTORE_ALL
                        "ret\n"
@@ -1705,7 +1705,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
 #if defined(__aarch64__)
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
-                       "ldp x0, x1, [sp, #288]\n"
+                       "ldp x0, x1, [sp, #320]\n"
                        "bl instrumentIndirectCall\n"
                        RESTORE_ALL
                        "ret\n"

>From 4b0560fb10485e561563325d491c964c6a1d3768 Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Wed, 5 Nov 2025 14:22:29 +0300
Subject: [PATCH 02/10] [bolt][aarch64] Refactor instrumentation code snippet.
 NFC

Signed-off-by: Moksyakov Alexey <moksyakov.alexey at huawei.com>
---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 4895df33bec81..38ed4a390f747 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -1971,8 +1971,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Inst.setOpcode(AArch64::B);
     Inst.clear();
     Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
-        Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
-        *Ctx, 0)));
+        Inst, MCSymbolRefExpr::create(Target, *Ctx), *Ctx, 0)));
   }
 
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
@@ -2346,13 +2345,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     InstructionListType Insts;
 
     Insts.emplace_back();
-    loadReg(Insts.back(), AArch64::X1, AArch64::SP);
+    loadReg(Insts.back(), getIntArgRegister(1), AArch64::SP);
 
     Insts.emplace_back();
-    setSystemFlag(Insts.back(), AArch64::X1);
+    setSystemFlag(Insts.back(), getIntArgRegister(1));
 
     Insts.emplace_back();
-    createPopRegisters(Insts.back(), AArch64::X0, AArch64::X1);
+    createPopRegisters(Insts.back(), getIntArgRegister(0),
+                       getIntArgRegister(1));
 
     Insts.emplace_back();
     createReturn(Insts.back());
@@ -2503,11 +2503,12 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
                                           const MCSymbol *IndCallHandler,
                                           MCContext *Ctx) override {
-    // Code sequence used to check whether InstrTampoline was initialized
+    // Code sequence used to check whether InstrTrampoline was initialized
     // and call it if so, returns via IndCallHandler
     //   stp     x0, x1, [sp, #-16]!
     //   mrs     x1, nzcv
-    //   adr     x0, InstrTrampoline -> adrp + add
+    //   adrp    x0, InstrTrampoline
+    //   add     x0, x0, #lo12:InstrTrampoline
     //   ldr     x0, [x0]
     //   subs    x0, x0, #0x0
     //   b.eq    IndCallHandler
@@ -2537,9 +2538,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Insts.emplace_back();
     loadReg(Insts.back(), getIntArgRegister(0), getIntArgRegister(0));
 
-    InstructionListType cmpJmp =
+    InstructionListType CmpJmp =
         createCmpJE(getIntArgRegister(0), 0, IndCallHandler, Ctx);
-    Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
+    Insts.insert(Insts.end(), CmpJmp.begin(), CmpJmp.end());
 
     Insts.emplace_back();
     storeReg(Insts.back(), AArch64::LR, getSpRegister(/*Size*/ 8));

>From f5d4f48978635a4b766622ce04cb2f0871508157 Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Wed, 5 Nov 2025 14:25:43 +0300
Subject: [PATCH 03/10] [bolt][aarch64] Extend checking for indirect call
 instrumentation test

---
 .../AArch64/instrumentation-ind-call.c        | 40 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
index f9056da333b4e..f66ac5a351cea 100644
--- a/bolt/test/runtime/AArch64/instrumentation-ind-call.c
+++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
@@ -15,9 +15,47 @@ int main() {
 REQUIRES: system-linux,bolt-runtime
 
 RUN: %clang %cflags %s -o %t.exe -Wl,-q -no-pie -fpie
+RUN: llvm-objdump --disassemble-symbols=main %t.exe \
+RUN:   | FileCheck %s --check-prefix=CHECKINDIRECTREG
+
+CHECKINDIRECTREG: mov w0, #0xa
+CHECKINDIRECTREG-NEXT: mov w1, #0x14
+CHECKINDIRECTREG-NEXT: blr x8
 
 RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
-RUN:   -o %t.instrumented
+RUN:   -o %t.instrumented \
+RUN:   | FileCheck %s --check-prefix=CHECK-INSTR-LOG
+
+CHECK-INSTR-LOG: BOLT-INSTRUMENTER: Number of indirect call site descriptors: 1
+
+RUN: llvm-objdump --disassemble-symbols=main %t.instrumented \
+RUN:   | FileCheck %s --check-prefix=CHECK-INSTR-INDIRECTREG
+
+CHECK-INSTR-INDIRECTREG: mov w0, #0xa
+CHECK-INSTR-INDIRECTREG-NEXT: mov w1, #0x14
+// store current values
+CHECK-INSTR-INDIRECTREG-NEXT: stp x0, x1, {{.*}}
+// store the indirect target address in x0
+CHECK-INSTR-INDIRECTREG-NEXT: mov x0, x8
+// load callsite id into x1
+CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: stp x0, x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: adrp x8, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: add x8, {{.*}}
+// store return address, used by library handler
+CHECK-INSTR-INDIRECTREG-NEXT: str x30, {{.*}}
+// call instrumentation library handler function
+CHECK-INSTR-INDIRECTREG-NEXT: blr x8
+// restore registers saved before
+CHECK-INSTR-INDIRECTREG-NEXT: ldr x30, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: ldp x0, x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: mov x8, x0
+CHECK-INSTR-INDIRECTREG-NEXT: ldp x0, x1, {{.*}}
+// original indirect call instruction
+CHECK-INSTR-INDIRECTREG-NEXT: blr x8
 
 # Instrumented program needs to finish returning zero
 RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT

>From a7e1e2c801bbf9d506fe56babafda9bf1971ee63 Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Wed, 5 Nov 2025 16:21:35 +0300
Subject: [PATCH 04/10] [bolt][aarch64] Sync comment with code snippet for
 indirect call. NFC

Signed-off-by: Moksyakov Alexey <moksyakov.alexey at huawei.com>
---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 38ed4a390f747..69df630ff17f1 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2507,6 +2507,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     // and call it if so, returns via IndCallHandler
     //   stp     x0, x1, [sp, #-16]!
     //   mrs     x1, nzcv
+    //   str     x1, [sp, #-16]!
     //   adrp    x0, InstrTrampoline
     //   add     x0, x0, #lo12:InstrTrampoline
     //   ldr     x0, [x0]

>From 9eede6946bb6e40027c3a804cfc5ad77f91ba018 Mon Sep 17 00:00:00 2001
From: Moksyakov Alexey <moksyakov.alexey at huawei.com>
Date: Thu, 6 Nov 2025 13:55:30 +0000
Subject: [PATCH 05/10] [bolt][aarch64] Change indirect call handler snippet

Signed-off-by: Moksyakov Alexey <moksyakov.alexey at huawei.com>
---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 69df630ff17f1..7526796d16b5c 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2509,8 +2509,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     //   mrs     x1, nzcv
     //   str     x1, [sp, #-16]!
     //   adrp    x0, InstrTrampoline
-    //   add     x0, x0, #lo12:InstrTrampoline
-    //   ldr     x0, [x0]
+    //   ldr     x0, [x0, #lo12:InstrTrampoline]
     //   subs    x0, x0, #0x0
     //   b.eq    IndCallHandler
     //   str     x30, [sp, #-16]!
@@ -2529,15 +2528,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Insts.emplace_back();
     storeReg(Insts.back(), getIntArgRegister(1), getSpRegister(/*Size*/ 8));
 
-    Insts.emplace_back();
-    Insts.emplace_back();
-    InstructionListType Addr =
-        materializeAddress(InstrTrampoline, Ctx, getIntArgRegister(0));
-    std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
-    assert(Addr.size() == 2 && "Invalid Addr size");
+    // load handler address
+    MCInst InstAdrp;
+    InstAdrp.setOpcode(AArch64::ADRP);
+    InstAdrp.addOperand(MCOperand::createReg(getIntArgRegister(0)));
+    InstAdrp.addOperand(MCOperand::createImm(0));
+    setOperandToSymbolRef(InstAdrp, /* OpNum */ 1, InstrTrampoline,
+                          /* Addend */ 0, Ctx, ELF::R_AARCH64_ADR_GOT_PAGE);
+    Insts.emplace_back(InstAdrp);
 
-    Insts.emplace_back();
-    loadReg(Insts.back(), getIntArgRegister(0), getIntArgRegister(0));
+    MCInst InstLoad;
+    InstLoad.setOpcode(AArch64::LDRXui);
+    InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0)));
+    InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0)));
+    InstLoad.addOperand(MCOperand::createImm(0));
+    setOperandToSymbolRef(InstLoad, /* OpNum */ 2, InstrTrampoline,
+                          /* Addend */ 0, Ctx, ELF::R_AARCH64_LD64_GOT_LO12_NC);
+    Insts.emplace_back(InstLoad);
 
     InstructionListType CmpJmp =
         createCmpJE(getIntArgRegister(0), 0, IndCallHandler, Ctx);

>From cc807f967e42357f77f9138ab849345da389c5c1 Mon Sep 17 00:00:00 2001
From: Moksyakov Alexey <moksyakov.alexey at huawei.com>
Date: Thu, 6 Nov 2025 14:46:29 +0000
Subject: [PATCH 06/10] [bolt][aarch64] Change indirect call instrumentation
 snippet

Remove redundant load/store instructions in entryBB/exitBB
indirect call instrumentation snippet, move msr/mrs to
SAVE_ALL/RESTORE_ALL

Signed-off-by: Moksyakov Alexey <moksyakov.alexey at huawei.com>
---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 16 ----------------
 bolt/runtime/instr.cpp                           |  4 ++--
 bolt/runtime/sys_aarch64.h                       |  6 ++++--
 3 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 7526796d16b5c..d83bc401f9929 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2337,19 +2337,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
     // Code sequence for instrumented indirect call handler:
-    //   ldr  x1, [sp, #16]
-    //   msr  nzcv, x1
     //   ldp  x0, x1, [sp], #16
     //   ret
 
     InstructionListType Insts;
 
-    Insts.emplace_back();
-    loadReg(Insts.back(), getIntArgRegister(1), AArch64::SP);
-
-    Insts.emplace_back();
-    setSystemFlag(Insts.back(), getIntArgRegister(1));
-
     Insts.emplace_back();
     createPopRegisters(Insts.back(), getIntArgRegister(0),
                        getIntArgRegister(1));
@@ -2506,8 +2498,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     // Code sequence used to check whether InstrTrampoline was initialized
     // and call it if so, returns via IndCallHandler
     //   stp     x0, x1, [sp, #-16]!
-    //   mrs     x1, nzcv
-    //   str     x1, [sp, #-16]!
     //   adrp    x0, InstrTrampoline
     //   ldr     x0, [x0, #lo12:InstrTrampoline]
     //   subs    x0, x0, #0x0
@@ -2522,12 +2512,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     createPushRegisters(Insts.back(), getIntArgRegister(0),
                         getIntArgRegister(1));
 
-    Insts.emplace_back();
-    getSystemFlag(Insts.back(), getIntArgRegister(1));
-
-    Insts.emplace_back();
-    storeReg(Insts.back(), getIntArgRegister(1), getSpRegister(/*Size*/ 8));
-
     // load handler address
     MCInst InstAdrp;
     InstAdrp.setOpcode(AArch64::ADRP);
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index a174b982cbb84..90ace77718603 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -1668,7 +1668,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
 #if defined(__aarch64__)
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
-                       "ldp x0, x1, [sp, #320]\n"
+                       "ldp x0, x1, [sp, #304]\n"
                        "bl instrumentIndirectCall\n"
                        RESTORE_ALL
                        "ret\n"
@@ -1705,7 +1705,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
 #if defined(__aarch64__)
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
-                       "ldp x0, x1, [sp, #320]\n"
+                       "ldp x0, x1, [sp, #304]\n"
                        "bl instrumentIndirectCall\n"
                        RESTORE_ALL
                        "ret\n"
diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h
index 77c9cfcc99f98..1875e0620678e 100644
--- a/bolt/runtime/sys_aarch64.h
+++ b/bolt/runtime/sys_aarch64.h
@@ -18,10 +18,12 @@
   "stp x24, x25, [sp, #-16]!\n"                                                \
   "stp x26, x27, [sp, #-16]!\n"                                                \
   "stp x28, x29, [sp, #-16]!\n"                                                \
-  "str x30, [sp,#-16]!\n"
+  "mrs x29, nzcv\n"                                                            \
+  "stp x29, x30, [sp, #-16]!\n"
 // Mirrors SAVE_ALL
 #define RESTORE_ALL                                                            \
-  "ldr x30, [sp], #16\n"                                                       \
+  "ldp x29, x30, [sp], #16\n"                                                  \
+  "msr nzcv, x29\n"                                                            \
   "ldp x28, x29, [sp], #16\n"                                                  \
   "ldp x26, x27, [sp], #16\n"                                                  \
   "ldp x24, x25, [sp], #16\n"                                                  \

>From 1d9511cb4dfd998880d02de48b2f5899517b0566 Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Thu, 6 Nov 2025 19:03:10 +0300
Subject: [PATCH 07/10] [bolt][aarch64] set unused for set/get system flag

---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index d83bc401f9929..3d962244a0da6 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -38,14 +38,14 @@ using namespace bolt;
 
 namespace {
 
-static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
+[[maybe_unused]] static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
   Inst.setOpcode(AArch64::MRS);
   Inst.clear();
   Inst.addOperand(MCOperand::createReg(RegName));
   Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));
 }
 
-static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) {
+[[maybe_unused]] static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) {
   Inst.setOpcode(AArch64::MSR);
   Inst.clear();
   Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));

>From 411c273a016e60c5142e4a185017bce4fd7a7444 Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Fri, 7 Nov 2025 11:08:53 +0300
Subject: [PATCH 08/10] [bolt][aarch64] Remove redundant stp/ldp instr for
 indirect call instrumentation

---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 3d962244a0da6..11ab54661a948 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2337,15 +2337,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
     // Code sequence for instrumented indirect call handler:
-    //   ldp  x0, x1, [sp], #16
     //   ret
 
     InstructionListType Insts;
 
-    Insts.emplace_back();
-    createPopRegisters(Insts.back(), getIntArgRegister(0),
-                       getIntArgRegister(1));
-
     Insts.emplace_back();
     createReturn(Insts.back());
 
@@ -2497,7 +2492,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
                                           MCContext *Ctx) override {
     // Code sequence used to check whether InstrTrampoline was initialized
     // and call it if so, returns via IndCallHandler
-    //   stp     x0, x1, [sp, #-16]!
     //   adrp    x0, InstrTrampoline
     //   ldr     x0, [x0, #lo12:InstrTrampoline]
     //   subs    x0, x0, #0x0
@@ -2508,10 +2502,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     //   b       IndCallHandler
     InstructionListType Insts;
 
-    Insts.emplace_back();
-    createPushRegisters(Insts.back(), getIntArgRegister(0),
-                        getIntArgRegister(1));
-
     // load handler address
     MCInst InstAdrp;
     InstAdrp.setOpcode(AArch64::ADRP);

>From 31ff53fbb7e83591b32a7796d83eab3c03dfb86c Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Fri, 7 Nov 2025 11:10:01 +0300
Subject: [PATCH 09/10] [bolt][aarch64] Change stack address for indirect call
 instrumentation

---
 bolt/runtime/instr.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index 90ace77718603..ae356e71cbe41 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -1668,7 +1668,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
 #if defined(__aarch64__)
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
-                       "ldp x0, x1, [sp, #304]\n"
+                       "ldp x0, x1, [sp, #288]\n"
                        "bl instrumentIndirectCall\n"
                        RESTORE_ALL
                        "ret\n"
@@ -1705,7 +1705,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
 #if defined(__aarch64__)
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
-                       "ldp x0, x1, [sp, #304]\n"
+                       "ldp x0, x1, [sp, #288]\n"
                        "bl instrumentIndirectCall\n"
                        RESTORE_ALL
                        "ret\n"

>From 5effeb731082c3a3d69fb73b9802208023bba622 Mon Sep 17 00:00:00 2001
From: Alexey Moksyakov <yavtuk at yandex.ru>
Date: Tue, 18 Nov 2025 14:37:02 +0300
Subject: [PATCH 10/10] [bolt][aarch64] Change indirect call instrumentation
 snippet

The target address and the link register are pushed onto the stack, and the call site id is moved into the X1 register directly, because the __bolt_instr_ind_call_handler_func/__bolt_instr_ind_tailcall_handler_func hooks use the X0 register to load the handler from the instrumentation library.
---
 bolt/test/runtime/AArch64/instrumentation-ind-call.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
index f66ac5a351cea..2020735bbc82a 100644
--- a/bolt/test/runtime/AArch64/instrumentation-ind-call.c
+++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
@@ -42,16 +42,13 @@ CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
 CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
 CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
 CHECK-INSTR-INDIRECTREG-NEXT: movk x1, {{.*}}
-CHECK-INSTR-INDIRECTREG-NEXT: stp x0, x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: stp x0, x30, {{.*}}
 CHECK-INSTR-INDIRECTREG-NEXT: adrp x8, {{.*}}
 CHECK-INSTR-INDIRECTREG-NEXT: add x8, {{.*}}
-// store return address, used by library handler
-CHECK-INSTR-INDIRECTREG-NEXT: str x30, {{.*}}
 // call instrumentation library handler function
 CHECK-INSTR-INDIRECTREG-NEXT: blr x8
 // restore registers saved before
-CHECK-INSTR-INDIRECTREG-NEXT: ldr x30, {{.*}}
-CHECK-INSTR-INDIRECTREG-NEXT: ldp x0, x1, {{.*}}
+CHECK-INSTR-INDIRECTREG-NEXT: ldp x0, x30, {{.*}}
 CHECK-INSTR-INDIRECTREG-NEXT: mov x8, x0
 CHECK-INSTR-INDIRECTREG-NEXT: ldp x0, x1, {{.*}}
 // original indirect call instruction



More information about the llvm-commits mailing list