[llvm] [BOLT][AArch64] Handle OpNegateRAState to enable optimizing binaries with pac-ret hardening (PR #120064)

Gergely Bálint via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 10 06:39:38 PDT 2025


https://github.com/bgergely0 updated https://github.com/llvm/llvm-project/pull/120064

>From 9d8186c1fe6edad92420e6a046875c6ac0c41eb5 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Tue, 19 Nov 2024 09:43:25 +0100
Subject: [PATCH 01/25] [BOLT] Recognize paciasp and autiasp instructions

---
 bolt/include/bolt/Core/MCPlusBuilder.h           | 7 +++++++
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index f902a8c43cd1d..0c25ba52ce05e 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -815,6 +815,13 @@ class MCPlusBuilder {
     llvm_unreachable("not implemented");
     return false;
   }
+  virtual bool isPAuth(MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+  }
+
+  virtual bool isPSign(MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+  }
 
   virtual bool isCleanRegXOR(const MCInst &Inst) const {
     llvm_unreachable("not implemented");
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 973261765f951..2266733f956ee 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -894,6 +894,12 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     }
     return false;
   }
+  bool isPAuth(MCInst &Inst) const override {
+    return Inst.getOpcode() == AArch64::AUTIASP;
+  }
+  bool isPSign(MCInst &Inst) const override {
+    return Inst.getOpcode() == AArch64::PACIASP;
+  }
 
   bool isRegToRegMove(const MCInst &Inst, MCPhysReg &From,
                       MCPhysReg &To) const override {

>From c22e4d9ad413fcb1997546aee4b9096ca113459a Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 16 Dec 2024 10:45:41 +0100
Subject: [PATCH 02/25] [BOLT] Support for OpNegateRAState

- save OpNegateRAState, RememberState and RestoreState locations
  when parsing input
- determine the RA states from these before other optimizations
  in MarkRAStates Pass
- after optimizations, we can insert OpNegateRAState at state
  boundaries and other needed locations (e.g. split functions)
  in InsertNegateRAStatePass
---
 bolt/include/bolt/Core/BinaryFunction.h       |  45 ++++++
 bolt/include/bolt/Core/MCPlus.h               |   9 +-
 bolt/include/bolt/Core/MCPlusBuilder.h        |  59 ++++++++
 .../bolt/Passes/InsertNegateRAStatePass.h     |  44 ++++++
 bolt/include/bolt/Passes/MarkRAStates.h       |  33 ++++
 bolt/lib/Core/BinaryBasicBlock.cpp            |   6 +-
 bolt/lib/Core/BinaryFunction.cpp              |   1 +
 bolt/lib/Core/Exceptions.cpp                  |  24 ++-
 bolt/lib/Core/MCPlusBuilder.cpp               |  68 +++++++++
 bolt/lib/Passes/CMakeLists.txt                |   2 +
 bolt/lib/Passes/InsertNegateRAStatePass.cpp   | 142 ++++++++++++++++++
 bolt/lib/Passes/MarkRAStates.cpp              | 133 ++++++++++++++++
 bolt/lib/Rewrite/BinaryPassManager.cpp        |   7 +
 13 files changed, 567 insertions(+), 6 deletions(-)
 create mode 100644 bolt/include/bolt/Passes/InsertNegateRAStatePass.h
 create mode 100644 bolt/include/bolt/Passes/MarkRAStates.h
 create mode 100644 bolt/lib/Passes/InsertNegateRAStatePass.cpp
 create mode 100644 bolt/lib/Passes/MarkRAStates.cpp

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index ae580520b9110..ebe12929c0973 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -1646,6 +1646,51 @@ class BinaryFunction {
 
   void setHasInferredProfile(bool Inferred) { HasInferredProfile = Inferred; }
 
+  /// Find corrected offset the same way addCFIInstruction does it to skip NOPs.
+  std::optional<uint64_t> getCorrectedCFIOffset(uint64_t Offset) {
+    assert(!Instructions.empty());
+    auto I = Instructions.lower_bound(Offset);
+    if (Offset == getSize()) {
+      assert(I == Instructions.end() && "unexpected iterator value");
+      // Sometimes compiler issues restore_state after all instructions
+      // in the function (even after nop).
+      --I;
+      Offset = I->first;
+    }
+    assert(I->first == Offset && "CFI pointing to unknown instruction");
+    if (I == Instructions.begin())
+      return {};
+
+    --I;
+    while (I != Instructions.begin() && BC.MIB->isNoop(I->second)) {
+      Offset = I->first;
+      --I;
+    }
+    return Offset;
+  }
+
+  void setInstModifiesRAState(uint8_t CFIOpcode, uint64_t Offset) {
+    std::optional<uint64_t> CorrectedOffset = getCorrectedCFIOffset(Offset);
+    if (CorrectedOffset) {
+      auto I = Instructions.lower_bound(*CorrectedOffset);
+      I--;
+
+      switch (CFIOpcode) {
+      case dwarf::DW_CFA_AARCH64_negate_ra_state:
+        BC.MIB->setNegateRAState(I->second);
+        break;
+      case dwarf::DW_CFA_remember_state:
+        BC.MIB->setRememberState(I->second);
+        break;
+      case dwarf::DW_CFA_restore_state:
+        BC.MIB->setRestoreState(I->second);
+        break;
+      default:
+        assert(0 && "CFI Opcode not covered by function");
+      }
+    }
+  }
+
   void addCFIInstruction(uint64_t Offset, MCCFIInstruction &&Inst) {
     assert(!Instructions.empty());
 
diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h
index 601d709712864..a95bba36c5a6e 100644
--- a/bolt/include/bolt/Core/MCPlus.h
+++ b/bolt/include/bolt/Core/MCPlus.h
@@ -72,7 +72,14 @@ class MCAnnotation {
     kLabel,               /// MCSymbol pointing to this instruction.
     kSize,                /// Size of the instruction.
     kDynamicBranch,       /// Jit instruction patched at runtime.
-    kGeneric              /// First generic annotation.
+    kSigning,             /// Inst is a signing instruction (paciasp, etc.).
+    kSigned,              /// Inst is in a range where RA is signed.
+    kAuthenticating,      /// Authenticating inst (e.g. autiasp).
+    kUnsigned,            /// Inst is in a range where RA is unsigned.
+    kRememberState,       /// Inst has rememberState CFI.
+    kRestoreState,        /// Inst has restoreState CFI.
+    kNegateState,         /// Inst has OpNegateRAState CFI.
+    kGeneric,             /// First generic annotation.
   };
 
   virtual void print(raw_ostream &OS) const = 0;
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 0c25ba52ce05e..05ae63cc63cb9 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -68,6 +68,20 @@ class MCPlusBuilder {
 public:
   using AllocatorIdTy = uint16_t;
 
+  std::optional<int64_t> getAnnotationAtOpIndex(const MCInst &Inst,
+                                                unsigned OpIndex) const {
+    std::optional<unsigned> FirstAnnotationOp = getFirstAnnotationOpIndex(Inst);
+    if (!FirstAnnotationOp)
+      return std::nullopt;
+
+    if (*FirstAnnotationOp > OpIndex || Inst.getNumOperands() < OpIndex)
+      return std::nullopt;
+
+    auto Op = Inst.begin() + OpIndex;
+    const int64_t ImmValue = Op->getImm();
+    return extractAnnotationIndex(ImmValue);
+  }
+
 private:
   /// A struct that represents a single annotation allocator
   struct AnnotationAllocator {
@@ -1296,6 +1310,51 @@ class MCPlusBuilder {
   /// Return true if the instruction is a tail call.
   bool isTailCall(const MCInst &Inst) const;
 
+  /// Stores NegateRAState annotation on \p Inst.
+  void setNegateRAState(MCInst &Inst) const;
+
+  /// Return true if \p Inst has NegateRAState annotation.
+  bool hasNegateRAState(const MCInst &Inst) const;
+
+  /// Sets RememberState annotation on \p Inst.
+  void setRememberState(MCInst &Inst) const;
+
+  /// Return true if \p Inst has RememberState annotation.
+  bool hasRememberState(const MCInst &Inst) const;
+
+  /// Stores RestoreState annotation on \p Inst.
+  void setRestoreState(MCInst &Inst) const;
+
+  /// Return true if \p Inst has RestoreState annotation.
+  bool hasRestoreState(const MCInst &Inst) const;
+
+  /// Stores RA Signed annotation on \p Inst.
+  void setRASigned(MCInst &Inst) const;
+
+  /// Return true if \p Inst has Signed RA annotation.
+  bool isRASigned(const MCInst &Inst) const;
+
+  /// Stores RA Signing annotation on \p Inst.
+  void setRASigning(MCInst &Inst) const;
+
+  /// Return true if \p Inst has Signing RA annotation.
+  bool isRASigning(const MCInst &Inst) const;
+
+  /// Stores Authenticating annotation on \p Inst.
+  void setAuthenticating(MCInst &Inst) const;
+
+  /// Return true if \p Inst has Authenticating annotation.
+  bool isAuthenticating(const MCInst &Inst) const;
+
+  /// Stores RA Unsigned annotation on \p Inst.
+  void setRAUnsigned(MCInst &Inst) const;
+
+  /// Return true if \p Inst has Unsigned RA annotation.
+  bool isRAUnsigned(const MCInst &Inst) const;
+
+  /// Return true if \p Inst doesn't have any annotation related to RA state.
+  bool isRAStateUnknown(const MCInst &Inst) const;
+
   /// Return true if the instruction is a call with an exception handling info.
   virtual bool isInvoke(const MCInst &Inst) const {
     return isCall(Inst) && getEHInfo(Inst);
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
new file mode 100644
index 0000000000000..e62006baa2eff
--- /dev/null
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -0,0 +1,44 @@
+//===- bolt/Passes/InsertNegateRAStatePass.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the InsertNegateRAStatePass class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef BOLT_PASSES_INSERT_NEGATE_RA_STATE_PASS
+#define BOLT_PASSES_INSERT_NEGATE_RA_STATE_PASS
+
+#include "bolt/Passes/BinaryPasses.h"
+#include <stack>
+
+namespace llvm {
+namespace bolt {
+
+class InsertNegateRAState : public BinaryFunctionPass {
+public:
+  explicit InsertNegateRAState() : BinaryFunctionPass(false) {}
+
+  const char *getName() const override { return "insert-negate-ra-state-pass"; }
+
+  /// Pass entry point
+  Error runOnFunctions(BinaryContext &BC) override;
+  void runOnFunction(BinaryFunction &BF);
+
+private:
+  /// Loops over all instructions and adds OpNegateRAState CFI
+  /// after any pointer signing or authenticating instructions.
+  /// Returns true, if any OpNegateRAState CFIs were added.
+  bool addNegateRAStateAfterPacOrAuth(BinaryFunction &BF);
+  /// Because states are tracked as MCAnnotations on individual instructions,
+  /// newly inserted instructions do not have a state associated with them.
+  /// New states are "inherited" from the last known state.
+  void fixUnknownStates(BinaryFunction &BF);
+};
+
+} // namespace bolt
+} // namespace llvm
+#endif
diff --git a/bolt/include/bolt/Passes/MarkRAStates.h b/bolt/include/bolt/Passes/MarkRAStates.h
new file mode 100644
index 0000000000000..e7a49f813b6a7
--- /dev/null
+++ b/bolt/include/bolt/Passes/MarkRAStates.h
@@ -0,0 +1,33 @@
+//===- bolt/Passes/MarkRAStates.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MarkRAStates class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef BOLT_PASSES_MARK_RA_STATES
+#define BOLT_PASSES_MARK_RA_STATES
+
+#include "bolt/Passes/BinaryPasses.h"
+
+namespace llvm {
+namespace bolt {
+
+class MarkRAStates : public BinaryFunctionPass {
+public:
+  explicit MarkRAStates() : BinaryFunctionPass(false) {}
+
+  const char *getName() const override { return "mark-ra-states"; }
+
+  /// Pass entry point
+  Error runOnFunctions(BinaryContext &BC) override;
+  void runOnFunction(BinaryFunction &BF);
+};
+
+} // namespace bolt
+} // namespace llvm
+#endif
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index 311d5c15b8dca..22126eed67a6c 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -201,7 +201,11 @@ int32_t BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Instr) const {
       InstrSeen = (&Inst == Instr);
       continue;
     }
-    if (Function->getBinaryContext().MIB->isCFI(Inst)) {
+    // Ignoring OpNegateRAState CFIs here, as they dont have a "State"
+    // number associated with them.
+    if (Function->getBinaryContext().MIB->isCFI(Inst) &&
+        (Function->getCFIFor(Inst)->getOperation() !=
+         MCCFIInstruction::OpNegateRAState)) {
       LastCFI = &Inst;
       break;
     }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index eec68ff5a5fce..7a044104f3b4e 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -2804,6 +2804,7 @@ struct CFISnapshot {
   void advanceTo(int32_t State) {
     for (int32_t I = CurState, E = State; I != E; ++I) {
       const MCCFIInstruction &Instr = FDE[I];
+      assert(Instr.getOperation() != MCCFIInstruction::OpNegateRAState);
       if (Instr.getOperation() != MCCFIInstruction::OpRestoreState) {
         update(Instr, I);
         continue;
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 0b2e63b8ca6a7..63b7ad43b1dec 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -568,10 +568,21 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
     case DW_CFA_remember_state:
       Function.addCFIInstruction(
           Offset, MCCFIInstruction::createRememberState(nullptr));
+
+      if (Function.getBinaryContext().isAArch64())
+        // Support for pointer authentication:
+        // We need to annotate instructions that modify the RA State, to work
+        // out the state of each instruction in MarkRAStates Pass.
+        Function.setInstModifiesRAState(DW_CFA_remember_state, Offset);
       break;
     case DW_CFA_restore_state:
       Function.addCFIInstruction(Offset,
                                  MCCFIInstruction::createRestoreState(nullptr));
+      if (Function.getBinaryContext().isAArch64())
+        // Support for pointer authentication:
+        // We need to annotate instructions that modify the RA State, to work
+        // out the state of each instruction in MarkRAStates Pass.
+        Function.setInstModifiesRAState(DW_CFA_restore_state, Offset);
       break;
     case DW_CFA_def_cfa:
       Function.addCFIInstruction(
@@ -629,11 +640,16 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
         BC.errs() << "BOLT-WARNING: DW_CFA_MIPS_advance_loc unimplemented\n";
       return false;
     case DW_CFA_GNU_window_save:
-      // DW_CFA_GNU_window_save and DW_CFA_GNU_NegateRAState just use the same
-      // id but mean different things. The latter is used in AArch64.
+      // DW_CFA_GNU_window_save and DW_CFA_AARCH64_negate_ra_state just use the
+      // same id but mean different things. The latter is used in AArch64.
       if (Function.getBinaryContext().isAArch64()) {
-        Function.addCFIInstruction(
-            Offset, MCCFIInstruction::createNegateRAState(nullptr));
+        // The location OpNegateRAState CFIs are needed
+        // depends on the order of BasicBlocks, which changes during
+        // optimizations. Instead of adding OpNegateRAState CFIs, an annotation
+        // is added to the instruction, to mark that the instruction modifies
+        // the RA State. The actual state for instructions are worked out in
+        // MarkRAStates based on these annotations.
+        Function.setInstModifiesRAState(DW_CFA_AARCH64_negate_ra_state, Offset);
         break;
       }
       if (opts::Verbosity >= 1)
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index fa8f4d1df308b..aa2a784442a5f 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -149,6 +149,74 @@ bool MCPlusBuilder::isTailCall(const MCInst &Inst) const {
   return false;
 }
 
+void MCPlusBuilder::setNegateRAState(MCInst &Inst) const {
+  assert(!hasAnnotation(Inst, MCAnnotation::kNegateState));
+  setAnnotationOpValue(Inst, MCAnnotation::kNegateState, true);
+}
+
+bool MCPlusBuilder::hasNegateRAState(const MCInst &Inst) const {
+  return hasAnnotation(Inst, MCAnnotation::kNegateState);
+}
+
+void MCPlusBuilder::setRememberState(MCInst &Inst) const {
+  assert(!hasAnnotation(Inst, MCAnnotation::kRememberState));
+  setAnnotationOpValue(Inst, MCAnnotation::kRememberState, true);
+}
+
+bool MCPlusBuilder::hasRememberState(const MCInst &Inst) const {
+  return hasAnnotation(Inst, MCAnnotation::kRememberState);
+}
+
+void MCPlusBuilder::setRestoreState(MCInst &Inst) const {
+  assert(!hasAnnotation(Inst, MCAnnotation::kRestoreState));
+  setAnnotationOpValue(Inst, MCAnnotation::kRestoreState, true);
+}
+
+bool MCPlusBuilder::hasRestoreState(const MCInst &Inst) const {
+  return hasAnnotation(Inst, MCAnnotation::kRestoreState);
+}
+
+void MCPlusBuilder::setRASigned(MCInst &Inst) const {
+  assert(!hasAnnotation(Inst, MCAnnotation::kSigned));
+  setAnnotationOpValue(Inst, MCAnnotation::kSigned, true);
+}
+
+bool MCPlusBuilder::isRASigned(const MCInst &Inst) const {
+  return hasAnnotation(Inst, MCAnnotation::kSigned);
+}
+
+void MCPlusBuilder::setRASigning(MCInst &Inst) const {
+  assert(!hasAnnotation(Inst, MCAnnotation::kSigning));
+  setAnnotationOpValue(Inst, MCAnnotation::kSigning, true);
+}
+
+bool MCPlusBuilder::isRASigning(const MCInst &Inst) const {
+  return hasAnnotation(Inst, MCAnnotation::kSigning);
+}
+
+void MCPlusBuilder::setAuthenticating(MCInst &Inst) const {
+  assert(!hasAnnotation(Inst, MCAnnotation::kAuthenticating));
+  setAnnotationOpValue(Inst, MCAnnotation::kAuthenticating, true);
+}
+
+bool MCPlusBuilder::isAuthenticating(const MCInst &Inst) const {
+  return hasAnnotation(Inst, MCAnnotation::kAuthenticating);
+}
+
+void MCPlusBuilder::setRAUnsigned(MCInst &Inst) const {
+  assert(!hasAnnotation(Inst, MCAnnotation::kUnsigned));
+  setAnnotationOpValue(Inst, MCAnnotation::kUnsigned, true);
+}
+
+bool MCPlusBuilder::isRAUnsigned(const MCInst &Inst) const {
+  return hasAnnotation(Inst, MCAnnotation::kUnsigned);
+}
+
+bool MCPlusBuilder::isRAStateUnknown(const MCInst &Inst) const {
+  return !(isRAUnsigned(Inst) || isRASigned(Inst) || isRASigning(Inst) ||
+           isAuthenticating(Inst));
+}
+
 std::optional<MCLandingPad> MCPlusBuilder::getEHInfo(const MCInst &Inst) const {
   if (!isCall(Inst))
     return std::nullopt;
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index 77d2bb9c2bcb5..d7519518f186f 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -17,12 +17,14 @@ add_llvm_library(LLVMBOLTPasses
   IdenticalCodeFolding.cpp
   IndirectCallPromotion.cpp
   Inliner.cpp
+  InsertNegateRAStatePass.cpp
   Instrumentation.cpp
   JTFootprintReduction.cpp
   LongJmp.cpp
   LoopInversionPass.cpp
   LivenessAnalysis.cpp
   MCF.cpp
+  MarkRAStates.cpp
   PatchEntries.cpp
   PAuthGadgetScanner.cpp
   PettisAndHansen.cpp
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
new file mode 100644
index 0000000000000..098ebf8f953b4
--- /dev/null
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -0,0 +1,142 @@
+//===- bolt/Passes/InsertNegateRAStatePass.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the InsertNegateRAStatePass class. It inserts
+// OpNegateRAState CFIs to places where the state of two consecutive
+// instructions are different.
+//
+//===----------------------------------------------------------------------===//
+#include "bolt/Passes/InsertNegateRAStatePass.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "bolt/Utils/CommandLineOpts.h"
+#include <cstdlib>
+#include <fstream>
+#include <iterator>
+
+using namespace llvm;
+
+namespace llvm {
+namespace bolt {
+
+void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+
+  if (BF.getState() == BinaryFunction::State::Empty)
+    return;
+
+  if (BF.getState() != BinaryFunction::State::CFG &&
+      BF.getState() != BinaryFunction::State::CFG_Finalized) {
+    BC.outs() << "BOLT-INFO: No CFG for " << BF.getPrintName()
+              << " in InsertNegateRAStatePass\n";
+    return;
+  }
+
+  // If none is inserted, the function doesn't need more work.
+  if (!addNegateRAStateAfterPacOrAuth(BF))
+    return;
+
+  fixUnknownStates(BF);
+
+  bool FirstIter = true;
+  MCInst PrevInst;
+  BinaryBasicBlock *PrevBB = nullptr;
+  auto *Begin = BF.getLayout().block_begin();
+  auto *End = BF.getLayout().block_end();
+  for (auto *BB = Begin; BB != End; BB++) {
+
+    // Support for function splitting:
+    // if two consecutive BBs are going to end up in different functions,
+    // we have to negate the RA State, so the new function starts with a Signed
+    // state.
+    if (PrevBB != nullptr &&
+        PrevBB->getFragmentNum() != (*BB)->getFragmentNum() &&
+        BC.MIB->isRASigned(*((*BB)->begin()))) {
+      BF.addCFIInstruction(*BB, (*BB)->begin(),
+                           MCCFIInstruction::createNegateRAState(nullptr));
+    }
+
+    for (auto It = (*BB)->begin(); It != (*BB)->end(); ++It) {
+
+      MCInst &Inst = *It;
+      if (BC.MIB->isCFI(Inst))
+        continue;
+
+      if (!FirstIter) {
+        if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) ||
+            (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
+
+          It = BF.addCFIInstruction(
+              *BB, It, MCCFIInstruction::createNegateRAState(nullptr));
+        }
+
+      } else {
+        FirstIter = false;
+      }
+      PrevInst = *It;
+    }
+    PrevBB = *BB;
+  }
+}
+
+bool InsertNegateRAState::addNegateRAStateAfterPacOrAuth(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  bool FoundAny = false;
+  for (BinaryBasicBlock &BB : BF) {
+    for (auto Iter = BB.begin(); Iter != BB.end(); ++Iter) {
+      MCInst &Inst = *Iter;
+      if (BC.MIB->isPSign(Inst) || BC.MIB->isPAuth(Inst)) {
+        Iter = BF.addCFIInstruction(
+            &BB, Iter + 1, MCCFIInstruction::createNegateRAState(nullptr));
+        FoundAny = true;
+      }
+    }
+  }
+  return FoundAny;
+}
+
+void InsertNegateRAState::fixUnknownStates(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  bool FirstIter = true;
+  MCInst PrevInst;
+  for (BinaryBasicBlock &BB : BF) {
+    for (auto It = BB.begin(); It != BB.end(); ++It) {
+
+      MCInst &Inst = *It;
+      if (BC.MIB->isCFI(Inst))
+        continue;
+
+      if (!FirstIter && BC.MIB->isRAStateUnknown(Inst)) {
+        if (BC.MIB->isRASigned(PrevInst) || BC.MIB->isRASigning(PrevInst)) {
+          BC.MIB->setRASigned(Inst);
+        } else if (BC.MIB->isRAUnsigned(PrevInst) ||
+                   BC.MIB->isAuthenticating(PrevInst)) {
+          BC.MIB->setRAUnsigned(Inst);
+        }
+      } else {
+        FirstIter = false;
+      }
+      PrevInst = Inst;
+    }
+  }
+}
+
+Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
+  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+    runOnFunction(BF);
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
+      "InsertNegateRAStatePass");
+
+  return Error::success();
+}
+
+} // end namespace bolt
+} // end namespace llvm
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
new file mode 100644
index 0000000000000..adccf2090c36f
--- /dev/null
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -0,0 +1,133 @@
+//===- bolt/Passes/MarkRAStates.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MarkRAStates class.
+// Three CFIs have an influence on the RA State of an instruction:
+// - NegateRAState flips the RA State,
+// - RememberState pushes the RA State to a stack,
+// - RestoreState pops the RA State from the stack.
+// These are saved as MCAnnotations on instructions they refer to at CFI
+// reading (in CFIReaderWriter::fillCFIInfoFor). In this pass, we can work out
+// the RA State of each instruction, and save it as new MCAnnotations. The new
+// annotations are Signing, Signed, Authenticating and Unsigned. After
+// optimizations, .cfi_negate_ra_state CFIs are added to the places where the
+// state changes in InsertNegateRAStatePass.
+//
+//===----------------------------------------------------------------------===//
+#include "bolt/Passes/MarkRAStates.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "bolt/Utils/CommandLineOpts.h"
+#include <cstdlib>
+#include <fstream>
+#include <iterator>
+
+#include <iostream>
+#include <optional>
+#include <stack>
+
+using namespace llvm;
+
+namespace llvm {
+namespace bolt {
+
+void MarkRAStates::runOnFunction(BinaryFunction &BF) {
+
+  if (BF.isIgnored())
+    return;
+
+  BinaryContext &BC = BF.getBinaryContext();
+
+  for (BinaryBasicBlock &BB : BF) {
+    for (auto It = BB.begin(); It != BB.end(); ++It) {
+      MCInst &Inst = *It;
+      if ((BC.MIB->isPSign(Inst) || BC.MIB->isPAuth(Inst)) &&
+          !BC.MIB->hasNegateRAState(Inst)) {
+        // no .cfi_negate_ra_state attached to signing or authenticating instr
+        // means, that this is a function with handwritten assembly, which might
+        // not respect Clang's conventions (e.g. tailcalls are always
+        // authenticated, so functions always start with unsigned RAState when
+        // working with compiler-generated code)
+        BF.setIgnored();
+        BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+                  << BF.getPrintName() << "\n";
+        return;
+      }
+    }
+  }
+
+  bool RAState = false;
+  std::stack<bool> RAStateStack;
+
+  for (BinaryBasicBlock &BB : BF) {
+    for (auto It = BB.begin(); It != BB.end(); ++It) {
+
+      MCInst &Inst = *It;
+      if (BC.MIB->isCFI(Inst))
+        continue;
+
+      if (BC.MIB->isPSign(Inst)) {
+        if (RAState) {
+          // RA signing instructions should only follow unsigned RA state.
+          BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+                    << BF.getPrintName() << "\n";
+          BF.setIgnored();
+          return;
+        }
+        BC.MIB->setRASigning(Inst);
+      } else if (BC.MIB->isPAuth(Inst)) {
+        if (!RAState) {
+          // RA authenticating instructions should only follow signed RA state.
+          BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+                    << BF.getPrintName() << "\n";
+          BF.setIgnored();
+          return;
+        }
+        BC.MIB->setAuthenticating(Inst);
+      } else if (RAState) {
+        BC.MIB->setRASigned(Inst);
+      } else {
+        BC.MIB->setRAUnsigned(Inst);
+      }
+
+      // Updating RAState. All updates are valid from the next instruction.
+      // Because the same instruction can have remember and restore, the order
+      // here is relevant. This is the reason to loop over Annotations instead
+      // of just checking each in a predefined order.
+      for (unsigned int Idx = 0; Idx < Inst.getNumOperands(); Idx++) {
+        std::optional<int64_t> Annotation =
+            BC.MIB->getAnnotationAtOpIndex(Inst, Idx);
+        if (!Annotation)
+          continue;
+        if (Annotation == MCPlus::MCAnnotation::kNegateState)
+          RAState = !RAState;
+        else if (Annotation == MCPlus::MCAnnotation::kRememberState)
+          RAStateStack.push(RAState);
+        else if (Annotation == MCPlus::MCAnnotation::kRestoreState) {
+          RAState = RAStateStack.top();
+          RAStateStack.pop();
+        }
+      }
+    }
+  }
+}
+
+Error MarkRAStates::runOnFunctions(BinaryContext &BC) {
+  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+    runOnFunction(BF);
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
+      "MarkRAStates");
+
+  return Error::success();
+}
+
+} // end namespace bolt
+} // end namespace llvm
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 996d2e972599d..8b412854dc701 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -19,11 +19,13 @@
 #include "bolt/Passes/IdenticalCodeFolding.h"
 #include "bolt/Passes/IndirectCallPromotion.h"
 #include "bolt/Passes/Inliner.h"
+#include "bolt/Passes/InsertNegateRAStatePass.h"
 #include "bolt/Passes/Instrumentation.h"
 #include "bolt/Passes/JTFootprintReduction.h"
 #include "bolt/Passes/LongJmp.h"
 #include "bolt/Passes/LoopInversionPass.h"
 #include "bolt/Passes/MCF.h"
+#include "bolt/Passes/MarkRAStates.h"
 #include "bolt/Passes/PLTCall.h"
 #include "bolt/Passes/PatchEntries.h"
 #include "bolt/Passes/ProfileQualityStats.h"
@@ -350,6 +352,9 @@ Error BinaryFunctionPassManager::runPasses() {
 Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   BinaryFunctionPassManager Manager(BC);
 
+  if (BC.isAArch64())
+    Manager.registerPass(std::make_unique<MarkRAStates>());
+
   Manager.registerPass(
       std::make_unique<EstimateEdgeCounts>(PrintEstimateEdgeCounts));
 
@@ -509,6 +514,8 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
     // targets. No extra instructions after this pass, otherwise we may have
     // relocations out of range and crash during linking.
     Manager.registerPass(std::make_unique<LongJmpPass>(PrintLongJmp));
+
+    Manager.registerPass(std::make_unique<InsertNegateRAState>());
   }
 
   // This pass should always run last.*

>From 2f5a9240ffd2eeb462c1a589fb8d07aca5105170 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 7 Apr 2025 18:19:20 +0200
Subject: [PATCH 03/25] [BOLT][AArch64] Fix which PSign and PAuth variants are
 used (#120064)

- only the ones operating on LR should be marked
    with .cfi_cfi_negate_ra_state
- added support for fused PtrAuth and Ret instructions,
    e.g. RETAA.
---
 bolt/include/bolt/Core/MCPlusBuilder.h        | 22 ++++++++++-----
 .../bolt/Passes/InsertNegateRAStatePass.h     |  4 ++-
 bolt/lib/Passes/InsertNegateRAStatePass.cpp   | 13 ++++++---
 bolt/lib/Passes/MarkRAStates.cpp              |  7 +++--
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 28 +++++++++++++++----
 5 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 05ae63cc63cb9..6d41eaf45b03e 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -610,6 +610,21 @@ class MCPlusBuilder {
     return std::nullopt;
   }
 
+  virtual bool isPSignOnLR(const MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
+  virtual bool isPAuthOnLR(const MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
+  virtual bool isPAuthAndRet(const MCInst &Inst) const {
+    llvm_unreachable("not implemented");
+    return false;
+  }
+
   /// Returns the register used as a return address. Returns std::nullopt if
   /// not applicable, such as reading the return address from a system register
   /// or from the stack.
@@ -829,13 +844,6 @@ class MCPlusBuilder {
     llvm_unreachable("not implemented");
     return false;
   }
-  virtual bool isPAuth(MCInst &Inst) const {
-    llvm_unreachable("not implemented");
-  }
-
-  virtual bool isPSign(MCInst &Inst) const {
-    llvm_unreachable("not implemented");
-  }
 
   virtual bool isCleanRegXOR(const MCInst &Inst) const {
     llvm_unreachable("not implemented");
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index e62006baa2eff..ce73b5a152d12 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -30,7 +30,9 @@ class InsertNegateRAState : public BinaryFunctionPass {
 
 private:
   /// Loops over all instructions and adds OpNegateRAState CFI
-  /// after any pointer signing or authenticating instructions.
+  /// after any pointer signing or authenticating instructions,
+  /// which operate on the LR, except fused ptrauth + ret instructions
+  /// (such as RETAA).
   /// Returns true, if any OpNegateRAState CFIs were added.
   bool addNegateRAStateAfterPacOrAuth(BinaryFunction &BF);
   /// Because states are tracked as MCAnnotations on individual instructions,
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 098ebf8f953b4..05d4d58c7e4cf 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -46,14 +46,16 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   bool FirstIter = true;
   MCInst PrevInst;
   BinaryBasicBlock *PrevBB = nullptr;
+  // We need to iterate on BBs in the Layout order
+  // not in the order they are stored in the BF class.
   auto *Begin = BF.getLayout().block_begin();
   auto *End = BF.getLayout().block_end();
   for (auto *BB = Begin; BB != End; BB++) {
 
     // Support for function splitting:
-    // if two consecutive BBs are going to end up in different functions,
-    // we have to negate the RA State, so the new function starts with a Signed
-    // state.
+    // if two consecutive BBs with Signed state are going to end up in different
+    // functions, we have to add a OpNegateRAState to the beginning of the newly
+    // split function, so it starts with a Signed state.
     if (PrevBB != nullptr &&
         PrevBB->getFragmentNum() != (*BB)->getFragmentNum() &&
         BC.MIB->isRASigned(*((*BB)->begin()))) {
@@ -68,6 +70,8 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
         continue;
 
       if (!FirstIter) {
+        // Consecutive instructions with different RAState means we need to add
+        // a OpNegateRAState.
         if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) ||
             (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
 
@@ -90,7 +94,8 @@ bool InsertNegateRAState::addNegateRAStateAfterPacOrAuth(BinaryFunction &BF) {
   for (BinaryBasicBlock &BB : BF) {
     for (auto Iter = BB.begin(); Iter != BB.end(); ++Iter) {
       MCInst &Inst = *Iter;
-      if (BC.MIB->isPSign(Inst) || BC.MIB->isPAuth(Inst)) {
+      if (BC.MIB->isPSignOnLR(Inst) ||
+          (BC.MIB->isPAuthOnLR(Inst) && !BC.MIB->isPAuthAndRet(Inst))) {
         Iter = BF.addCFIInstruction(
             &BB, Iter + 1, MCCFIInstruction::createNegateRAState(nullptr));
         FoundAny = true;
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index adccf2090c36f..8121fffb93c9f 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -46,7 +46,8 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
   for (BinaryBasicBlock &BB : BF) {
     for (auto It = BB.begin(); It != BB.end(); ++It) {
       MCInst &Inst = *It;
-      if ((BC.MIB->isPSign(Inst) || BC.MIB->isPAuth(Inst)) &&
+      if ((BC.MIB->isPSignOnLR(Inst) ||
+           (BC.MIB->isPAuthOnLR(Inst) && !BC.MIB->isPAuthAndRet(Inst))) &&
           !BC.MIB->hasNegateRAState(Inst)) {
         // no .cfi_negate_ra_state attached to signing or authenticating instr
         // means, that this is a function with handwritten assembly, which might
@@ -71,7 +72,7 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
       if (BC.MIB->isCFI(Inst))
         continue;
 
-      if (BC.MIB->isPSign(Inst)) {
+      if (BC.MIB->isPSignOnLR(Inst)) {
         if (RAState) {
           // RA signing instructions should only follow unsigned RA state.
           BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
@@ -80,7 +81,7 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
           return;
         }
         BC.MIB->setRASigning(Inst);
-      } else if (BC.MIB->isPAuth(Inst)) {
+      } else if (BC.MIB->isPAuthOnLR(Inst)) {
         if (!RAState) {
           // RA authenticating instructions should only follow signed RA state.
           BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 2266733f956ee..e0061bd12857c 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -231,6 +231,28 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     }
   }
 
+  bool isPSignOnLR(const MCInst &Inst) const override {
+    std::optional<MCPhysReg> SignReg = getSignedReg(Inst);
+    return SignReg && *SignReg == AArch64::LR;
+  }
+
+  bool isPAuthOnLR(const MCInst &Inst) const override {
+    // LDR(A|B) should not be covered.
+    bool IsChecked;
+    std::optional<MCPhysReg> AuthReg =
+        getWrittenAuthenticatedReg(Inst, IsChecked);
+    return !IsChecked && AuthReg && *AuthReg == AArch64::LR;
+  }
+
+  bool isPAuthAndRet(const MCInst &Inst) const override {
+    return Inst.getOpcode() == AArch64::RETAA ||
+           Inst.getOpcode() == AArch64::RETAB ||
+           Inst.getOpcode() == AArch64::RETAASPPCi ||
+           Inst.getOpcode() == AArch64::RETABSPPCi ||
+           Inst.getOpcode() == AArch64::RETAASPPCr ||
+           Inst.getOpcode() == AArch64::RETABSPPCr;
+  }
+
   std::optional<MCPhysReg> getSignedReg(const MCInst &Inst) const override {
     switch (Inst.getOpcode()) {
     case AArch64::PACIA:
@@ -894,12 +916,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     }
     return false;
   }
-  bool isPAuth(MCInst &Inst) const override {
-    return Inst.getOpcode() == AArch64::AUTIASP;
-  }
-  bool isPSign(MCInst &Inst) const override {
-    return Inst.getOpcode() == AArch64::PACIASP;
-  }
 
   bool isRegToRegMove(const MCInst &Inst, MCPhysReg &From,
                       MCPhysReg &To) const override {

>From 19ea611bdb90c22bd0451c96c7e60e4a55b4d9ff Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Fri, 4 Apr 2025 11:23:22 +0200
Subject: [PATCH 04/25] [BOLT] Add unit tests for negate_ra_state cfi handling

- also add match_dwarf.py, a tool used by the unit tests.
---
 bolt/lib/Core/BinaryFunction.cpp              |  24 +--
 bolt/test/AArch64/negate-ra-state-incorrect.s |  44 ++++++
 bolt/test/AArch64/negate-ra-state.s           |  42 ++++++
 bolt/test/lit.cfg.py                          |   7 +
 bolt/test/match_dwarf.py                      | 137 ++++++++++++++++++
 5 files changed, 233 insertions(+), 21 deletions(-)
 create mode 100644 bolt/test/AArch64/negate-ra-state-incorrect.s
 create mode 100644 bolt/test/AArch64/negate-ra-state.s
 create mode 100755 bolt/test/match_dwarf.py

diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 7a044104f3b4e..c4811888a58f9 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -2782,14 +2782,8 @@ struct CFISnapshot {
     case MCCFIInstruction::OpLLVMDefAspaceCfa:
     case MCCFIInstruction::OpLabel:
     case MCCFIInstruction::OpValOffset:
-      llvm_unreachable("unsupported CFI opcode");
-      break;
     case MCCFIInstruction::OpNegateRAState:
-      if (!(opts::BinaryAnalysisMode || opts::HeatmapMode)) {
-        llvm_unreachable("BOLT-ERROR: binaries using pac-ret hardening (e.g. "
-                         "as produced by '-mbranch-protection=pac-ret') are "
-                         "currently not supported by BOLT.");
-      }
+      llvm_unreachable("unsupported CFI opcode");
       break;
     case MCCFIInstruction::OpRememberState:
     case MCCFIInstruction::OpRestoreState:
@@ -2929,15 +2923,9 @@ struct CFISnapshotDiff : public CFISnapshot {
     case MCCFIInstruction::OpLLVMDefAspaceCfa:
     case MCCFIInstruction::OpLabel:
     case MCCFIInstruction::OpValOffset:
+    case MCCFIInstruction::OpNegateRAState:
       llvm_unreachable("unsupported CFI opcode");
       return false;
-    case MCCFIInstruction::OpNegateRAState:
-      if (!(opts::BinaryAnalysisMode || opts::HeatmapMode)) {
-        llvm_unreachable("BOLT-ERROR: binaries using pac-ret hardening (e.g. "
-                         "as produced by '-mbranch-protection=pac-ret') are "
-                         "currently not supported by BOLT.");
-      }
-      break;
     case MCCFIInstruction::OpRememberState:
     case MCCFIInstruction::OpRestoreState:
     case MCCFIInstruction::OpGnuArgsSize:
@@ -3086,14 +3074,8 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
     case MCCFIInstruction::OpLLVMDefAspaceCfa:
     case MCCFIInstruction::OpLabel:
     case MCCFIInstruction::OpValOffset:
-      llvm_unreachable("unsupported CFI opcode");
-      break;
     case MCCFIInstruction::OpNegateRAState:
-      if (!(opts::BinaryAnalysisMode || opts::HeatmapMode)) {
-        llvm_unreachable("BOLT-ERROR: binaries using pac-ret hardening (e.g. "
-                         "as produced by '-mbranch-protection=pac-ret') are "
-                         "currently not supported by BOLT.");
-      }
+      llvm_unreachable("unsupported CFI opcode");
       break;
     case MCCFIInstruction::OpGnuArgsSize:
       // do not affect CFI state
diff --git a/bolt/test/AArch64/negate-ra-state-incorrect.s b/bolt/test/AArch64/negate-ra-state-incorrect.s
new file mode 100644
index 0000000000000..c6b8b36939f4d
--- /dev/null
+++ b/bolt/test/AArch64/negate-ra-state-incorrect.s
@@ -0,0 +1,44 @@
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.exe.bolt | FileCheck %s
+
+# check that the output is listing foo as incorrect
+# CHECK: BOLT-INFO: inconsistent RAStates in function foo
+
+# check that foo got Ignored, so it's not in the new .text section
+# RUN: llvm-objdump %t.exe.bolt -d -j .text > %t.exe.dump
+# RUN: not grep "<foo>:" %t.exe.dump
+
+
+# How is this test incorrect?
+# There is an extra .cfi_negate_ra_state in foo.
+# Because of this, we will get to the autiasp (hint #29)
+# in a (seemingly) unsigned state. That is incorrect.
+  .text
+  .globl  foo
+  .p2align        2
+  .type   foo, at function
+foo:
+  .cfi_startproc
+  hint    #25
+  .cfi_negate_ra_state
+  sub     sp, sp, #16
+  stp     x29, x30, [sp, #16]             // 16-byte Folded Spill
+  .cfi_def_cfa_offset 16
+  str     w0, [sp, #12]
+  ldr     w8, [sp, #12]
+  .cfi_negate_ra_state
+  add     w0, w8, #1
+  ldp     x29, x30, [sp, #16]             // 16-byte Folded Reload
+  add     sp, sp, #16
+  hint    #29
+  .cfi_negate_ra_state
+  ret
+.Lfunc_end1:
+  .size   foo, .Lfunc_end1-foo
+  .cfi_endproc
+
+  .global _start
+  .type _start, %function
+_start:
+  b foo
diff --git a/bolt/test/AArch64/negate-ra-state.s b/bolt/test/AArch64/negate-ra-state.s
new file mode 100644
index 0000000000000..11c511a254c71
--- /dev/null
+++ b/bolt/test/AArch64/negate-ra-state.s
@@ -0,0 +1,42 @@
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
+
+# RUN: llvm-objdump %t.exe -d > %t.exe.dump
+# RUN: llvm-objdump --dwarf=frames %t.exe > %t.exe.dump-dwarf
+# RUN: match-dwarf %t.exe.dump %t.exe.dump-dwarf foo > %t.match-dwarf.txt
+
+# RUN: llvm-bolt %t.exe -o %t.exe.bolt
+
+# RUN: llvm-objdump %t.exe.bolt -d > %t.exe.bolt.dump
+# RUN: llvm-objdump --dwarf=frames %t.exe.bolt  > %t.exe.bolt.dump-dwarf
+# RUN: match-dwarf %t.exe.bolt.dump %t.exe.bolt.dump-dwarf foo > %t.bolt.match-dwarf.txt
+
+# RUN: diff %t.match-dwarf.txt %t.bolt.match-dwarf.txt
+
+	.text
+	.globl	foo
+	.p2align	2
+	.type	foo, at function
+foo:
+	.cfi_startproc
+	hint	#25
+	.cfi_negate_ra_state
+	sub	sp, sp, #16
+	stp	x29, x30, [sp, #16]             // 16-byte Folded Spill
+	.cfi_def_cfa_offset 16
+	str	w0, [sp, #12]
+	ldr	w8, [sp, #12]
+	add	w0, w8, #1
+	ldp	x29, x30, [sp, #16]             // 16-byte Folded Reload
+	add	sp, sp, #16
+	hint	#29
+	.cfi_negate_ra_state
+	ret
+.Lfunc_end1:
+	.size	foo, .Lfunc_end1-foo
+	.cfi_endproc
+
+	.global _start
+	.type _start, %function
+_start:
+	b foo
diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py
index 0d05229be2bf3..60617cd76f968 100644
--- a/bolt/test/lit.cfg.py
+++ b/bolt/test/lit.cfg.py
@@ -89,6 +89,7 @@
 config.substitutions.append(("%cxxflags", ""))
 
 link_fdata_cmd = os.path.join(config.test_source_root, "link_fdata.py")
+match_dwarf_cmd = os.path.join(config.test_source_root, "match_dwarf.py")
 
 tool_dirs = [config.llvm_tools_dir, config.test_source_root]
 
@@ -131,6 +132,12 @@
     ToolSubst("llvm-readobj", unresolved="fatal"),
     ToolSubst("llvm-dwp", unresolved="fatal"),
     ToolSubst("split-file", unresolved="fatal"),
+    ToolSubst(
+        "match-dwarf",
+        command=sys.executable,
+        unresolved="fatal",
+        extra_args=[match_dwarf_cmd],
+    ),
 ]
 llvm_config.add_tool_substitutions(tools, tool_dirs)
 
diff --git a/bolt/test/match_dwarf.py b/bolt/test/match_dwarf.py
new file mode 100755
index 0000000000000..3d3ab22042d5c
--- /dev/null
+++ b/bolt/test/match_dwarf.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+
+# This tool helps matching dwarf dumps
+# (= the output from running llvm-objdump --dwarf=frames),
+# by address to function names (which are parsed from a normal objdump).
+# The script is used for checking if .cfi_negate_ra_state CFIs
+# are generated by BOLT the same way they are generated by LLVM.
+# The script is called twice in unittests: once with the objdumps of
+# the BOLT input binary, and once with the output binary from BOLT.
+# We output the offsets of .cfi_negate_ra_state instructions from the
+# function's start address to see that BOLT can generate them to the same
+# locations.
+# Because we check the location, this is only useful for testing without
+# optimization flags, so `llvm-bolt input.exe -o output.exe`
+
+
+import argparse
+import subprocess
+import sys
+import re
+
+
+class NameDwarfPair(object):
+    def __init__(self, name, body):
+        self.name = name
+        self.body = body
+        self.finalized = False
+
+    def append(self, body_line):
+        # only store elements into the body until the first whitespace line is encountered.
+        if body_line.isspace():
+            self.finalized = True
+        if not self.finalized:
+            self.body += body_line
+
+    def print(self):
+        print(self.name)
+        print(self.body)
+
+    def parse_negate_offsets(self):
+        """
+        Create a list of locations/offsets of the negate_ra_state CFIs in the
+        dwarf entry. To find offsets for each, we match the DW_CFA_advance_loc
+        entries, and sum up their values.
+        """
+        negate_offsets = []
+        loc = 0
+        # TODO: make sure this is not printed in hex
+        re_advloc = r"DW_CFA_advance_loc: (\d+)"
+
+        for line in self.body.splitlines():
+            # if line matches advance_loc int
+            match = re.search(re_advloc, line)
+            if match:
+                loc += int(match.group(1))
+            if "DW_CFA_AARCH64_negate_ra_state" in line:
+                negate_offsets.append(loc)
+
+        self.negate_offsets = negate_offsets
+
+    def __eq__(self, other):
+        return self.name == other.name and self.negate_offsets == other.negate_offsets
+
+
+def extract_function_addresses(objdump):
+    """
+    Parse and return address-to-name dictionary from objdump file.
+    Function names in the objdump look like this:
+        000123abc <foo>:
+    We create a dict from the addr (000123abc), to the name (foo).
+    """
+    addr_name_dict = dict()
+    re_function = re.compile(r"^([0-9a-fA-F]+)\s<(.*)>:$")
+    with open(objdump, "r") as f:
+        for line in f.readlines():
+            match = re_function.match(line)
+            if not match:
+                continue
+            m_addr = match.groups()[0]
+            m_name = match.groups()[1]
+            addr_name_dict[int(m_addr, 16)] = m_name
+
+    return addr_name_dict
+
+
+def match_dwarf_to_name(dwarfdump, addr_name_dict):
+    """
+    Parse dwarf dump, and match names to blocks using the dict from the objdump.
+    Return a list of NameDwarfPairs.
+    The matched lines look like this:
+    000123 000456 000789 FDE cie=000000  pc=0123abc...0456def
+    We do not have the function name for this, only the PC range it applies to.
+    We match the pc=0123abc (the start address), and find the matching name from
+    the addr_name_dict.
+    The resultint NameDwarfPair will hold the lines this header applied to, and
+    instead of the header with the addresses, it will just have the function name.
+    """
+    re_address_line = re.compile(r".*pc=([0-9a-fA-F]+)\.\.\.([0-9a-fA-F]+)")
+    with open(dwarfdump, "r") as dw:
+        functions = []
+        for line in dw.readlines():
+            match = re_address_line.match(line)
+            if not match:
+                if len(functions) > 0:
+                    functions[-1].append(line)
+                continue
+            pc_start_address = match.groups()[0]
+            name = addr_name_dict.get(int(pc_start_address, 16))
+            functions.append(NameDwarfPair(name, ""))
+
+        return functions
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("objdump", help="Objdump file")
+    parser.add_argument(
+        "dwarfdump", help="dwarf dump file created with 'llvm-objdump --dwarf=frames'"
+    )
+    parser.add_argument("function", help="Function to search CFIs in.")
+
+    args = parser.parse_args()
+
+    addr_name_dict = extract_function_addresses(args.objdump)
+    functions = match_dwarf_to_name(args.dwarfdump, addr_name_dict)
+
+    for f in functions:
+        if f.name == args.function:
+            f.parse_negate_offsets()
+            print(f.negate_offsets)
+            break
+    else:
+        print(f"{args.function} not found")
+        exit(-1)
+
+
+main()

>From 9b53bc343a799e379a806287b764fc6197523d9e Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Fri, 23 May 2025 14:11:47 +0200
Subject: [PATCH 05/25] [BOLT] Basic exception unwinding test

---
 bolt/test/runtime/AArch64/negate-ra-state.cpp | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 bolt/test/runtime/AArch64/negate-ra-state.cpp

diff --git a/bolt/test/runtime/AArch64/negate-ra-state.cpp b/bolt/test/runtime/AArch64/negate-ra-state.cpp
new file mode 100644
index 0000000000000..60b0b08950b58
--- /dev/null
+++ b/bolt/test/runtime/AArch64/negate-ra-state.cpp
@@ -0,0 +1,26 @@
+// REQUIRES: system-linux,bolt-runtime
+
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret -Wl,-q %s -o %t.exe
+// RUN: llvm-bolt %t.exe -o %t.bolt.exe
+// RUN: %t.bolt.exe | FileCheck %s
+
+// CHECK: Exception caught: Exception from bar().
+
+#include <cstdio>
+#include <stdexcept>
+
+void bar() { throw std::runtime_error("Exception from bar()."); }
+
+void foo() {
+  try {
+    bar();
+  } catch (const std::exception &e) {
+    printf("Exception caught: %s\n", e.what());
+  }
+}
+
+int main() {
+  foo();
+  return 0;
+}

>From cf8684765c8d8b81f7a4553f2f12debede5f696f Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Tue, 29 Jul 2025 14:28:34 +0000
Subject: [PATCH 06/25] [BOLT] Add OpNegateRAState to printCFI

---
 bolt/lib/Core/BinaryContext.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 84f1853469709..c8f623c2374bc 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1865,6 +1865,9 @@ void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
   case MCCFIInstruction::OpGnuArgsSize:
     OS << "OpGnuArgsSize";
     break;
+  case MCCFIInstruction::OpNegateRAState:
+    OS << "OpNegateRAState";
+    break;
   default:
     OS << "Op#" << Operation;
     break;

>From f605baf0fd52184a57ebf41baefcfa7991b95ff8 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Tue, 29 Jul 2025 14:28:57 +0000
Subject: [PATCH 07/25] [BOLT] Improve function splitting at OpNegateRAState
 handling

- Previous version used the Layout API which should not be used.
- This version iterates on Fragments to have the same effect.
- NegateRAState is added to the first *non-empty* BB of the cold fragment.
---
 bolt/lib/Passes/InsertNegateRAStatePass.cpp | 44 +++++++++++----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 05d4d58c7e4cf..0eb0e3aef00d4 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -43,27 +43,32 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
 
   fixUnknownStates(BF);
 
-  bool FirstIter = true;
-  MCInst PrevInst;
-  BinaryBasicBlock *PrevBB = nullptr;
-  // We need to iterate on BBs in the Layout order
-  // not in the order they are stored in the BF class.
-  auto *Begin = BF.getLayout().block_begin();
-  auto *End = BF.getLayout().block_end();
-  for (auto *BB = Begin; BB != End; BB++) {
-
-    // Support for function splitting:
-    // if two consecutive BBs with Signed state are going to end up in different
-    // functions, we have to add a OpNegateRAState to the beginning of the newly
-    // split function, so it starts with a Signed state.
-    if (PrevBB != nullptr &&
-        PrevBB->getFragmentNum() != (*BB)->getFragmentNum() &&
-        BC.MIB->isRASigned(*((*BB)->begin()))) {
-      BF.addCFIInstruction(*BB, (*BB)->begin(),
+  // Support for function splitting:
+  // if two consecutive BBs with Signed state are going to end up in different
+  // functions (so are held by different FunctionFragments), we have to add a
+  // OpNegateRAState to the beginning of the newly split function, so it starts
+  // with a Signed state.
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    // Find the first BB in the FF which has Instructions.
+    // BOLT can generate empty BBs at function splitting which are only used as
+    // target labels. We should add the negate-ra-state CFI to the first
+    // non-empty BB.
+    auto FirstNonEmpty =
+        std::find_if(FF.begin(), FF.end(), [](BinaryBasicBlock *BB) {
+          // getFirstNonPseudo returns BB.end() if it does not find any
+          // Instructions.
+          return BB->getFirstNonPseudo() != BB->end();
+        });
+    if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
+      BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
                            MCCFIInstruction::createNegateRAState(nullptr));
     }
+  }
 
-    for (auto It = (*BB)->begin(); It != (*BB)->end(); ++It) {
+  bool FirstIter = true;
+  MCInst PrevInst;
+  for (BinaryBasicBlock &BB : BF) {
+    for (auto It = BB.begin(); It != BB.end(); ++It) {
 
       MCInst &Inst = *It;
       if (BC.MIB->isCFI(Inst))
@@ -76,7 +81,7 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
             (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
 
           It = BF.addCFIInstruction(
-              *BB, It, MCCFIInstruction::createNegateRAState(nullptr));
+              &BB, It, MCCFIInstruction::createNegateRAState(nullptr));
         }
 
       } else {
@@ -84,7 +89,6 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
       }
       PrevInst = *It;
     }
-    PrevBB = *BB;
   }
 }
 

>From 12a04c734479b9eefa427ead5e6ec8ff2c892578 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Thu, 31 Jul 2025 08:42:06 +0000
Subject: [PATCH 08/25] [BOLT] Bugfix: CFIs can be placed before the first
 Instruction

- This caused a crash when trying to Annotate RAState-changing CFIs
    (RememberState, RestoreState, NegateRAState).
- The fix introduces an InitialRAState for each BinaryFunction.
- If we have a NegateRAState before the first Instr, we set that to
  True.
- In MarkRAStates, we push the InitialRAState to the RAStateStack: as we
  may have omitted the RememberState at the function start, its RestoreState
  pair would try to pop an empty stack otherwise.
---
 bolt/include/bolt/Core/BinaryFunction.h |  5 +++++
 bolt/lib/Core/Exceptions.cpp            | 21 ++++++++++++++++-----
 bolt/lib/Passes/MarkRAStates.cpp        |  3 ++-
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index ebe12929c0973..c797f9e2ac71a 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -147,6 +147,9 @@ class BinaryFunction {
     PF_MEMEVENT = 4, /// Profile has mem events.
   };
 
+  void setInitialRAState(bool State) { InitialRAState = State; }
+  bool getInitialRAState() { return InitialRAState; }
+
   /// Struct for tracking exception handling ranges.
   struct CallSite {
     const MCSymbol *Start;
@@ -220,6 +223,8 @@ class BinaryFunction {
   /// Current state of the function.
   State CurrentState{State::Empty};
 
+  bool InitialRAState{false};
+
   /// A list of symbols associated with the function entry point.
   ///
   /// Multiple symbols would typically result from identical code-folding
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 63b7ad43b1dec..954c21e14ba17 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -569,20 +569,24 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
       Function.addCFIInstruction(
           Offset, MCCFIInstruction::createRememberState(nullptr));
 
-      if (Function.getBinaryContext().isAArch64())
+      if (Function.getBinaryContext().isAArch64()) {
         // Support for pointer authentication:
         // We need to annotate instructions that modify the RA State, to work
         // out the state of each instruction in MarkRAStates Pass.
-        Function.setInstModifiesRAState(DW_CFA_remember_state, Offset);
+        if (Offset != 0)
+          Function.setInstModifiesRAState(DW_CFA_remember_state, Offset);
+      }
       break;
     case DW_CFA_restore_state:
       Function.addCFIInstruction(Offset,
                                  MCCFIInstruction::createRestoreState(nullptr));
-      if (Function.getBinaryContext().isAArch64())
+      if (Function.getBinaryContext().isAArch64()) {
         // Support for pointer authentication:
         // We need to annotate instructions that modify the RA State, to work
         // out the state of each instruction in MarkRAStates Pass.
-        Function.setInstModifiesRAState(DW_CFA_restore_state, Offset);
+        if (Offset != 0)
+          Function.setInstModifiesRAState(DW_CFA_restore_state, Offset);
+      }
       break;
     case DW_CFA_def_cfa:
       Function.addCFIInstruction(
@@ -649,7 +653,14 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
         // is added to the instruction, to mark that the instruction modifies
         // the RA State. The actual state for instructions are worked out in
         // MarkRAStates based on these annotations.
-        Function.setInstModifiesRAState(DW_CFA_AARCH64_negate_ra_state, Offset);
+        if (Offset != 0)
+          Function.setInstModifiesRAState(DW_CFA_AARCH64_negate_ra_state,
+                                          Offset);
+        else
+          // We cannot Annotate an instruction at Offset == 0.
+          // Instead, we save the initial (Signed) state, and push it to
+          // MarkRAStates' RAStateStack.
+          Function.setInitialRAState(true);
         break;
       }
       if (opts::Verbosity >= 1)
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index 8121fffb93c9f..d7db5532002bf 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -62,8 +62,9 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
     }
   }
 
-  bool RAState = false;
+  bool RAState = BF.getInitialRAState();
   std::stack<bool> RAStateStack;
+  RAStateStack.push(RAState);
 
   for (BinaryBasicBlock &BB : BF) {
     for (auto It = BB.begin(); It != BB.end(); ++It) {

>From 8663454568b1475fe3d1ebbc221d03ace461de67 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Fri, 1 Aug 2025 08:21:17 +0000
Subject: [PATCH 09/25] [BOLT] Add function-splitting test

---
 .../runtime/AArch64/pacret-function-split.cpp | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 bolt/test/runtime/AArch64/pacret-function-split.cpp

diff --git a/bolt/test/runtime/AArch64/pacret-function-split.cpp b/bolt/test/runtime/AArch64/pacret-function-split.cpp
new file mode 100644
index 0000000000000..208fc5c115571
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-function-split.cpp
@@ -0,0 +1,42 @@
+/* This test check that the negate-ra-state CFIs are properly emitted in case of
+   function splitting. The test checks two things:
+    - we split at the correct location: to test the feature,
+        we need to split *before* the bl __cxa_throw at PLT call is made,
+        so the unwinder has to unwind from the split (cold) part.
+
+    - the BOLTed binary runs, and returns the string from foo.
+
+# REQUIRES: system-linux,bolt-runtime
+
+# FDATA: 1 main #split# 1 _Z3foov 0 0 1
+
+# RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+# RUN: -mbranch-protection=pac-ret %s -o %t.exe -Wl,-q
+# RUN: link_fdata %s %t.exe %t.fdata
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-eh \
+# RUN: --split-strategy=profile2 --split-all-cold --print-split \
+# RUN: --print-only=_Z3foov --data=%t.fdata 2>&1 | FileCheck \
+# RUN: --check-prefix=BOLT-CHECK %s
+# RUN: %t.bolt | FileCheck %s  --check-prefix=RUN-CHECK
+
+# BOLT-CHECK-NOT: bl      __cxa_throw at PLT
+# BOLT-CHECK: -------   HOT-COLD SPLIT POINT   -------
+# BOLT-CHECK: bl      __cxa_throw at PLT
+
+# RUN-CHECK: Exception caught: Exception from foo().
+*/
+
+#include <cstdio>
+#include <stdexcept>
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+  try {
+    __asm__ __volatile__("split:");
+    foo();
+  } catch (const std::exception &e) {
+    printf("Exception caught: %s\n", e.what());
+  }
+  return 0;
+}

>From 3c05227fc6c68962e39269a2a5a0364c4e23c40f Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 4 Aug 2025 13:48:49 +0000
Subject: [PATCH 10/25] [BOLT] Improve warnings in MarkRAStates

- Previously, the same warning was printed in
  3 different cases. Changed warnings to be more specific.
---
 bolt/lib/Passes/MarkRAStates.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index d7db5532002bf..1b56cd83a2314 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -57,6 +57,8 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
         BF.setIgnored();
         BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
                   << BF.getPrintName() << "\n";
+        BC.outs()
+            << "BOLT-INFO: ptr sign/auth inst without .cfi_negate_ra_state\n";
         return;
       }
     }
@@ -78,6 +80,8 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
           // RA signing instructions should only follow unsigned RA state.
           BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
                     << BF.getPrintName() << "\n";
+          BC.outs() << "BOLT-INFO: ptr signing inst encountered in Signed RA "
+                       "state.\n";
           BF.setIgnored();
           return;
         }
@@ -87,6 +91,8 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
           // RA authenticating instructions should only follow signed RA state.
           BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
                     << BF.getPrintName() << "\n";
+          BC.outs() << "BOLT-INFO: ptr authenticating inst encountered in "
+                       "Unsigned RA state.\n";
           BF.setIgnored();
           return;
         }

>From 5625e809a3b1ac896cf3fd0b2de3ad98b9b6cc33 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Tue, 5 Aug 2025 13:33:44 +0000
Subject: [PATCH 11/25] [BOLT] Improve negate-ra-state-incorrect test

- check all 3 possibilities that MarkRAStates warns about,
- simplify the asm functions in the test,
- use FileCheck instead of grep on the disassembly.
---
 bolt/test/AArch64/negate-ra-state-incorrect.s | 79 ++++++++++++++-----
 1 file changed, 59 insertions(+), 20 deletions(-)

diff --git a/bolt/test/AArch64/negate-ra-state-incorrect.s b/bolt/test/AArch64/negate-ra-state-incorrect.s
index c6b8b36939f4d..ad922d83c6d61 100644
--- a/bolt/test/AArch64/negate-ra-state-incorrect.s
+++ b/bolt/test/AArch64/negate-ra-state-incorrect.s
@@ -1,19 +1,31 @@
+# This test checks that MarkRAStates pass ignores functions with
+# malformed .cfi_negate_ra_state sequences in the input binary.
+
+# The cases checked are:
+#   - extra .cfi_negate_ra_state in Signed state: checked in foo,
+#   - extra .cfi_negate_ra_state in Unsigned state: checked in bar,
+#   - missing .cfi_negate_ra_state from PSign or PAuth instructions: checked in baz.
+
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
-# RUN: llvm-bolt %t.exe -o %t.exe.bolt | FileCheck %s
+# RUN: llvm-bolt %t.exe -o %t.exe.bolt --no-threads | FileCheck %s --check-prefix=CHECK-BOLT
+
+# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function foo
+# CHECK-BOLT-NEXT: BOLT-INFO: ptr authenticating inst encountered in Unsigned RA state.
 
-# check that the output is listing foo as incorrect
-# CHECK: BOLT-INFO: inconsistent RAStates in function foo
+# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function bar
+# CHECK-BOLT-NEXT: BOLT-INFO: ptr signing inst encountered in Signed RA state
 
-# check that foo got Ignored, so it's not in the new .text section
-# RUN: llvm-objdump %t.exe.bolt -d -j .text > %t.exe.dump
-# RUN: not grep "<foo>:" %t.exe.dump
+# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function baz
+# CHECK-BOLT-NEXT: BOLT-INFO: ptr sign/auth inst without .cfi_negate_ra_state
+
+# Check that the incorrect functions got ignored, so they are not in the new .text section
+# RUN: llvm-objdump %t.exe.bolt -d -j .text | FileCheck %s --check-prefix=CHECK-OBJDUMP
+# CHECK-OBJDUMP-NOT: <foo>:
+# CHECK-OBJDUMP-NOT: <bar>:
+# CHECK-OBJDUMP-NOT: <baz>:
 
 
-# How is this test incorrect?
-# There is an extra .cfi_negate_ra_state in foo.
-# Because of this, we will get to the autiasp (hint #29)
-# in a (seemingly) unsigned state. That is incorrect.
   .text
   .globl  foo
   .p2align        2
@@ -22,23 +34,50 @@ foo:
   .cfi_startproc
   hint    #25
   .cfi_negate_ra_state
-  sub     sp, sp, #16
-  stp     x29, x30, [sp, #16]             // 16-byte Folded Spill
-  .cfi_def_cfa_offset 16
-  str     w0, [sp, #12]
-  ldr     w8, [sp, #12]
+  mov x1, #0
+  .cfi_negate_ra_state        // Incorrect CFI in signed state
+  hint    #29
+  .cfi_negate_ra_state
+  ret
+  .cfi_endproc
+  .size   foo, .-foo
+
+  .text
+  .globl  bar
+  .p2align        2
+  .type   bar, at function
+bar:
+  .cfi_startproc
+  mov x1, #0
+  .cfi_negate_ra_state      // Incorrect CFI in unsigned state
+  hint    #25
   .cfi_negate_ra_state
-  add     w0, w8, #1
-  ldp     x29, x30, [sp, #16]             // 16-byte Folded Reload
-  add     sp, sp, #16
+  mov x1, #0
   hint    #29
   .cfi_negate_ra_state
   ret
-.Lfunc_end1:
-  .size   foo, .Lfunc_end1-foo
   .cfi_endproc
+  .size   bar, .-bar
+
+  .text
+  .globl  baz
+  .p2align        2
+  .type   baz, at function
+baz:
+  .cfi_startproc
+  mov x1, #0
+  hint    #25
+  .cfi_negate_ra_state
+  mov x1, #0
+  hint    #29
+                            // Missing .cfi_negate_ra_state
+  ret
+  .cfi_endproc
+  .size   baz, .-baz
 
   .global _start
   .type _start, %function
 _start:
   b foo
+  b bar
+  b baz

>From dd2395ca70dda9879b30fe99636f77079377c17f Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Wed, 6 Aug 2025 13:57:29 +0000
Subject: [PATCH 12/25] [BOLT] Do run negate-ra-state rewriting passes on all
 functions

- if a BinaryFunction did not contain any .cfi_negate_ra_state,
  we can skip MarkRAStates and InsertNegateRAStatePass. This can happen
  with handwritten assembly, or if the binary was compiled in a way that
  drops DWARF tables.
---
 bolt/include/bolt/Core/BinaryFunction.h     | 6 ++++++
 bolt/lib/Core/Exceptions.cpp                | 1 +
 bolt/lib/Passes/InsertNegateRAStatePass.cpp | 8 +++++++-
 bolt/lib/Passes/MarkRAStates.cpp            | 8 +++++++-
 4 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index c797f9e2ac71a..5335ec5a9a7bd 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -147,6 +147,8 @@ class BinaryFunction {
     PF_MEMEVENT = 4, /// Profile has mem events.
   };
 
+  void setContainedNegateRAState() { HadNegateRAState = true; }
+  bool containedNegateRAState() { return HadNegateRAState; }
   void setInitialRAState(bool State) { InitialRAState = State; }
   bool getInitialRAState() { return InitialRAState; }
 
@@ -223,6 +225,10 @@ class BinaryFunction {
   /// Current state of the function.
   State CurrentState{State::Empty};
 
+  /// Indicates if the Function contained .cfi-negate-ra-state. These are not
+  /// read from the binary. This boolean is used when deciding to run the
+  /// .cfi-negate-ra-state rewriting passes on a function or not.
+  bool HadNegateRAState{false};
   bool InitialRAState{false};
 
   /// A list of symbols associated with the function entry point.
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 954c21e14ba17..08bd1e694d901 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -647,6 +647,7 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
       // DW_CFA_GNU_window_save and DW_CFA_AARCH64_negate_ra_state just use the
       // same id but mean different things. The latter is used in AArch64.
       if (Function.getBinaryContext().isAArch64()) {
+        Function.setContainedNegateRAState();
         // The location OpNegateRAState CFIs are needed
         // depends on the order of BasicBlocks, which changes during
         // optimizations. Instead of adding OpNegateRAState CFIs, an annotation
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 0eb0e3aef00d4..80857204ff53c 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -137,7 +137,13 @@ void InsertNegateRAState::fixUnknownStates(BinaryFunction &BF) {
 
 Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
-    runOnFunction(BF);
+    if (BF.containedNegateRAState()) {
+      // We can skip functions which did not include negate-ra-state CFIs. This
+      // includes code using pac-ret hardening as well, if the binary is
+      // compiled with `-fno-exceptions -fno-unwind-tables
+      // -fno-asynchronous-unwind-tables`
+      runOnFunction(BF);
+    }
   };
 
   ParallelUtilities::runOnEachFunction(
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index 1b56cd83a2314..9e2ab049a67f5 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -127,7 +127,13 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
 
 Error MarkRAStates::runOnFunctions(BinaryContext &BC) {
   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
-    runOnFunction(BF);
+    if (BF.containedNegateRAState()) {
+      // We can skip functions which did not include negate-ra-state CFIs. This
+      // includes code using pac-ret hardening as well, if the binary is
+      // compiled with `-fno-exceptions -fno-unwind-tables
+      // -fno-asynchronous-unwind-tables`
+      runOnFunction(BF);
+    }
   };
 
   ParallelUtilities::runOnEachFunction(

>From 871de4c9ff53b2c5ea3b19db30b4dc9aa9fcd2de Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Wed, 6 Aug 2025 13:58:37 +0000
Subject: [PATCH 13/25] [BOLT] Introduce --disallow-pacret flag

---
 bolt/include/bolt/Utils/CommandLineOpts.h    |  1 +
 bolt/lib/Rewrite/BinaryPassManager.cpp       |  9 +++++++
 bolt/lib/Rewrite/RewriteInstance.cpp         | 11 +++++++++
 bolt/test/AArch64/negate-ra-state-disallow.s | 25 ++++++++++++++++++++
 4 files changed, 46 insertions(+)
 create mode 100644 bolt/test/AArch64/negate-ra-state-disallow.s

diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h
index a75b6bf720ec4..cad49adf38573 100644
--- a/bolt/include/bolt/Utils/CommandLineOpts.h
+++ b/bolt/include/bolt/Utils/CommandLineOpts.h
@@ -72,6 +72,7 @@ extern llvm::cl::opt<std::string> OutputFilename;
 extern llvm::cl::opt<std::string> PerfData;
 extern llvm::cl::opt<bool> PrintCacheMetrics;
 extern llvm::cl::opt<bool> PrintSections;
+extern llvm::cl::opt<bool> DisallowPacret;
 
 // The format to use with -o in aggregation mode (perf2bolt)
 enum ProfileFormatKind { PF_Fdata, PF_YAML };
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 8b412854dc701..4f104ea3ee7c3 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -275,6 +275,15 @@ static cl::opt<bool> ShortenInstructions("shorten-instructions",
                                          cl::desc("shorten instructions"),
                                          cl::init(true),
                                          cl::cat(BoltOptCategory));
+
+// This flag is used to "gate" the negate-ra-state CFI handling.
+// Sometimes, binaries use pac-ret but not contain negate-ra-state CFIs. That
+// should cause no issues for BOLT.
+cl::opt<bool> DisallowPacret(
+    "disallow-pacret",
+    cl::desc("Disable processing binaries containing negate-ra-state DWARF "
+             "CFIs (e.g. binaries using pac-ret hardening)"),
+    cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index fe4a23cc01382..97cc87ee884c7 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -3491,6 +3491,17 @@ void RewriteInstance::disassembleFunctions() {
       }
     }
 
+    // Check if fillCFIInfoFor removed any OpNegateRAState CFIs from the
+    // function.
+    if (Function.containedNegateRAState()) {
+      if (opts::DisallowPacret) {
+        BC->errs() << "BOLT-ERROR: --disallow-pacret flag was used, but "
+                   << Function.getPrintName()
+                   << " contains .cfi-negate-ra-state.\n";
+        exit(1);
+      }
+    }
+
     // Parse LSDA.
     if (Function.getLSDAAddress() != 0 &&
         !BC->getFragmentsToSkip().count(&Function)) {
diff --git a/bolt/test/AArch64/negate-ra-state-disallow.s b/bolt/test/AArch64/negate-ra-state-disallow.s
new file mode 100644
index 0000000000000..97488be6eb940
--- /dev/null
+++ b/bolt/test/AArch64/negate-ra-state-disallow.s
@@ -0,0 +1,25 @@
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
+# RUN: not llvm-bolt %t.exe -o %t.exe.bolt --disallow-pacret 2>&1 | FileCheck %s
+
+# CHECK: BOLT-ERROR: --disallow-pacret flag was used, but foo contains .cfi-negate-ra-state.
+
+  .text
+  .globl  foo
+  .p2align        2
+  .type   foo, at function
+foo:
+  .cfi_startproc
+  hint    #25
+  .cfi_negate_ra_state
+  mov x1, #0
+  hint    #29
+  .cfi_negate_ra_state
+  ret
+  .cfi_endproc
+  .size   foo, .-foo
+
+  .global _start
+  .type _start, %function
+_start:
+  b foo

>From 8c05e2eb277be8f604da07c9eca092d0eab32a92 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Fri, 8 Aug 2025 12:32:56 +0000
Subject: [PATCH 14/25] [BOLT][NFC] Review nits

- remove unused imports
- fix clangd warnings
- rename functions
- improve comments
- reformat lib/Core/Exceptions.cpp
---
 bolt/include/bolt/Core/MCPlusBuilder.h             |  2 +-
 bolt/include/bolt/Passes/InsertNegateRAStatePass.h | 12 +++++++-----
 bolt/lib/Core/Exceptions.cpp                       | 12 ++++++------
 bolt/lib/Passes/InsertNegateRAStatePass.cpp        | 14 ++++++--------
 bolt/lib/Passes/MarkRAStates.cpp                   | 13 +++----------
 5 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 6d41eaf45b03e..efea2f4590ee7 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -77,7 +77,7 @@ class MCPlusBuilder {
     if (*FirstAnnotationOp > OpIndex || Inst.getNumOperands() < OpIndex)
       return std::nullopt;
 
-    auto Op = Inst.begin() + OpIndex;
+    const auto *Op = Inst.begin() + OpIndex;
     const int64_t ImmValue = Op->getImm();
     return extractAnnotationIndex(ImmValue);
   }
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index ce73b5a152d12..2394356401564 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -13,7 +13,6 @@
 #define BOLT_PASSES_INSERT_NEGATE_RA_STATE_PASS
 
 #include "bolt/Passes/BinaryPasses.h"
-#include <stack>
 
 namespace llvm {
 namespace bolt {
@@ -31,14 +30,17 @@ class InsertNegateRAState : public BinaryFunctionPass {
 private:
   /// Loops over all instructions and adds OpNegateRAState CFI
   /// after any pointer signing or authenticating instructions,
-  /// which operate on the LR, except fused ptrauth + ret instructions
-  /// (such as RETAA).
+  /// which operate on the LR, except fused pauth + ret instructions
+  /// (such as RETAA). Normal pauth and psign instructions are "special cases",
+  /// meaning they always need an OpNegateRAState CFI after them.
+  /// Fused pauth + ret instructions are not, they work as any other
+  /// instruction.
   /// Returns true, if any OpNegateRAState CFIs were added.
-  bool addNegateRAStateAfterPacOrAuth(BinaryFunction &BF);
+  bool addNegateRAStateAfterPSignOrPAuth(BinaryFunction &BF);
   /// Because states are tracked as MCAnnotations on individual instructions,
   /// newly inserted instructions do not have a state associated with them.
   /// New states are "inherited" from the last known state.
-  void fixUnknownStates(BinaryFunction &BF);
+  void inferUnknownStates(BinaryFunction &BF);
 };
 
 } // namespace bolt
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 08bd1e694d901..eedca368aaaa0 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -648,12 +648,12 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
       // same id but mean different things. The latter is used in AArch64.
       if (Function.getBinaryContext().isAArch64()) {
         Function.setContainedNegateRAState();
-        // The location OpNegateRAState CFIs are needed
-        // depends on the order of BasicBlocks, which changes during
-        // optimizations. Instead of adding OpNegateRAState CFIs, an annotation
-        // is added to the instruction, to mark that the instruction modifies
-        // the RA State. The actual state for instructions are worked out in
-        // MarkRAStates based on these annotations.
+        // The location OpNegateRAState CFIs are needed depends on the order of
+        // BasicBlocks, which changes during optimizations. Instead of adding
+        // OpNegateRAState CFIs, an annotation is added to the instruction, to
+        // mark that the instruction modifies the RA State. The actual state for
+        // instructions are worked out in MarkRAStates based on these
+        // annotations.
         if (Offset != 0)
           Function.setInstModifiesRAState(DW_CFA_AARCH64_negate_ra_state,
                                           Offset);
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 80857204ff53c..8393f2b05a16f 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -14,10 +14,7 @@
 #include "bolt/Passes/InsertNegateRAStatePass.h"
 #include "bolt/Core/BinaryFunction.h"
 #include "bolt/Core/ParallelUtilities.h"
-#include "bolt/Utils/CommandLineOpts.h"
 #include <cstdlib>
-#include <fstream>
-#include <iterator>
 
 using namespace llvm;
 
@@ -38,10 +35,10 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   }
 
   // If none is inserted, the function doesn't need more work.
-  if (!addNegateRAStateAfterPacOrAuth(BF))
+  if (!addNegateRAStateAfterPSignOrPAuth(BF))
     return;
 
-  fixUnknownStates(BF);
+  inferUnknownStates(BF);
 
   // Support for function splitting:
   // if two consecutive BBs with Signed state are going to end up in different
@@ -53,7 +50,7 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
     // BOLT can generate empty BBs at function splitting which are only used as
     // target labels. We should add the negate-ra-state CFI to the first
     // non-empty BB.
-    auto FirstNonEmpty =
+    auto *FirstNonEmpty =
         std::find_if(FF.begin(), FF.end(), [](BinaryBasicBlock *BB) {
           // getFirstNonPseudo returns BB.end() if it does not find any
           // Instructions.
@@ -92,7 +89,8 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   }
 }
 
-bool InsertNegateRAState::addNegateRAStateAfterPacOrAuth(BinaryFunction &BF) {
+bool InsertNegateRAState::addNegateRAStateAfterPSignOrPAuth(
+    BinaryFunction &BF) {
   BinaryContext &BC = BF.getBinaryContext();
   bool FoundAny = false;
   for (BinaryBasicBlock &BB : BF) {
@@ -109,7 +107,7 @@ bool InsertNegateRAState::addNegateRAStateAfterPacOrAuth(BinaryFunction &BF) {
   return FoundAny;
 }
 
-void InsertNegateRAState::fixUnknownStates(BinaryFunction &BF) {
+void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
   BinaryContext &BC = BF.getBinaryContext();
   bool FirstIter = true;
   MCInst PrevInst;
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index 9e2ab049a67f5..aacb14fb7dcc9 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -22,12 +22,7 @@
 #include "bolt/Passes/MarkRAStates.h"
 #include "bolt/Core/BinaryFunction.h"
 #include "bolt/Core/ParallelUtilities.h"
-#include "bolt/Utils/CommandLineOpts.h"
 #include <cstdlib>
-#include <fstream>
-#include <iterator>
-
-#include <iostream>
 #include <optional>
 #include <stack>
 
@@ -49,11 +44,9 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
       if ((BC.MIB->isPSignOnLR(Inst) ||
            (BC.MIB->isPAuthOnLR(Inst) && !BC.MIB->isPAuthAndRet(Inst))) &&
           !BC.MIB->hasNegateRAState(Inst)) {
-        // no .cfi_negate_ra_state attached to signing or authenticating instr
-        // means, that this is a function with handwritten assembly, which might
-        // not respect Clang's conventions (e.g. tailcalls are always
-        // authenticated, so functions always start with unsigned RAState when
-        // working with compiler-generated code)
+        // Not all functions have .cfi_negate_ra_state in them. But if one does,
+        // we expect psign/pauth instructions to have the hasNegateRAState
+        // annotation.
         BF.setIgnored();
         BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
                   << BF.getPrintName() << "\n";

>From ac7e16cc355d5e029687cea3a8644af1398506e8 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Fri, 8 Aug 2025 12:39:02 +0000
Subject: [PATCH 15/25] [BOLT] Refactor and improve InsertNegateRAStatePass

- separate function splitting code into its own function
- change the iteration looking for state-switches between consecutive
  instructions to only look for changes *inside* FunctionFragments,
  and skip borders between them.
---
 .../bolt/Passes/InsertNegateRAStatePass.h     |  7 ++
 bolt/lib/Passes/InsertNegateRAStatePass.cpp   | 91 ++++++++++---------
 2 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 2394356401564..0b1654c1380b9 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -41,6 +41,13 @@ class InsertNegateRAState : public BinaryFunctionPass {
   /// newly inserted instructions do not have a state associated with them.
   /// New states are "inherited" from the last known state.
   void inferUnknownStates(BinaryFunction &BF);
+
+  /// Support for function splitting:
+  /// if two consecutive BBs with Signed state are going to end up in different
+  /// functions (so are held by different FunctionFragments), we have to add a
+  /// OpNegateRAState to the beginning of the newly split function, so it starts
+  /// with a Signed state.
+  void coverFunctionFragmentStart(BinaryFunction &BF, FunctionFragment &FF);
 };
 
 } // namespace bolt
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 8393f2b05a16f..8ee2d28d0e379 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -34,57 +34,35 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
     return;
   }
 
-  // If none is inserted, the function doesn't need more work.
-  if (!addNegateRAStateAfterPSignOrPAuth(BF))
-    return;
+  // Attach .cfi_negate_ra_state to the "trivial" cases first.
+  addNegateRAStateAfterPSignOrPAuth(BF);
 
   inferUnknownStates(BF);
 
-  // Support for function splitting:
-  // if two consecutive BBs with Signed state are going to end up in different
-  // functions (so are held by different FunctionFragments), we have to add a
-  // OpNegateRAState to the beginning of the newly split function, so it starts
-  // with a Signed state.
   for (FunctionFragment &FF : BF.getLayout().fragments()) {
-    // Find the first BB in the FF which has Instructions.
-    // BOLT can generate empty BBs at function splitting which are only used as
-    // target labels. We should add the negate-ra-state CFI to the first
-    // non-empty BB.
-    auto *FirstNonEmpty =
-        std::find_if(FF.begin(), FF.end(), [](BinaryBasicBlock *BB) {
-          // getFirstNonPseudo returns BB.end() if it does not find any
-          // Instructions.
-          return BB->getFirstNonPseudo() != BB->end();
-        });
-    if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
-      BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
-                           MCCFIInstruction::createNegateRAState(nullptr));
-    }
-  }
-
-  bool FirstIter = true;
-  MCInst PrevInst;
-  for (BinaryBasicBlock &BB : BF) {
-    for (auto It = BB.begin(); It != BB.end(); ++It) {
-
-      MCInst &Inst = *It;
-      if (BC.MIB->isCFI(Inst))
-        continue;
-
-      if (!FirstIter) {
-        // Consecutive instructions with different RAState means we need to add
-        // a OpNegateRAState.
-        if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) ||
-            (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
-
-          It = BF.addCFIInstruction(
-              &BB, It, MCCFIInstruction::createNegateRAState(nullptr));
+    coverFunctionFragmentStart(BF, FF);
+    bool FirstIter = true;
+    MCInst PrevInst;
+    // As this pass runs after function splitting, we should only check
+    // consecutive instructions inside FunctionFragments.
+    for (BinaryBasicBlock *BB : FF) {
+      for (auto It = BB->begin(); It != BB->end(); ++It) {
+        MCInst &Inst = *It;
+        if (BC.MIB->isCFI(Inst))
+          continue;
+        if (!FirstIter) {
+          // Consecutive instructions with different RAState means we need to
+          // add a OpNegateRAState.
+          if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) ||
+              (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
+            It = BF.addCFIInstruction(
+                BB, It, MCCFIInstruction::createNegateRAState(nullptr));
+          }
+        } else {
+          FirstIter = false;
         }
-
-      } else {
-        FirstIter = false;
+        PrevInst = *It;
       }
-      PrevInst = *It;
     }
   }
 }
@@ -107,6 +85,29 @@ bool InsertNegateRAState::addNegateRAStateAfterPSignOrPAuth(
   return FoundAny;
 }
 
+void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
+                                                     FunctionFragment &FF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  if (FF.empty())
+    return;
+  // Find the first BB in the FF which has Instructions.
+  // BOLT can generate empty BBs at function splitting which are only used as
+  // target labels. We should add the negate-ra-state CFI to the first
+  // non-empty BB.
+  auto *FirstNonEmpty =
+      std::find_if(FF.begin(), FF.end(), [](BinaryBasicBlock *BB) {
+        // getFirstNonPseudo returns BB.end() if it does not find any
+        // Instructions.
+        return BB->getFirstNonPseudo() != BB->end();
+      });
+  // If a function is already split in the input, the first FF can also start
+  // with Signed state. This covers that scenario as well.
+  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
+    BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
+                         MCCFIInstruction::createNegateRAState(nullptr));
+  }
+}
+
 void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
   BinaryContext &BC = BF.getBinaryContext();
   bool FirstIter = true;

>From e60be8ee64e8d906d49193a9a024aba429f722b6 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 11 Aug 2025 11:20:45 +0000
Subject: [PATCH 16/25] [BOLT] Add function splitting lit test without runtime
 dependency

- Test uses lit and FileCheck to check if OpNegateRAState CFIs are
  generated in the correct location for function-splitting.
- Also fix a bug related to function-splitting.
---
 bolt/lib/Passes/InsertNegateRAStatePass.cpp |  3 +-
 bolt/test/AArch64/pacret-split-funcs.s      | 56 +++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 bolt/test/AArch64/pacret-split-funcs.s

diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 8ee2d28d0e379..60af3485ea4d1 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -102,7 +102,8 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
       });
   // If a function is already split in the input, the first FF can also start
   // with Signed state. This covers that scenario as well.
-  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
+  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin())) ||
+      BC.MIB->isAuthenticating(*((*FirstNonEmpty)->begin()))) {
     BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
                          MCCFIInstruction::createNegateRAState(nullptr));
   }
diff --git a/bolt/test/AArch64/pacret-split-funcs.s b/bolt/test/AArch64/pacret-split-funcs.s
new file mode 100644
index 0000000000000..6582257948c62
--- /dev/null
+++ b/bolt/test/AArch64/pacret-split-funcs.s
@@ -0,0 +1,56 @@
+# Checking that we generate an OpNegateRAState CFI after the split point,
+# when splitting a region with signed RA state.
+
+# REQUIRES: system-linux
+
+# RUN: %clang %cflags -o %t %s
+# RUN: %clang %s %cflags -Wl,-q -o %t
+# RUN: link_fdata --no-lbr %s %t %t.fdata
+# RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions \
+# RUN: --print-only foo --print-split --print-all 2>&1 | FileCheck %s
+
+# Checking that we don't see any OpNegateRAState CFIs before the insertion pass.
+# CHECK-NOT: OpNegateRAState
+# CHECK: Binary Function "foo" after insert-negate-ra-state-pass
+
+# CHECK:       paciasp
+# CHECK-NEXT:  OpNegateRAState
+
+# CHECK: -------   HOT-COLD SPLIT POINT   -------
+
+# CHECK:         OpNegateRAState
+# CHECK-NEXT:    autiasp
+# CHECK-NEXT:    OpNegateRAState
+# CHECK-NEXT:    ret
+
+# CHECK:         autiasp
+# CHECK-NEXT:    OpNegateRAState
+# CHECK-NEXT:    ret
+
+# End of the insert-negate-ra-state-pass logs
+# CHECK: Binary Function "foo" after finalize-functions
+
+  .text
+  .globl  foo
+  .type foo, %function
+foo:
+.cfi_startproc
+.entry_bb:
+     paciasp
+    .cfi_negate_ra_state     // indicating that paciasp changed the RA state to signed
+# FDATA: 1 foo #.entry_bb# 10
+    cmp x0, #0
+    b.eq .Lcold_bb1
+    autiasp
+    .cfi_negate_ra_state     // indicating that autiasp changed the RA state to unsigned
+    ret
+    .cfi_negate_ra_state     // ret has unsigned RA state, but the next inst (autiasp) has signed RA state
+.Lcold_bb1:
+    autiasp
+    .cfi_negate_ra_state     // indicating that autiasp changed the RA state to unsigned
+    ret
+.cfi_endproc
+  .size foo, .-foo
+
+## Force relocation mode.
+.reloc 0, R_AARCH64_NONE

>From 6e13a3549b7981e8382235829c5d715714ec3ec8 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 11 Aug 2025 13:04:15 +0000
Subject: [PATCH 17/25] [BOLT] Remove unnecessary script, and rewrite unit test
 using it

- the checks that match_dwarf.py helped with can also be done using BOLT
  logs.
- rewrote the negate-ra-state.s test to only rely on logs to verify
  correct OpNegateRAState CFI generation.
---
 bolt/test/AArch64/negate-ra-state.s | 100 +++++++++++++-------
 bolt/test/lit.cfg.py                |   7 --
 bolt/test/match_dwarf.py            | 137 ----------------------------
 3 files changed, 67 insertions(+), 177 deletions(-)
 delete mode 100755 bolt/test/match_dwarf.py

diff --git a/bolt/test/AArch64/negate-ra-state.s b/bolt/test/AArch64/negate-ra-state.s
index 11c511a254c71..65483dfaf28e5 100644
--- a/bolt/test/AArch64/negate-ra-state.s
+++ b/bolt/test/AArch64/negate-ra-state.s
@@ -1,42 +1,76 @@
+# Checking that .cfi-negate_ra_state directives are emitted in the same location as in the input in the case of no optimizations.
+
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.exe.bolt --no-threads --print-all | FileCheck %s --check-prefix=CHECK-BOLT
+
+# Check that the negate-ra-state at the start of bar is not discarded.
+# If it was discarded, MarkRAState would report bar as having inconsistent RAStates.
+# CHECK-BOLT-NOT: BOLT-INFO: inconsistent RAStates in function foo
+# CHECK-BOLT-NOT: BOLT-INFO: inconsistent RAStates in function bar
+
+# Check that OpNegateRAState CFIs are generated correctly.
+# CHECK-BOLT: Binary Function "foo" after insert-negate-ra-state-pass {
+# CHECK-BOLT:         paciasp
+# CHECK-BOLT-NEXT:    OpNegateRAState
+
+# CHECK-BOLT:      DWARF CFI Instructions:
+# CHECK-BOLT-NEXT:     0:  OpNegateRAState
+# CHECK-BOLT-NEXT: End of Function "foo"
+
+# CHECK-BOLT: Binary Function "bar" after insert-negate-ra-state-pass {
+# CHECK-BOLT:         OpNegateRAState
+# CHECK-BOLT-NEXT:    mov     x1, #0x0
+# CHECK-BOLT-NEXT:    mov     x1, #0x1
+# CHECK-BOLT-NEXT:    autiasp
+# CHECK-BOLT-NEXT:    OpNegateRAState
+# CHECK-BOLT-NEXT:    ret
 
-# RUN: llvm-objdump %t.exe -d > %t.exe.dump
-# RUN: llvm-objdump --dwarf=frames %t.exe > %t.exe.dump-dwarf
-# RUN: match-dwarf %t.exe.dump %t.exe.dump-dwarf foo > %t.match-dwarf.txt
+# CHECK-BOLT:     DWARF CFI Instructions:
+# CHECK-BOLT-NEXT:     0:  OpNegateRAState
+# CHECK-BOLT-NEXT:     1:  OpNegateRAState
+# CHECK-BOLT-NEXT: End of Function "bar"
 
-# RUN: llvm-bolt %t.exe -o %t.exe.bolt
+# End of negate-ra-state insertion logs for foo and bar.
+# CHECK: Binary Function "_start" after insert-negate-ra-state-pass {
 
-# RUN: llvm-objdump %t.exe.bolt -d > %t.exe.bolt.dump
-# RUN: llvm-objdump --dwarf=frames %t.exe.bolt  > %t.exe.bolt.dump-dwarf
-# RUN: match-dwarf %t.exe.bolt.dump %t.exe.bolt.dump-dwarf foo > %t.bolt.match-dwarf.txt
+# Check that the functions are in the new .text section
+# RUN: llvm-objdump %t.exe.bolt -d -j .text | FileCheck %s --check-prefix=CHECK-OBJDUMP
+# CHECK-OBJDUMP: <foo>:
+# CHECK-OBJDUMP: <bar>:
 
-# RUN: diff %t.match-dwarf.txt %t.bolt.match-dwarf.txt
 
-	.text
-	.globl	foo
-	.p2align	2
-	.type	foo, at function
+  .text
+  .globl  foo
+  .p2align        2
+  .type   foo, at function
 foo:
-	.cfi_startproc
-	hint	#25
-	.cfi_negate_ra_state
-	sub	sp, sp, #16
-	stp	x29, x30, [sp, #16]             // 16-byte Folded Spill
-	.cfi_def_cfa_offset 16
-	str	w0, [sp, #12]
-	ldr	w8, [sp, #12]
-	add	w0, w8, #1
-	ldp	x29, x30, [sp, #16]             // 16-byte Folded Reload
-	add	sp, sp, #16
-	hint	#29
-	.cfi_negate_ra_state
-	ret
-.Lfunc_end1:
-	.size	foo, .Lfunc_end1-foo
-	.cfi_endproc
-
-	.global _start
-	.type _start, %function
+  .cfi_startproc
+  paciasp
+  .cfi_negate_ra_state
+  mov x1, #0
+  b bar
+  .cfi_endproc
+  .size   foo, .-foo
+
+
+
+  .text
+  .globl  bar
+  .p2align        2
+  .type   bar, at function
+bar:
+  .cfi_startproc
+  .cfi_negate_ra_state    // Indicating that RA is signed from the start of bar.
+  mov x1, #0
+  mov x1, #1
+  autiasp
+  .cfi_negate_ra_state
+  ret
+  .cfi_endproc
+  .size   bar, .-bar
+
+  .global _start
+  .type _start, %function
 _start:
-	b foo
+  b foo
diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py
index 60617cd76f968..0d05229be2bf3 100644
--- a/bolt/test/lit.cfg.py
+++ b/bolt/test/lit.cfg.py
@@ -89,7 +89,6 @@
 config.substitutions.append(("%cxxflags", ""))
 
 link_fdata_cmd = os.path.join(config.test_source_root, "link_fdata.py")
-match_dwarf_cmd = os.path.join(config.test_source_root, "match_dwarf.py")
 
 tool_dirs = [config.llvm_tools_dir, config.test_source_root]
 
@@ -132,12 +131,6 @@
     ToolSubst("llvm-readobj", unresolved="fatal"),
     ToolSubst("llvm-dwp", unresolved="fatal"),
     ToolSubst("split-file", unresolved="fatal"),
-    ToolSubst(
-        "match-dwarf",
-        command=sys.executable,
-        unresolved="fatal",
-        extra_args=[match_dwarf_cmd],
-    ),
 ]
 llvm_config.add_tool_substitutions(tools, tool_dirs)
 
diff --git a/bolt/test/match_dwarf.py b/bolt/test/match_dwarf.py
deleted file mode 100755
index 3d3ab22042d5c..0000000000000
--- a/bolt/test/match_dwarf.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python3
-
-# This tool helps matching dwarf dumps
-# (= the output from running llvm-objdump --dwarf=frames),
-# by address to function names (which are parsed from a normal objdump).
-# The script is used for checking if .cfi_negate_ra_state CFIs
-# are generated by BOLT the same way they are generated by LLVM.
-# The script is called twice in unittests: once with the objdumps of
-# the BOLT input binary, and once with the output binary from BOLT.
-# We output the offsets of .cfi_negate_ra_state instructions from the
-# function's start address to see that BOLT can generate them to the same
-# locations.
-# Because we check the location, this is only useful for testing without
-# optimization flags, so `llvm-bolt input.exe -o output.exe`
-
-
-import argparse
-import subprocess
-import sys
-import re
-
-
-class NameDwarfPair(object):
-    def __init__(self, name, body):
-        self.name = name
-        self.body = body
-        self.finalized = False
-
-    def append(self, body_line):
-        # only store elements into the body until the first whitespace line is encountered.
-        if body_line.isspace():
-            self.finalized = True
-        if not self.finalized:
-            self.body += body_line
-
-    def print(self):
-        print(self.name)
-        print(self.body)
-
-    def parse_negate_offsets(self):
-        """
-        Create a list of locations/offsets of the negate_ra_state CFIs in the
-        dwarf entry. To find offsets for each, we match the DW_CFA_advance_loc
-        entries, and sum up their values.
-        """
-        negate_offsets = []
-        loc = 0
-        # TODO: make sure this is not printed in hex
-        re_advloc = r"DW_CFA_advance_loc: (\d+)"
-
-        for line in self.body.splitlines():
-            # if line matches advance_loc int
-            match = re.search(re_advloc, line)
-            if match:
-                loc += int(match.group(1))
-            if "DW_CFA_AARCH64_negate_ra_state" in line:
-                negate_offsets.append(loc)
-
-        self.negate_offsets = negate_offsets
-
-    def __eq__(self, other):
-        return self.name == other.name and self.negate_offsets == other.negate_offsets
-
-
-def extract_function_addresses(objdump):
-    """
-    Parse and return address-to-name dictionary from objdump file.
-    Function names in the objdump look like this:
-        000123abc <foo>:
-    We create a dict from the addr (000123abc), to the name (foo).
-    """
-    addr_name_dict = dict()
-    re_function = re.compile(r"^([0-9a-fA-F]+)\s<(.*)>:$")
-    with open(objdump, "r") as f:
-        for line in f.readlines():
-            match = re_function.match(line)
-            if not match:
-                continue
-            m_addr = match.groups()[0]
-            m_name = match.groups()[1]
-            addr_name_dict[int(m_addr, 16)] = m_name
-
-    return addr_name_dict
-
-
-def match_dwarf_to_name(dwarfdump, addr_name_dict):
-    """
-    Parse dwarf dump, and match names to blocks using the dict from the objdump.
-    Return a list of NameDwarfPairs.
-    The matched lines look like this:
-    000123 000456 000789 FDE cie=000000  pc=0123abc...0456def
-    We do not have the function name for this, only the PC range it applies to.
-    We match the pc=0123abc (the start address), and find the matching name from
-    the addr_name_dict.
-    The resultint NameDwarfPair will hold the lines this header applied to, and
-    instead of the header with the addresses, it will just have the function name.
-    """
-    re_address_line = re.compile(r".*pc=([0-9a-fA-F]+)\.\.\.([0-9a-fA-F]+)")
-    with open(dwarfdump, "r") as dw:
-        functions = []
-        for line in dw.readlines():
-            match = re_address_line.match(line)
-            if not match:
-                if len(functions) > 0:
-                    functions[-1].append(line)
-                continue
-            pc_start_address = match.groups()[0]
-            name = addr_name_dict.get(int(pc_start_address, 16))
-            functions.append(NameDwarfPair(name, ""))
-
-        return functions
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("objdump", help="Objdump file")
-    parser.add_argument(
-        "dwarfdump", help="dwarf dump file created with 'llvm-objdump --dwarf=frames'"
-    )
-    parser.add_argument("function", help="Function to search CFIs in.")
-
-    args = parser.parse_args()
-
-    addr_name_dict = extract_function_addresses(args.objdump)
-    functions = match_dwarf_to_name(args.dwarfdump, addr_name_dict)
-
-    for f in functions:
-        if f.name == args.function:
-            f.parse_negate_offsets()
-            print(f.negate_offsets)
-            break
-    else:
-        print(f"{args.function} not found")
-        exit(-1)
-
-
-main()

>From 04452435d046a3749d7345574749c4452776710f Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 11 Aug 2025 12:46:30 +0000
Subject: [PATCH 18/25] [BOLT][NFC] Simplify RAState tracking

- Remove 'Authenticating' and 'Signing' MCAnnotations.
- The same logic can be done using only 'Signed' and 'Unsigned' annotations.
- To check if an instruction is signing or authenticating, we can use
  the PSignOnLR, and PAuthOnLR functions.
---
 bolt/include/bolt/Core/MCPlus.h               |  2 --
 bolt/include/bolt/Core/MCPlusBuilder.h        | 12 --------
 .../bolt/Passes/InsertNegateRAStatePass.h     |  9 ------
 bolt/lib/Core/MCPlusBuilder.cpp               | 21 +-------------
 bolt/lib/Passes/InsertNegateRAStatePass.cpp   | 28 ++-----------------
 bolt/lib/Passes/MarkRAStates.cpp              |  8 ++++--
 6 files changed, 10 insertions(+), 70 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h
index a95bba36c5a6e..8b1f7033ffad1 100644
--- a/bolt/include/bolt/Core/MCPlus.h
+++ b/bolt/include/bolt/Core/MCPlus.h
@@ -72,9 +72,7 @@ class MCAnnotation {
     kLabel,               /// MCSymbol pointing to this instruction.
     kSize,                /// Size of the instruction.
     kDynamicBranch,       /// Jit instruction patched at runtime.
-    kSigning,             /// Inst is a signing instruction (paciasp, etc.).
     kSigned,              /// Inst is in a range where RA is signed.
-    kAuthenticating,      /// Authenticating inst (e.g. autiasp).
     kUnsigned,            /// Inst is in a range where RA is unsigned.
     kRememberState,       /// Inst has rememberState CFI.
     kRestoreState,        /// Inst has restoreState CFI.
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index efea2f4590ee7..6295d4f43d191 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1342,18 +1342,6 @@ class MCPlusBuilder {
   /// Return true if \p Inst has Signed RA annotation.
   bool isRASigned(const MCInst &Inst) const;
 
-  /// Stores RA Signing annotation on \p Inst.
-  void setRASigning(MCInst &Inst) const;
-
-  /// Return true if \p Inst has Signing RA annotation.
-  bool isRASigning(const MCInst &Inst) const;
-
-  /// Stores Authenticating annotation on \p Inst.
-  void setAuthenticating(MCInst &Inst) const;
-
-  /// Return true if \p Inst has Authenticating annotation.
-  bool isAuthenticating(const MCInst &Inst) const;
-
   /// Stores RA Unsigned annotation on \p Inst.
   void setRAUnsigned(MCInst &Inst) const;
 
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 0b1654c1380b9..836948bf5e9c0 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -28,15 +28,6 @@ class InsertNegateRAState : public BinaryFunctionPass {
   void runOnFunction(BinaryFunction &BF);
 
 private:
-  /// Loops over all instructions and adds OpNegateRAState CFI
-  /// after any pointer signing or authenticating instructions,
-  /// which operate on the LR, except fused pauth + ret instructions
-  /// (such as RETAA). Normal pauth and psign instructions are "special cases",
-  /// meaning they always need an OpNegateRAState CFI after them.
-  /// Fused pauth + ret instructions are not, they work as any other
-  /// instruction.
-  /// Returns true, if any OpNegateRAState CFIs were added.
-  bool addNegateRAStateAfterPSignOrPAuth(BinaryFunction &BF);
   /// Because states are tracked as MCAnnotations on individual instructions,
   /// newly inserted instructions do not have a state associated with them.
   /// New states are "inherited" from the last known state.
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index aa2a784442a5f..5afc14a6e8854 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -185,24 +185,6 @@ bool MCPlusBuilder::isRASigned(const MCInst &Inst) const {
   return hasAnnotation(Inst, MCAnnotation::kSigned);
 }
 
-void MCPlusBuilder::setRASigning(MCInst &Inst) const {
-  assert(!hasAnnotation(Inst, MCAnnotation::kSigning));
-  setAnnotationOpValue(Inst, MCAnnotation::kSigning, true);
-}
-
-bool MCPlusBuilder::isRASigning(const MCInst &Inst) const {
-  return hasAnnotation(Inst, MCAnnotation::kSigning);
-}
-
-void MCPlusBuilder::setAuthenticating(MCInst &Inst) const {
-  assert(!hasAnnotation(Inst, MCAnnotation::kAuthenticating));
-  setAnnotationOpValue(Inst, MCAnnotation::kAuthenticating, true);
-}
-
-bool MCPlusBuilder::isAuthenticating(const MCInst &Inst) const {
-  return hasAnnotation(Inst, MCAnnotation::kAuthenticating);
-}
-
 void MCPlusBuilder::setRAUnsigned(MCInst &Inst) const {
   assert(!hasAnnotation(Inst, MCAnnotation::kUnsigned));
   setAnnotationOpValue(Inst, MCAnnotation::kUnsigned, true);
@@ -213,8 +195,7 @@ bool MCPlusBuilder::isRAUnsigned(const MCInst &Inst) const {
 }
 
 bool MCPlusBuilder::isRAStateUnknown(const MCInst &Inst) const {
-  return !(isRAUnsigned(Inst) || isRASigned(Inst) || isRASigning(Inst) ||
-           isAuthenticating(Inst));
+  return !(isRAUnsigned(Inst) || isRASigned(Inst));
 }
 
 std::optional<MCLandingPad> MCPlusBuilder::getEHInfo(const MCInst &Inst) const {
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 60af3485ea4d1..273944b860e36 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -34,9 +34,6 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
     return;
   }
 
-  // Attach .cfi_negate_ra_state to the "trivial" cases first.
-  addNegateRAStateAfterPSignOrPAuth(BF);
-
   inferUnknownStates(BF);
 
   for (FunctionFragment &FF : BF.getLayout().fragments()) {
@@ -67,24 +64,6 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   }
 }
 
-bool InsertNegateRAState::addNegateRAStateAfterPSignOrPAuth(
-    BinaryFunction &BF) {
-  BinaryContext &BC = BF.getBinaryContext();
-  bool FoundAny = false;
-  for (BinaryBasicBlock &BB : BF) {
-    for (auto Iter = BB.begin(); Iter != BB.end(); ++Iter) {
-      MCInst &Inst = *Iter;
-      if (BC.MIB->isPSignOnLR(Inst) ||
-          (BC.MIB->isPAuthOnLR(Inst) && !BC.MIB->isPAuthAndRet(Inst))) {
-        Iter = BF.addCFIInstruction(
-            &BB, Iter + 1, MCCFIInstruction::createNegateRAState(nullptr));
-        FoundAny = true;
-      }
-    }
-  }
-  return FoundAny;
-}
-
 void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
                                                      FunctionFragment &FF) {
   BinaryContext &BC = BF.getBinaryContext();
@@ -102,8 +81,7 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
       });
   // If a function is already split in the input, the first FF can also start
   // with Signed state. This covers that scenario as well.
-  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin())) ||
-      BC.MIB->isAuthenticating(*((*FirstNonEmpty)->begin()))) {
+  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
     BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
                          MCCFIInstruction::createNegateRAState(nullptr));
   }
@@ -121,10 +99,10 @@ void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
         continue;
 
       if (!FirstIter && BC.MIB->isRAStateUnknown(Inst)) {
-        if (BC.MIB->isRASigned(PrevInst) || BC.MIB->isRASigning(PrevInst)) {
+        if (BC.MIB->isRASigned(PrevInst) || BC.MIB->isPSignOnLR(PrevInst)) {
           BC.MIB->setRASigned(Inst);
         } else if (BC.MIB->isRAUnsigned(PrevInst) ||
-                   BC.MIB->isAuthenticating(PrevInst)) {
+                   BC.MIB->isPAuthOnLR(PrevInst)) {
           BC.MIB->setRAUnsigned(Inst);
         }
       } else {
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index aacb14fb7dcc9..8ce1b67814608 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -78,7 +78,9 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
           BF.setIgnored();
           return;
         }
-        BC.MIB->setRASigning(Inst);
+        // The signing instruction itself is unsinged, but the next will be
+        // signed.
+        BC.MIB->setRAUnsigned(Inst);
       } else if (BC.MIB->isPAuthOnLR(Inst)) {
         if (!RAState) {
           // RA authenticating instructions should only follow signed RA state.
@@ -89,7 +91,9 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
           BF.setIgnored();
           return;
         }
-        BC.MIB->setAuthenticating(Inst);
+        // The authenticating instruction itself is signed, but the next will be
+        // unsigned.
+        BC.MIB->setRASigned(Inst);
       } else if (RAState) {
         BC.MIB->setRASigned(Inst);
       } else {

>From 9a4ff401d18dd203c0ec79c9cc2c8fbe1532f5e2 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Wed, 13 Aug 2025 12:44:50 +0000
Subject: [PATCH 19/25] [BOLT] Add negate-ra-state-reorder test

- Test checks that that reordering BBs change the number of
  negate-ra-state CFIs, and their locations.
---
 bolt/test/AArch64/negate-ra-state-reorder.s | 73 +++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 bolt/test/AArch64/negate-ra-state-reorder.s

diff --git a/bolt/test/AArch64/negate-ra-state-reorder.s b/bolt/test/AArch64/negate-ra-state-reorder.s
new file mode 100644
index 0000000000000..2659f75aff9c9
--- /dev/null
+++ b/bolt/test/AArch64/negate-ra-state-reorder.s
@@ -0,0 +1,73 @@
+# Checking that after reordering BasicBlocks, the generated OpNegateRAState instructions
+# are placed where the RA state is different between two consecutive instructions.
+# This case demonstrates, that the input might have a different amount than the output:
+# input has 4, but output only has 3.
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.exe.bolt --no-threads --reorder-blocks=reverse \
+# RUN: --print-cfg --print-after-lowering --print-only foo | FileCheck %s
+
+# Check that the reordering succeeded.
+# CHECK: Binary Function "foo" after building cfg {
+# CHECK: BB Layout   : .LBB00, .Ltmp2, .Ltmp0, .Ltmp1
+# CHECK: Binary Function "foo" after inst-lowering {
+# CHECK: BB Layout   : .LBB00, .Ltmp1, .Ltmp0, .Ltmp2
+
+
+# Check the generated CFIs.
+# CHECK:         OpNegateRAState
+# CHECK-NEXT:    mov     x2, #0x6
+
+# CHECK:         autiasp
+# CHECK-NEXT:    OpNegateRAState
+# CHECK-NEXT:    ret
+
+# CHECK:         paciasp
+# CHECK-NEXT:    OpNegateRAState
+
+# CHECK:         DWARF CFI Instructions:
+# CHECK-NEXT:        0:  OpNegateRAState
+# CHECK-NEXT:        1:  OpNegateRAState
+# CHECK-NEXT:        2:  OpNegateRAState
+# CHECK-NEXT:    End of Function "foo"
+
+  .text
+  .globl  foo
+  .p2align        2
+  .type   foo, at function
+foo:
+  .cfi_startproc
+  // RA is unsigned
+  mov x1, #0
+  mov x1, #1
+  mov x1, #2
+  // jump into the signed "range"
+  b .Lmiddle
+.Lback:
+// sign RA
+  paciasp
+  .cfi_negate_ra_state
+  mov x2, #3
+  mov x2, #4
+  // skip unsigned instructions
+  b .Lcont
+  .cfi_negate_ra_state
+.Lmiddle:
+// RA is unsigned
+  mov x4, #5
+  b .Lback
+  .cfi_negate_ra_state
+.Lcont:
+// continue in signed state
+  mov x2, #6
+  autiasp
+  .cfi_negate_ra_state
+  ret
+  .cfi_endproc
+  .size   foo, .-foo
+
+  .global _start
+  .type _start, %function
+_start:
+  b foo

>From 5d500d52b5feb3a8d40a7dd6416c9a50f942ff83 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Thu, 14 Aug 2025 11:36:14 +0000
Subject: [PATCH 20/25] [BOLT] Print stats in MarkRAStates, InsertNegateRAState

- print statistics on the output about function getting Ignored in
  MarkRAStates, and on rewriting OpNegateRAState in
  InsertNegateRAStatePass.
- improve error messages in MarkRAStates
---
 .../bolt/Passes/InsertNegateRAStatePass.h     |  1 +
 bolt/include/bolt/Passes/MarkRAStates.h       |  5 ++-
 bolt/lib/Passes/InsertNegateRAStatePass.cpp   |  8 +++-
 bolt/lib/Passes/MarkRAStates.cpp              | 44 +++++++++++--------
 bolt/test/AArch64/negate-ra-state-incorrect.s | 11 ++---
 5 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 836948bf5e9c0..9acd28424c59a 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -28,6 +28,7 @@ class InsertNegateRAState : public BinaryFunctionPass {
   void runOnFunction(BinaryFunction &BF);
 
 private:
+  uint64_t FunctionsModified{0};
   /// Because states are tracked as MCAnnotations on individual instructions,
   /// newly inserted instructions do not have a state associated with them.
   /// New states are "inherited" from the last known state.
diff --git a/bolt/include/bolt/Passes/MarkRAStates.h b/bolt/include/bolt/Passes/MarkRAStates.h
index e7a49f813b6a7..8870540aa7e9e 100644
--- a/bolt/include/bolt/Passes/MarkRAStates.h
+++ b/bolt/include/bolt/Passes/MarkRAStates.h
@@ -25,7 +25,10 @@ class MarkRAStates : public BinaryFunctionPass {
 
   /// Pass entry point
   Error runOnFunctions(BinaryContext &BC) override;
-  void runOnFunction(BinaryFunction &BF);
+  bool runOnFunction(BinaryFunction &BF);
+
+private:
+  uint64_t FunctionsIgnored{0};
 };
 
 } // namespace bolt
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 273944b860e36..7b93a0ca2b251 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -115,11 +115,12 @@ void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
 
 Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
-    if (BF.containedNegateRAState()) {
+    if (BF.containedNegateRAState() && !BF.isIgnored()) {
       // We can skip functions which did not include negate-ra-state CFIs. This
       // includes code using pac-ret hardening as well, if the binary is
       // compiled with `-fno-exceptions -fno-unwind-tables
       // -fno-asynchronous-unwind-tables`
+      FunctionsModified++;
       runOnFunction(BF);
     }
   };
@@ -128,6 +129,11 @@ Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
       BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
       "InsertNegateRAStatePass");
 
+  BC.outs() << "BOLT-INFO: rewritten pac-ret DWARF info in "
+            << FunctionsModified << " out of " << BC.getBinaryFunctions().size()
+            << " functions "
+            << format("(%.2lf%%).\n", (100.0 * FunctionsModified) /
+                                          BC.getBinaryFunctions().size());
   return Error::success();
 }
 
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index 8ce1b67814608..2e766a6ed9be6 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -31,10 +31,7 @@ using namespace llvm;
 namespace llvm {
 namespace bolt {
 
-void MarkRAStates::runOnFunction(BinaryFunction &BF) {
-
-  if (BF.isIgnored())
-    return;
+bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
 
   BinaryContext &BC = BF.getBinaryContext();
 
@@ -49,10 +46,9 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
         // annotation.
         BF.setIgnored();
         BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
-                  << BF.getPrintName() << "\n";
-        BC.outs()
-            << "BOLT-INFO: ptr sign/auth inst without .cfi_negate_ra_state\n";
-        return;
+                  << BF.getPrintName()
+                  << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+        return false;
       }
     }
   }
@@ -72,11 +68,10 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
         if (RAState) {
           // RA signing instructions should only follow unsigned RA state.
           BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
-                    << BF.getPrintName() << "\n";
-          BC.outs() << "BOLT-INFO: ptr signing inst encountered in Signed RA "
-                       "state.\n";
+                    << BF.getPrintName()
+                    << ": ptr signing inst encountered in Signed RA state\n";
           BF.setIgnored();
-          return;
+          return false;
         }
         // The signing instruction itself is unsinged, but the next will be
         // signed.
@@ -85,11 +80,11 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
         if (!RAState) {
           // RA authenticating instructions should only follow signed RA state.
           BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
-                    << BF.getPrintName() << "\n";
-          BC.outs() << "BOLT-INFO: ptr authenticating inst encountered in "
-                       "Unsigned RA state.\n";
+                    << BF.getPrintName()
+                    << ": ptr authenticating inst encountered in Unsigned RA "
+                       "state\n";
           BF.setIgnored();
-          return;
+          return false;
         }
         // The authenticating instruction itself is signed, but the next will be
         // unsigned.
@@ -120,16 +115,19 @@ void MarkRAStates::runOnFunction(BinaryFunction &BF) {
       }
     }
   }
+  return true;
 }
 
 Error MarkRAStates::runOnFunctions(BinaryContext &BC) {
   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
-    if (BF.containedNegateRAState()) {
+    if (BF.containedNegateRAState() && !BF.isIgnored()) {
       // We can skip functions which did not include negate-ra-state CFIs. This
       // includes code using pac-ret hardening as well, if the binary is
       // compiled with `-fno-exceptions -fno-unwind-tables
       // -fno-asynchronous-unwind-tables`
-      runOnFunction(BF);
+      if (!runOnFunction(BF)) {
+        FunctionsIgnored++;
+      }
     }
   };
 
@@ -137,6 +135,16 @@ Error MarkRAStates::runOnFunctions(BinaryContext &BC) {
       BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
       "MarkRAStates");
 
+  int Total = llvm::count_if(
+      BC.getBinaryFunctions(),
+      [&](std::pair<const unsigned long, BinaryFunction> &P) {
+        return P.second.containedNegateRAState() && !P.second.isIgnored();
+      });
+  BC.outs() << "BOLT-INFO: MarkRAStates ran on " << Total
+            << " functions. Ignored " << FunctionsIgnored << " functions "
+            << format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+            << " because of CFI inconsistencies.\n";
+
   return Error::success();
 }
 
diff --git a/bolt/test/AArch64/negate-ra-state-incorrect.s b/bolt/test/AArch64/negate-ra-state-incorrect.s
index ad922d83c6d61..14d2c384a877d 100644
--- a/bolt/test/AArch64/negate-ra-state-incorrect.s
+++ b/bolt/test/AArch64/negate-ra-state-incorrect.s
@@ -10,14 +10,9 @@
 # RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -o %t.exe.bolt --no-threads | FileCheck %s --check-prefix=CHECK-BOLT
 
-# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function foo
-# CHECK-BOLT-NEXT: BOLT-INFO: ptr authenticating inst encountered in Unsigned RA state.
-
-# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function bar
-# CHECK-BOLT-NEXT: BOLT-INFO: ptr signing inst encountered in Signed RA state
-
-# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function baz
-# CHECK-BOLT-NEXT: BOLT-INFO: ptr sign/auth inst without .cfi_negate_ra_state
+# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function foo: ptr authenticating inst encountered in Unsigned RA state
+# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function bar: ptr signing inst encountered in Signed RA state
+# CHECK-BOLT: BOLT-INFO: inconsistent RAStates in function baz: ptr sign/auth inst without .cfi_negate_ra_state
 
 # Check that the incorrect functions got ignored, so they are not in the new .text section
 # RUN: llvm-objdump %t.exe.bolt -d -j .text | FileCheck %s --check-prefix=CHECK-OBJDUMP

>From bbc6981deee2baede8ca115599cd4b7f0cf77a1e Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Wed, 21 May 2025 18:40:10 +0200
Subject: [PATCH 21/25] [BOLT] Added docs/PacRetDesign.md

---
 bolt/docs/PacRetDesign.md | 154 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 bolt/docs/PacRetDesign.md

diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
new file mode 100644
index 0000000000000..b6046d6d70077
--- /dev/null
+++ b/bolt/docs/PacRetDesign.md
@@ -0,0 +1,154 @@
+# Optimizing binaries with pac-ret hardening
+
+This is a design document about processing the `DW_CFA_AARCH64_negate_ra_state` DWARF instruction in BOLT. As is describes internal design decisions, the intended audience is BOLT developers. The document is an updated version of the [RFC posted on the LLVM Discourse](https://discourse.llvm.org/t/rfc-bolt-aarch64-handle-opnegaterastate-to-enable-optimizing-binaries-with-pac-ret-hardening/86594).
+
+
+`DW_CFA_AARCH64_negate_ra_state` is also referred to as  `.cfi_negate_ra_state` in assembly, or `OpNegateRAState` is BOLT sources. In this document, I will use **negate-ra-state** as a shorthand.
+
+## Introduction
+
+### Pointer Authentication
+
+Refer to the [pac-ret section of the BOLT-binary-analysis document](BinaryAnalysis.md#pac-ret-analysis).
+
+### DW_CFA_AARCH64_negate_ra_state
+
+The negate-ra-state CFI is a vendor-specific Call Frame Instruction defined in the [Arm ABI](https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst#id1).
+
+```
+The DW_CFA_AARCH64_negate_ra_state operation negates bit[0] of the RA_SIGN_STATE pseudo-register.
+```
+
+This bit indicates to the unwinder whether the current return address is signed or not (hence the name). The unwinder uses this information to authenticate the pointer, and remove the Pointer Authentication Code (PAC) bits. Incorrect negate-ra-state placement can lead to the unwinder trying to authenticate an unsigned pointer (which segfaults), or skipping authenticating a signed pointer, and trying to access an incorrect location (also leading to a segfault).
+
+(Note: not *all* unwinders do this. Some use the `xpac` instruction to strip the PAC bits without authenticating the pointer. This is incorrect, as it allows control-flow modification in the case of unwinding.)
+
+There are no DWARF instructions to directly set or clear the RA State. However, two other CFIs can also affect the RA state:
+- `DW_CFA_remember_state`: this CFI stores register rules onto an implicit stack.
+- `DW_CFA_restore_state`:  this CFI pops rules from this stack.
+
+Example:
+
+| CFI                            | Effect on RA state             |
+| ------------------------------ | ------------------------------ |
+| (default)                      | 0                              |
+| DW_CFA_AARCH64_negate_ra_state | 0 -> 1                         |
+| DW_CFA_remember_state          | 1 pushed to the stack          |
+| DW_CFA_AARCH64_negate_ra_state | 1 -> 0                         |
+| DW_CFA_restore_state           | 0 -> 1 (popped from the stack) |
+
+The Arm ABI also defines the DW_CFA_AARCH64_negate_ra_state_with_pc CFI, but it is not widely used, and is [likely to become deprecated](https://github.com/ARM-software/abi-aa/issues/327).
+
+### Where are these CFIs needed?
+
+In all locations, where two consecutive instructions have different RA state, this need to be indicated to the unwinder. This happens at pointer signing and authenticating. The other case where two consecutive instructions have different RA state, but neither of them is signing or authenticating means that they are not next to each other in control flow. One is part of an execution path with signed RA, the other is part of a path with an unsigned RA.
+
+In the example below, the first BasicBlock ends in a conditional branch, and jumps to two different BasicBlocks, each with their own authentication, and return. The instructions on the border of the second and third BasicBlock have different RA states. The `ret` at the end of the second BasicBlock is in unsigned state. The start of the third BasicBlock is after the `paciasp` in the control flow, but before the authentication. In this case, a negate-ra-state is needed at the end of the second BasicBlock.
+
+```
+        +----------------+
+        |     paciasp    |
+        |                |
+        |      b.cc      |
+        +--------+-------+
+                 |
++----------------+
+|                |
+|       +--------v-------+
+|       |                |
+|       |    autiasp     |
+|       |      ret       |   // RA: unsigned
+|       +----------------+
++----------------+
+                 |
+        +--------v-------+  // RA: signed
+        |                |
+        |     autiasp    |
+        |      ret       |
+        +----------------+
+```
+
+> [!important]
+> The unwinder does not follow the control flow graph. It reads unwind information in the layout order.
+
+Because these locations are dependent on how the function layout looks, negate-ra-state CFIs will become invalid during BasicBlock reordering.
+
+## Solution design
+
+The patch introduces two new passes:
+1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the CFIs in the input binary
+2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the correct places: wherever there is a state change between two consecutive instructions in the layout order.
+
+To track metadata on individual instructions, the `MCAnnotation` class was extended. These also have helper function in `MCPlusBuilder`.
+
+### Saving annotations at CFI reading
+
+CFIs are read and added to BinaryFunctions in `CFIReaderWriter::FillCFIInfoFor`. At this point, we add MCAnnotations about negate-ra-state, remember-state and restore-state CFIs to the instructions they refer to. This is to not interfere with the CFI processing that already happens in BOLT (e.g. remember-state and restore-state CFIs are removed in `normalizeCFIState` for reasons unrelated to PAC).
+
+As we add the MCAnnotations *to instructions*, we have to account for the case where the function starts with a CFI altering the RA state. If a function starts with a negate-ra-state CFI for example, we cannot save the annotation on the first instruction, because that itself should already be signed. This is why all BinaryFunctions have an `initialRAState` bool. If the `Offset` the CFI refers to is zero, we don't store an annotation, but set the `initialRAState` in `FillCFIInfoFor`. This info is then used in `MarkRAStates`.
+
+### Binaries without DWARF info
+
+In some cases, the DWARF tables are stripped from the binary. These programs usually have some other unwind-mechanism. To account for code that uses Pointer Authentication, but does not have DWARF CFIs, the passes only run on functions that had at least one negate-ra-state CFI. This is marked during CFI reading.
+
+This also makes sure that the passes don't run on functions that do not store the return address to the stack, and don't need Pointer Authentication, saving on runtime overhead.
+
+In summary:
+- pointer auth is not used: no change, the new passes do not run.
+- pointer auth is used, but DWARF info is stripped: no change, the new passes do not run.
+- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the negate-ra-state CFI.
+
+### MarkRAStates Pass
+
+This pass runs before optimizations reorder anything.
+
+It processes MCAnnotations generated during the CFI reading stage to check if instructions have either of the three CFIs that can modify RA state:
+- negate-ra-state
+- remember-state
+- restore-state
+
+Then it adds new MCAnnotations to each instruction, indicating their RA state. Those annotations are:
+- Signed
+- Unsigned
+
+Below is a simple example, that shows the two different type of annotations: what we have before the pass, and after it.
+
+| Instruction                 | Before          |  After   |
+| --------------------------- | --------------- | -------- |
+| paciasp                     | negate-ra-state | unsigned |
+| stp	x29, x30, [sp, #-0x10]! |                 | signed   |
+| mov	x29, sp                 |                 | signed   |
+| ldp	x29, x30, [sp], #0x10   |                 | signed   |
+| autiasp                     | negate-ra-state | signed   |
+| ret                         |                 | unsigned |
+
+##### Error handling in MarkRAState Pass:
+
+Whenever the MarkRAStates pass finds inconsistencies in the current BinaryFunction, it ignores it by calling `BF.setIgnored()`. This prevents BOLT from optimizing that function, but it will still be emitted as part of the original section (`.bolt.org.text`) in its original form.
+
+The inconsistencies are as follows:
+- finding a `pac*` instruction when already in signed state
+- finding an `aut*` instruction when already in unsigned state
+- finding `pac*` and `aut*` instructions without `.cfi_negate_ra_state`.
+
+Users will be informed about the number of ignored function in the pass, and the exact functions ignored.
+
+### InsertNegateRAStatePass
+
+This pass runs after the optimizations are done. In essence, it does the _inverse_ of MarkRAState pass:
+1. it reads the RA state annotations attached to the instructions, and
+2. whenever the state changes, it adds a PseudoInstruction that holds an OpNegateRAState CFI.
+
+##### Covering newly generated instructions:
+
+Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have to know what RA state these have.
+
+The current solution has the `inferUnknownStates` function to cover these, using a fairly simple strategy: unknown states are inherited from last known state. Testing so far has shown this implementation is sufficient, but to prove correctness, we would need to examine all passes that insert new instructions.
+
+### Optimizations requiring special attention
+
+Marking states before optimizations assure that instructions can be moved around freely. The only special case is function splitting. When a function is split, the split part becomes a new function in the emitted binary. For unwinding to work, it needs to "replay" all CFI that lead up to the split point. BOLT does this for other CFIs. As negate-ra-state is not read (only stored as an Annotation), we have to do this "manually" in InsertNegateRAStatePass. Here, if the split part starts with an instruction that has Signed RA state, we add a negate-ra-state CFI to indicate this.
+
+## Option to disallow the feature
+
+To aid debugging, we added the `--disallow-pacret` flag. If the flag is used, and a function `containedNegateRAState()` after `FillCFIInfoFor()`, BOLT exits with an error. With this flag, the feature is on by default.

>From 76ad0580bf354b748de7702c4cf912b79ceeb8c9 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 8 Sep 2025 13:19:33 +0000
Subject: [PATCH 22/25] [BOLT] Review nits

- fix spelling in MarkRAStates
- improve negate-ra-state.s test
- improve pacret-split-funcs.s test
---
 bolt/lib/Passes/MarkRAStates.cpp       |  2 +-
 bolt/test/AArch64/negate-ra-state.s    | 10 +++++-----
 bolt/test/AArch64/pacret-split-funcs.s |  5 +++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index 2e766a6ed9be6..57e727e16074d 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -73,7 +73,7 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
           BF.setIgnored();
           return false;
         }
-        // The signing instruction itself is unsinged, but the next will be
+        // The signing instruction itself is unsigned, the next will be
         // signed.
         BC.MIB->setRAUnsigned(Inst);
       } else if (BC.MIB->isPAuthOnLR(Inst)) {
diff --git a/bolt/test/AArch64/negate-ra-state.s b/bolt/test/AArch64/negate-ra-state.s
index 65483dfaf28e5..30786d4ef9f70 100644
--- a/bolt/test/AArch64/negate-ra-state.s
+++ b/bolt/test/AArch64/negate-ra-state.s
@@ -1,11 +1,16 @@
 # Checking that .cfi-negate_ra_state directives are emitted in the same location as in the input in the case of no optimizations.
 
+# The foo and bar functions are a pair, with the first signing the return address,
+# and the second authenticating it. We have a tailcall between the two.
+# This is testing that BOLT can handle functions starting in signed RA state.
+
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang %cflags  %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -o %t.exe.bolt --no-threads --print-all | FileCheck %s --check-prefix=CHECK-BOLT
 
 # Check that the negate-ra-state at the start of bar is not discarded.
 # If it was discarded, MarkRAState would report bar as having inconsistent RAStates.
+# This is testing the handling of initialRAState on the BinaryFunction.
 # CHECK-BOLT-NOT: BOLT-INFO: inconsistent RAStates in function foo
 # CHECK-BOLT-NOT: BOLT-INFO: inconsistent RAStates in function bar
 
@@ -69,8 +74,3 @@ bar:
   ret
   .cfi_endproc
   .size   bar, .-bar
-
-  .global _start
-  .type _start, %function
-_start:
-  b foo
diff --git a/bolt/test/AArch64/pacret-split-funcs.s b/bolt/test/AArch64/pacret-split-funcs.s
index 6582257948c62..a20d2a3688146 100644
--- a/bolt/test/AArch64/pacret-split-funcs.s
+++ b/bolt/test/AArch64/pacret-split-funcs.s
@@ -36,16 +36,17 @@
 foo:
 .cfi_startproc
 .entry_bb:
+# FDATA: 1 foo #.entry_bb# 10
      paciasp
     .cfi_negate_ra_state     // indicating that paciasp changed the RA state to signed
-# FDATA: 1 foo #.entry_bb# 10
     cmp x0, #0
     b.eq .Lcold_bb1
+.Lfallthrough:
     autiasp
     .cfi_negate_ra_state     // indicating that autiasp changed the RA state to unsigned
     ret
     .cfi_negate_ra_state     // ret has unsigned RA state, but the next inst (autiasp) has signed RA state
-.Lcold_bb1:
+.Lcold_bb1:                  // split point
     autiasp
     .cfi_negate_ra_state     // indicating that autiasp changed the RA state to unsigned
     ret

>From 35306ef3cf370402717b8d1bd0ed8a829f0c4f09 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Mon, 8 Sep 2025 14:29:47 +0000
Subject: [PATCH 23/25] [BOLT] Fix multithreading in #120064

- use SkipPredicates,
- use atomic counters.
---
 bolt/include/bolt/Core/BinaryFunction.h       |  2 +-
 .../bolt/Passes/InsertNegateRAStatePass.h     |  1 -
 bolt/include/bolt/Passes/MarkRAStates.h       |  3 ---
 bolt/lib/Passes/InsertNegateRAStatePass.cpp   | 23 +++++++++++--------
 bolt/lib/Passes/MarkRAStates.cpp              | 23 +++++++++++--------
 5 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 5335ec5a9a7bd..38748bb434ed1 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -148,7 +148,7 @@ class BinaryFunction {
   };
 
   void setContainedNegateRAState() { HadNegateRAState = true; }
-  bool containedNegateRAState() { return HadNegateRAState; }
+  bool containedNegateRAState() const { return HadNegateRAState; }
   void setInitialRAState(bool State) { InitialRAState = State; }
   bool getInitialRAState() { return InitialRAState; }
 
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 9acd28424c59a..836948bf5e9c0 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -28,7 +28,6 @@ class InsertNegateRAState : public BinaryFunctionPass {
   void runOnFunction(BinaryFunction &BF);
 
 private:
-  uint64_t FunctionsModified{0};
   /// Because states are tracked as MCAnnotations on individual instructions,
   /// newly inserted instructions do not have a state associated with them.
   /// New states are "inherited" from the last known state.
diff --git a/bolt/include/bolt/Passes/MarkRAStates.h b/bolt/include/bolt/Passes/MarkRAStates.h
index 8870540aa7e9e..675ab9727142b 100644
--- a/bolt/include/bolt/Passes/MarkRAStates.h
+++ b/bolt/include/bolt/Passes/MarkRAStates.h
@@ -26,9 +26,6 @@ class MarkRAStates : public BinaryFunctionPass {
   /// Pass entry point
   Error runOnFunctions(BinaryContext &BC) override;
   bool runOnFunction(BinaryFunction &BF);
-
-private:
-  uint64_t FunctionsIgnored{0};
 };
 
 } // namespace bolt
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 7b93a0ca2b251..de033736a15ff 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -114,20 +114,23 @@ void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
 }
 
 Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
+  std::atomic<uint64_t> FunctionsModified{0};
   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
-    if (BF.containedNegateRAState() && !BF.isIgnored()) {
-      // We can skip functions which did not include negate-ra-state CFIs. This
-      // includes code using pac-ret hardening as well, if the binary is
-      // compiled with `-fno-exceptions -fno-unwind-tables
-      // -fno-asynchronous-unwind-tables`
-      FunctionsModified++;
-      runOnFunction(BF);
-    }
+    FunctionsModified++;
+    runOnFunction(BF);
+  };
+
+  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
+    // We can skip functions which did not include negate-ra-state CFIs. This
+    // includes code using pac-ret hardening as well, if the binary is
+    // compiled with `-fno-exceptions -fno-unwind-tables
+    // -fno-asynchronous-unwind-tables`
+    return !BF.containedNegateRAState() || BF.isIgnored();
   };
 
   ParallelUtilities::runOnEachFunction(
-      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
-      "InsertNegateRAStatePass");
+      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
+      SkipPredicate, "InsertNegateRAStatePass");
 
   BC.outs() << "BOLT-INFO: rewritten pac-ret DWARF info in "
             << FunctionsModified << " out of " << BC.getBinaryFunctions().size()
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index 57e727e16074d..89a8065a6e3f0 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -119,21 +119,24 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
 }
 
 Error MarkRAStates::runOnFunctions(BinaryContext &BC) {
+  std::atomic<uint64_t> FunctionsIgnored{0};
   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
-    if (BF.containedNegateRAState() && !BF.isIgnored()) {
-      // We can skip functions which did not include negate-ra-state CFIs. This
-      // includes code using pac-ret hardening as well, if the binary is
-      // compiled with `-fno-exceptions -fno-unwind-tables
-      // -fno-asynchronous-unwind-tables`
-      if (!runOnFunction(BF)) {
-        FunctionsIgnored++;
-      }
+    if (!runOnFunction(BF)) {
+      FunctionsIgnored++;
     }
   };
 
+  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
+    // We can skip functions which did not include negate-ra-state CFIs. This
+    // includes code using pac-ret hardening as well, if the binary is
+    // compiled with `-fno-exceptions -fno-unwind-tables
+    // -fno-asynchronous-unwind-tables`
+    return !BF.containedNegateRAState() || BF.isIgnored();
+  };
+
   ParallelUtilities::runOnEachFunction(
-      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
-      "MarkRAStates");
+      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
+      SkipPredicate, "MarkRAStates");
 
   int Total = llvm::count_if(
       BC.getBinaryFunctions(),

>From 112f831035057b374f20dfc8e24407c99fc89c05 Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Wed, 10 Sep 2025 07:09:37 +0000
Subject: [PATCH 24/25] [BOLT] Address review

- improve docs/PacRetDesign.md
- improve pacret-split-funcs test
---
 bolt/docs/PacRetDesign.md              | 146 ++++++++++++++++++-------
 bolt/test/AArch64/pacret-split-funcs.s |  19 ++--
 2 files changed, 117 insertions(+), 48 deletions(-)

diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index b6046d6d70077..1461260949486 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -1,9 +1,14 @@
 # Optimizing binaries with pac-ret hardening
 
-This is a design document about processing the `DW_CFA_AARCH64_negate_ra_state` DWARF instruction in BOLT. As is describes internal design decisions, the intended audience is BOLT developers. The document is an updated version of the [RFC posted on the LLVM Discourse](https://discourse.llvm.org/t/rfc-bolt-aarch64-handle-opnegaterastate-to-enable-optimizing-binaries-with-pac-ret-hardening/86594).
+This is a design document about processing the `DW_CFA_AARCH64_negate_ra_state`
+DWARF instruction in BOLT. As it describes internal design decisions, the
+intended audience is BOLT developers. The document is an updated version of the
+[RFC posted on the LLVM Discourse](https://discourse.llvm.org/t/rfc-bolt-aarch64-handle-opnegaterastate-to-enable-optimizing-binaries-with-pac-ret-hardening/86594).
 
 
-`DW_CFA_AARCH64_negate_ra_state` is also referred to as  `.cfi_negate_ra_state` in assembly, or `OpNegateRAState` is BOLT sources. In this document, I will use **negate-ra-state** as a shorthand.
+`DW_CFA_AARCH64_negate_ra_state` is also referred to as  `.cfi_negate_ra_state`
+in assembly, or `OpNegateRAState` in BOLT sources. In this document, I will use
+**negate-ra-state** as a shorthand.
 
 ## Introduction
 
@@ -13,17 +18,26 @@ Refer to the [pac-ret section of the BOLT-binary-analysis document](BinaryAnalys
 
 ### DW_CFA_AARCH64_negate_ra_state
 
-The negate-ra-state CFI is a vendor-specific Call Frame Instruction defined in the [Arm ABI](https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst#id1).
+The negate-ra-state CFI is a vendor-specific Call Frame Instruction defined in
+the [Arm ABI](https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst#id1).
 
 ```
 The DW_CFA_AARCH64_negate_ra_state operation negates bit[0] of the RA_SIGN_STATE pseudo-register.
 ```
 
-This bit indicates to the unwinder whether the current return address is signed or not (hence the name). The unwinder uses this information to authenticate the pointer, and remove the Pointer Authentication Code (PAC) bits. Incorrect negate-ra-state placement can lead to the unwinder trying to authenticate an unsigned pointer (which segfaults), or skipping authenticating a signed pointer, and trying to access an incorrect location (also leading to a segfault).
+This bit indicates to the unwinder whether the current return address is signed
+or not (hence the name). The unwinder uses this information to authenticate the
+pointer, and remove the Pointer Authentication Code (PAC) bits. Incorrect
+negate-ra-state placement can lead to the unwinder trying to authenticate an
+unsigned pointer (which segfaults), or skipping authenticating a signed pointer,
+and trying to access an incorrect location (also leading to a segfault).
 
-(Note: not *all* unwinders do this. Some use the `xpac` instruction to strip the PAC bits without authenticating the pointer. This is incorrect, as it allows control-flow modification in the case of unwinding.)
+Note: not *all* unwinders do this. Some use the `xpac` instruction to strip the
+PAC bits without authenticating the pointer. This is an incorrect (incomplete)
+implementation, as it allows control-flow modification in the case of unwinding.
 
-There are no DWARF instructions to directly set or clear the RA State. However, two other CFIs can also affect the RA state:
+There are no DWARF instructions to directly set or clear the RA State. However,
+two other CFIs can also affect the RA state:
 - `DW_CFA_remember_state`: this CFI stores register rules onto an implicit stack.
 - `DW_CFA_restore_state`:  this CFI pops rules from this stack.
 
@@ -37,13 +51,25 @@ Example:
 | DW_CFA_AARCH64_negate_ra_state | 1 -> 0                         |
 | DW_CFA_restore_state           | 0 -> 1 (popped from the stack) |
 
-The Arm ABI also defines the DW_CFA_AARCH64_negate_ra_state_with_pc CFI, but it is not widely used, and is [likely to become deprecated](https://github.com/ARM-software/abi-aa/issues/327).
+The Arm ABI also defines the DW_CFA_AARCH64_negate_ra_state_with_pc CFI, but it
+is not widely used, and is [likely to become deprecated](https://github.com/ARM-software/abi-aa/issues/327).
 
 ### Where are these CFIs needed?
 
-In all locations, where two consecutive instructions have different RA state, this need to be indicated to the unwinder. This happens at pointer signing and authenticating. The other case where two consecutive instructions have different RA state, but neither of them is signing or authenticating means that they are not next to each other in control flow. One is part of an execution path with signed RA, the other is part of a path with an unsigned RA.
-
-In the example below, the first BasicBlock ends in a conditional branch, and jumps to two different BasicBlocks, each with their own authentication, and return. The instructions on the border of the second and third BasicBlock have different RA states. The `ret` at the end of the second BasicBlock is in unsigned state. The start of the third BasicBlock is after the `paciasp` in the control flow, but before the authentication. In this case, a negate-ra-state is needed at the end of the second BasicBlock.
+In all locations, where two consecutive instructions have different RA state,
+this needs to be indicated to the unwinder. This happens at pointer signing and
+authenticating. The other case where two consecutive instructions have different
+RA state, but neither of them is signing or authenticating means that they are
+not next to each other in control flow. One is part of an execution path with
+signed RA, the other is part of a path with an unsigned RA.
+
+In the example below, the first BasicBlock ends in a conditional branch, and
+jumps to two different BasicBlocks, each with their own authentication, and
+return. The instructions on the border of the second and third BasicBlock have
+different RA states. The `ret` at the end of the second BasicBlock is in unsigned
+state. The start of the third BasicBlock is after the `paciasp` in the control
+flow, but before the authentication. In this case, a negate-ra-state is needed
+at the end of the second BasicBlock.
 
 ```
         +----------------+
@@ -69,86 +95,132 @@ In the example below, the first BasicBlock ends in a conditional branch, and jum
 ```
 
 > [!important]
-> The unwinder does not follow the control flow graph. It reads unwind information in the layout order.
+> The unwinder does not follow the control flow graph. It reads unwind
+> information in the layout order.
 
-Because these locations are dependent on how the function layout looks, negate-ra-state CFIs will become invalid during BasicBlock reordering.
+Because these locations are dependent on how the function layout looks,
+negate-ra-state CFIs will become invalid during BasicBlock reordering.
 
 ## Solution design
 
 The patch introduces two new passes:
-1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the CFIs in the input binary
-2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the correct places: wherever there is a state change between two consecutive instructions in the layout order.
+1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the CFIs
+    in the input binary
+2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after
+    optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the correct
+    places: wherever there is a state change between two consecutive instructions
+    in the layout order.
 
-To track metadata on individual instructions, the `MCAnnotation` class was extended. These also have helper function in `MCPlusBuilder`.
+To track metadata on individual instructions, the `MCAnnotation` class was
+extended. These also have helper functions in `MCPlusBuilder`.
 
 ### Saving annotations at CFI reading
 
-CFIs are read and added to BinaryFunctions in `CFIReaderWriter::FillCFIInfoFor`. At this point, we add MCAnnotations about negate-ra-state, remember-state and restore-state CFIs to the instructions they refer to. This is to not interfere with the CFI processing that already happens in BOLT (e.g. remember-state and restore-state CFIs are removed in `normalizeCFIState` for reasons unrelated to PAC).
+CFIs are read and added to BinaryFunctions in `CFIReaderWriter::FillCFIInfoFor`.
+At this point, we add MCAnnotations about negate-ra-state, remember-state and
+restore-state CFIs to the instructions they refer to. This is to not interfere
+with the CFI processing that already happens in BOLT (e.g. remember-state and
+restore-state CFIs are removed in `normalizeCFIState` for reasons unrelated to PAC).
 
-As we add the MCAnnotations *to instructions*, we have to account for the case where the function starts with a CFI altering the RA state. If a function starts with a negate-ra-state CFI for example, we cannot save the annotation on the first instruction, because that itself should already be signed. This is why all BinaryFunctions have an `initialRAState` bool. If the `Offset` the CFI refers to is zero, we don't store an annotation, but set the `initialRAState` in `FillCFIInfoFor`. This info is then used in `MarkRAStates`.
+As we add the MCAnnotations *to instructions*, we have to account for the case
+where the function starts with a CFI altering the RA state. If a function starts
+with a negate-ra-state CFI for example, we cannot save the annotation on the
+first instruction, because that itself should already be signed. This is why all
+BinaryFunctions have an `initialRAState` bool. If the `Offset` the CFI refers to
+is zero, we don't store an annotation, but set the `initialRAState` in
+`FillCFIInfoFor`. This information is then used in `MarkRAStates`.
 
 ### Binaries without DWARF info
 
-In some cases, the DWARF tables are stripped from the binary. These programs usually have some other unwind-mechanism. To account for code that uses Pointer Authentication, but does not have DWARF CFIs, the passes only run on functions that had at least one negate-ra-state CFI. This is marked during CFI reading.
+In some cases, the DWARF tables are stripped from the binary. These programs
+usually have some other unwind-mechanism. To account for code that uses Pointer
+Authentication, but does not have DWARF CFIs, the passes only run on functions
+that had at least one negate-ra-state CFI. This information is saved on the
+functions during CFI reading.
 
-This also makes sure that the passes don't run on functions that do not store the return address to the stack, and don't need Pointer Authentication, saving on runtime overhead.
+This also makes sure that the passes don't run on functions that do not store
+the return address to the stack, and don't need Pointer Authentication, saving
+on runtime overhead.
 
 In summary:
 - pointer auth is not used: no change, the new passes do not run.
-- pointer auth is used, but DWARF info is stripped: no change, the new passes do not run.
-- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the negate-ra-state CFI.
+- pointer auth is used, but DWARF info is stripped: no change, the new passes
+  do not run.
+- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the
+  negate-ra-state CFI.
 
 ### MarkRAStates Pass
 
 This pass runs before optimizations reorder anything.
 
-It processes MCAnnotations generated during the CFI reading stage to check if instructions have either of the three CFIs that can modify RA state:
+It processes MCAnnotations generated during the CFI reading stage to check if
+instructions have either of the three CFIs that can modify RA state:
 - negate-ra-state
 - remember-state
 - restore-state
 
-Then it adds new MCAnnotations to each instruction, indicating their RA state. Those annotations are:
+Then it adds new MCAnnotations to each instruction, indicating their RA state.
+Those annotations are:
 - Signed
 - Unsigned
 
-Below is a simple example, that shows the two different type of annotations: what we have before the pass, and after it.
+Below is a simple example, that shows the two different type of annotations:
+what we have before the pass, and after it.
 
-| Instruction                 | Before          |  After   |
-| --------------------------- | --------------- | -------- |
-| paciasp                     | negate-ra-state | unsigned |
+| Instruction                   | Before          |  After   |
+| ----------------------------- | --------------- | -------- |
+| paciasp                       | negate-ra-state | unsigned |
 | stp	x29, x30, [sp, #-0x10]! |                 | signed   |
 | mov	x29, sp                 |                 | signed   |
 | ldp	x29, x30, [sp], #0x10   |                 | signed   |
-| autiasp                     | negate-ra-state | signed   |
-| ret                         |                 | unsigned |
+| autiasp                       | negate-ra-state | signed   |
+| ret                           |                 | unsigned |
 
 ##### Error handling in MarkRAState Pass:
 
-Whenever the MarkRAStates pass finds inconsistencies in the current BinaryFunction, it ignores it by calling `BF.setIgnored()`. This prevents BOLT from optimizing that function, but it will still be emitted as part of the original section (`.bolt.org.text`) in its original form.
+Whenever the MarkRAStates pass finds inconsistencies in the current
+BinaryFunction, it ignores it by calling `BF.setIgnored()`. This prevents BOLT
+from optimizing that function, but it will still be emitted as part of the
+original section (`.bolt.org.text`) in its original form.
 
 The inconsistencies are as follows:
 - finding a `pac*` instruction when already in signed state
 - finding an `aut*` instruction when already in unsigned state
 - finding `pac*` and `aut*` instructions without `.cfi_negate_ra_state`.
 
-Users will be informed about the number of ignored function in the pass, and the exact functions ignored.
+Users will be informed about the number of ignored functions in the pass, the
+exact functions ignored, and the found inconsistency.
 
 ### InsertNegateRAStatePass
 
-This pass runs after the optimizations are done. In essence, it does the _inverse_ of MarkRAState pass:
+This pass runs after the optimizations are done. In essence, it does the _inverse_
+of MarkRAState pass:
 1. it reads the RA state annotations attached to the instructions, and
-2. whenever the state changes, it adds a PseudoInstruction that holds an OpNegateRAState CFI.
+2. whenever the state changes, it adds a PseudoInstruction that holds an
+   OpNegateRAState CFI.
 
 ##### Covering newly generated instructions:
 
-Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have to know what RA state these have.
+Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
+to know what RA state these have.
 
-The current solution has the `inferUnknownStates` function to cover these, using a fairly simple strategy: unknown states are inherited from last known state. Testing so far has shown this implementation is sufficient, but to prove correctness, we would need to examine all passes that insert new instructions.
+The current solution has the `inferUnknownStates` function to cover these, using
+a fairly simple strategy: unknown states inherit the last known state. Testing so
+far has shown that this implementation is sufficient.
 
 ### Optimizations requiring special attention
 
-Marking states before optimizations assure that instructions can be moved around freely. The only special case is function splitting. When a function is split, the split part becomes a new function in the emitted binary. For unwinding to work, it needs to "replay" all CFI that lead up to the split point. BOLT does this for other CFIs. As negate-ra-state is not read (only stored as an Annotation), we have to do this "manually" in InsertNegateRAStatePass. Here, if the split part starts with an instruction that has Signed RA state, we add a negate-ra-state CFI to indicate this.
+Marking states before optimizations ensure that instructions can be moved around
+freely. The only special case is function splitting. When a function is split,
+the split part becomes a new function in the emitted binary. For unwinding to
+work, it needs to "replay" all CFIs that lead up to the split point. BOLT does
+this for other CFIs. As negate-ra-state is not read (only stored as an Annotation),
+we have to do this manually in InsertNegateRAStatePass. Here, if the split part
+starts with an instruction that has Signed RA state, we add a negate-ra-state CFI
+to indicate this.
 
 ## Option to disallow the feature
 
-To aid debugging, we added the `--disallow-pacret` flag. If the flag is used, and a function `containedNegateRAState()` after `FillCFIInfoFor()`, BOLT exits with an error. With this flag, the feature is on by default.
+To aid debugging, we added the `--disallow-pacret` flag. If the flag is used,
+and a function `containedNegateRAState()` after `FillCFIInfoFor()`, BOLT exits
+with an error. With this flag, the feature is on by default.
diff --git a/bolt/test/AArch64/pacret-split-funcs.s b/bolt/test/AArch64/pacret-split-funcs.s
index a20d2a3688146..27b3471045523 100644
--- a/bolt/test/AArch64/pacret-split-funcs.s
+++ b/bolt/test/AArch64/pacret-split-funcs.s
@@ -1,10 +1,10 @@
 # Checking that we generate an OpNegateRAState CFI after the split point,
 # when splitting a region with signed RA state.
+# We split at the fallthrough label.
 
 # REQUIRES: system-linux
 
-# RUN: %clang %cflags -o %t %s
-# RUN: %clang %s %cflags -Wl,-q -o %t
+# RUN: %clang %s %cflags -march=armv8.3-a -Wl,-q -o %t
 # RUN: link_fdata --no-lbr %s %t %t.fdata
 # RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions \
 # RUN: --print-only foo --print-split --print-all 2>&1 | FileCheck %s
@@ -19,14 +19,11 @@
 # CHECK: -------   HOT-COLD SPLIT POINT   -------
 
 # CHECK:         OpNegateRAState
+# CHECK-NEXT:    mov x0, #0x1
 # CHECK-NEXT:    autiasp
 # CHECK-NEXT:    OpNegateRAState
 # CHECK-NEXT:    ret
 
-# CHECK:         autiasp
-# CHECK-NEXT:    OpNegateRAState
-# CHECK-NEXT:    ret
-
 # End of the insert-negate-ra-state-pass logs
 # CHECK: Binary Function "foo" after finalize-functions
 
@@ -41,15 +38,15 @@ foo:
     .cfi_negate_ra_state     // indicating that paciasp changed the RA state to signed
     cmp x0, #0
     b.eq .Lcold_bb1
-.Lfallthrough:
+.Lfallthrough:               // split point
+    mov x0, #1
     autiasp
     .cfi_negate_ra_state     // indicating that autiasp changed the RA state to unsigned
     ret
+.Lcold_bb1:                  // Instructions below are not important, they are just here so the cold block is not empty.
     .cfi_negate_ra_state     // ret has unsigned RA state, but the next inst (autiasp) has signed RA state
-.Lcold_bb1:                  // split point
-    autiasp
-    .cfi_negate_ra_state     // indicating that autiasp changed the RA state to unsigned
-    ret
+    mov x0, #2
+    retaa
 .cfi_endproc
   .size foo, .-foo
 

>From e00859b405777bfadd5dfcdba66429e6d40d9b8d Mon Sep 17 00:00:00 2001
From: Gergely Balint <gergely.balint at arm.com>
Date: Wed, 10 Sep 2025 13:35:22 +0000
Subject: [PATCH 25/25] [BOLT] Change scheduling policy to SP_INT_LINEAR

- count the total applicable functions, and only run the passes if it's
  more than 0. This way we do not get a warning on the scheduling policy
  change.
---
 bolt/lib/Passes/InsertNegateRAStatePass.cpp | 29 ++++++++++++++-------
 bolt/lib/Passes/MarkRAStates.cpp            | 21 +++++++++------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index de033736a15ff..355af4a6948a1 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -128,15 +128,26 @@ Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
     return !BF.containedNegateRAState() || BF.isIgnored();
   };
 
-  ParallelUtilities::runOnEachFunction(
-      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
-      SkipPredicate, "InsertNegateRAStatePass");
-
-  BC.outs() << "BOLT-INFO: rewritten pac-ret DWARF info in "
-            << FunctionsModified << " out of " << BC.getBinaryFunctions().size()
-            << " functions "
-            << format("(%.2lf%%).\n", (100.0 * FunctionsModified) /
-                                          BC.getBinaryFunctions().size());
+  int Total = llvm::count_if(
+      BC.getBinaryFunctions(),
+      [&](std::pair<const unsigned long, BinaryFunction> &P) {
+        return P.second.containedNegateRAState() && !P.second.isIgnored();
+      });
+
+  // If we attempt to run the pass, but SkipPredicate skips all functions,
+  // we get a warning that the SchedulingPolicy is changed to trivial.
+  // This would break bolt/test/AArch64/unmarked-data.test.
+  if (Total != 0) {
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+        SkipPredicate, "InsertNegateRAStatePass");
+
+    BC.outs() << "BOLT-INFO: rewritten pac-ret DWARF info in "
+              << FunctionsModified << " out of "
+              << BC.getBinaryFunctions().size() << " functions "
+              << format("(%.2lf%%).\n", (100.0 * FunctionsModified) /
+                                            BC.getBinaryFunctions().size());
+  }
   return Error::success();
 }
 
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index 89a8065a6e3f0..f9fe86260e3b1 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -134,19 +134,24 @@ Error MarkRAStates::runOnFunctions(BinaryContext &BC) {
     return !BF.containedNegateRAState() || BF.isIgnored();
   };
 
-  ParallelUtilities::runOnEachFunction(
-      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
-      SkipPredicate, "MarkRAStates");
-
   int Total = llvm::count_if(
       BC.getBinaryFunctions(),
       [&](std::pair<const unsigned long, BinaryFunction> &P) {
         return P.second.containedNegateRAState() && !P.second.isIgnored();
       });
-  BC.outs() << "BOLT-INFO: MarkRAStates ran on " << Total
-            << " functions. Ignored " << FunctionsIgnored << " functions "
-            << format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
-            << " because of CFI inconsistencies.\n";
+
+  // If we attempt to run the pass, but SkipPredicate skips all functions,
+  // we get a warning that the SchedulingPolicy is changed to trivial.
+  // This would break bolt/test/AArch64/unmarked-data.test.
+  if (Total != 0) {
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+        SkipPredicate, "MarkRAStates");
+    BC.outs() << "BOLT-INFO: MarkRAStates ran on " << Total
+              << " functions. Ignored " << FunctionsIgnored << " functions "
+              << format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+              << " because of CFI inconsistencies.\n";
+  }
 
   return Error::success();
 }



More information about the llvm-commits mailing list