[llvm] [BOLT] Refactor MCInstReference and move it to Core (NFC) (PR #155846)

Anatoly Trosinenko via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 28 07:56:18 PDT 2025


https://github.com/atrosinenko created https://github.com/llvm/llvm-project/pull/155846

Refactor MCInstReference class and move it from PAuth gadget scanner to Core.

MCInstReference is a class representing a constant reference to an instruction inside a parent entity - either inside a basic block (which has a reference to its parent function) or directly inside a function (when CFG information is not available).

This patch reapplies #138655 with `MCInstReference::getAddress()` function introduced and a fix for iterator usage.

>From 4a0d3e8632ee2e77cb64c0fffbbde2e6bcd15c54 Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko <atrosinenko at accesssoftek.com>
Date: Wed, 27 Aug 2025 14:19:30 +0300
Subject: [PATCH 1/3] [BOLT] Refactor MCInstReference and move it to Core (NFC)
 (#138655)

Refactor MCInstReference class and move it from PAuth gadget scanner to
Core.

MCInstReference is a class representing a constant reference to an
instruction inside a parent entity - either inside a basic block (which
has a reference to its parent function) or directly inside a function
(when CFG information is not available).
---
 bolt/include/bolt/Core/MCInstUtils.h          | 155 +++++++++++++++
 bolt/include/bolt/Passes/PAuthGadgetScanner.h | 176 +-----------------
 bolt/lib/Core/CMakeLists.txt                  |   1 +
 bolt/lib/Core/MCInstUtils.cpp                 |  56 ++++++
 bolt/lib/Passes/PAuthGadgetScanner.cpp        | 102 +++++-----
 5 files changed, 256 insertions(+), 234 deletions(-)
 create mode 100644 bolt/include/bolt/Core/MCInstUtils.h
 create mode 100644 bolt/lib/Core/MCInstUtils.cpp

diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h
new file mode 100644
index 0000000000000..2e93dfaf4c275
--- /dev/null
+++ b/bolt/include/bolt/Core/MCInstUtils.h
@@ -0,0 +1,155 @@
+//===- bolt/Core/MCInstUtils.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_CORE_MCINSTUTILS_H
+#define BOLT_CORE_MCINSTUTILS_H
+
+#include "bolt/Core/BinaryBasicBlock.h"
+#include <map>
+#include <variant>
+
+namespace llvm {
+namespace bolt {
+
+class BinaryFunction;
+
+/// MCInstReference represents a reference to a constant MCInst as stored either
+/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock
+/// (after a CFG is created).
+class MCInstReference {
+  using nocfg_const_iterator = std::map<uint32_t, MCInst>::const_iterator;
+
+  // Two cases are possible:
+  // * functions with CFG reconstructed - a function stores a collection of
+  //   basic blocks, each basic block stores a contiguous vector of MCInst
+  // * functions without CFG - there are no basic blocks created,
+  //   the instructions are directly stored in std::map in BinaryFunction
+  //
+  // In both cases, the direct parent of MCInst is stored together with an
+  // iterator pointing to the instruction.
+
+  // Helper struct: CFG is available, the direct parent is a basic block,
+  // iterator's type is `MCInst *`.
+  struct RefInBB {
+    RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst)
+        : BB(BB), It(Inst) {}
+    RefInBB(const RefInBB &Other) = default;
+    RefInBB &operator=(const RefInBB &Other) = default;
+
+    const BinaryBasicBlock *BB;
+    BinaryBasicBlock::const_iterator It;
+
+    bool operator==(const RefInBB &Other) const {
+      return BB == Other.BB && It == Other.It;
+    }
+  };
+
+  // Helper struct: CFG is *not* available, the direct parent is a function,
+  // iterator's type is std::map<uint32_t, MCInst>::iterator (the mapped value
+  // is an instruction's offset).
+  struct RefInBF {
+    RefInBF(const BinaryFunction *BF, nocfg_const_iterator It)
+        : BF(BF), It(It) {}
+    RefInBF(const RefInBF &Other) = default;
+    RefInBF &operator=(const RefInBF &Other) = default;
+
+    const BinaryFunction *BF;
+    nocfg_const_iterator It;
+
+    bool operator==(const RefInBF &Other) const {
+      return BF == Other.BF && It->first == Other.It->first;
+    }
+  };
+
+  std::variant<RefInBB, RefInBF> Reference;
+
+  // Utility methods to be used like this:
+  //
+  //     if (auto *Ref = tryGetRefInBB())
+  //       return Ref->doSomething(...);
+  //     return getRefInBF().doSomethingElse(...);
+  const RefInBB *tryGetRefInBB() const {
+    assert(std::get_if<RefInBB>(&Reference) ||
+           std::get_if<RefInBF>(&Reference));
+    return std::get_if<RefInBB>(&Reference);
+  }
+  const RefInBF &getRefInBF() const {
+    assert(std::get_if<RefInBF>(&Reference));
+    return *std::get_if<RefInBF>(&Reference);
+  }
+
+public:
+  /// Constructs an empty reference.
+  MCInstReference() : Reference(RefInBB(nullptr, nullptr)) {}
+  /// Constructs a reference to the instruction inside the basic block.
+  MCInstReference(const BinaryBasicBlock *BB, const MCInst *Inst)
+      : Reference(RefInBB(BB, Inst)) {
+    assert(BB && Inst && "Neither BB nor Inst should be nullptr");
+  }
+  /// Constructs a reference to the instruction inside the basic block.
+  MCInstReference(const BinaryBasicBlock *BB, unsigned Index)
+      : Reference(RefInBB(BB, &BB->getInstructionAtIndex(Index))) {
+    assert(BB && "Basic block should not be nullptr");
+  }
+  /// Constructs a reference to the instruction inside the function without
+  /// CFG information.
+  MCInstReference(const BinaryFunction *BF, nocfg_const_iterator It)
+      : Reference(RefInBF(BF, It)) {
+    assert(BF && "Function should not be nullptr");
+  }
+
+  /// Locates an instruction inside a function and returns a reference.
+  static MCInstReference get(const MCInst *Inst, const BinaryFunction &BF);
+
+  bool operator==(const MCInstReference &Other) const {
+    return Reference == Other.Reference;
+  }
+
+  const MCInst &getMCInst() const {
+    assert(!empty() && "Empty reference");
+    if (auto *Ref = tryGetRefInBB())
+      return *Ref->It;
+    return getRefInBF().It->second;
+  }
+
+  operator const MCInst &() const { return getMCInst(); }
+
+  bool empty() const {
+    if (auto *Ref = tryGetRefInBB())
+      return Ref->BB == nullptr;
+    return getRefInBF().BF == nullptr;
+  }
+
+  bool hasCFG() const { return !empty() && tryGetRefInBB() != nullptr; }
+
+  const BinaryFunction *getFunction() const {
+    assert(!empty() && "Empty reference");
+    if (auto *Ref = tryGetRefInBB())
+      return Ref->BB->getFunction();
+    return getRefInBF().BF;
+  }
+
+  const BinaryBasicBlock *getBasicBlock() const {
+    assert(!empty() && "Empty reference");
+    if (auto *Ref = tryGetRefInBB())
+      return Ref->BB;
+    return nullptr;
+  }
+
+  raw_ostream &print(raw_ostream &OS) const;
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS,
+                                      const MCInstReference &Ref) {
+  return Ref.print(OS);
+}
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h
index 721fd664a3253..cb865a725d72a 100644
--- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h
+++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h
@@ -11,187 +11,13 @@
 
 #include "bolt/Core/BinaryContext.h"
 #include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/MCInstUtils.h"
 #include "bolt/Passes/BinaryPasses.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 
 namespace llvm {
 namespace bolt {
-
-/// @brief  MCInstReference represents a reference to an MCInst as stored either
-/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock
-/// (after a CFG is created). It aims to store the necessary information to be
-/// able to find the specific MCInst in either the BinaryFunction or
-/// BinaryBasicBlock data structures later, so that e.g. the InputAddress of
-/// the corresponding instruction can be computed.
-
-struct MCInstInBBReference {
-  BinaryBasicBlock *BB;
-  int64_t BBIndex;
-  MCInstInBBReference(BinaryBasicBlock *BB, int64_t BBIndex)
-      : BB(BB), BBIndex(BBIndex) {}
-  MCInstInBBReference() : BB(nullptr), BBIndex(0) {}
-  static MCInstInBBReference get(const MCInst *Inst, BinaryFunction &BF) {
-    for (BinaryBasicBlock &BB : BF)
-      for (size_t I = 0; I < BB.size(); ++I)
-        if (Inst == &BB.getInstructionAtIndex(I))
-          return MCInstInBBReference(&BB, I);
-    return {};
-  }
-  bool operator==(const MCInstInBBReference &RHS) const {
-    return BB == RHS.BB && BBIndex == RHS.BBIndex;
-  }
-  bool operator<(const MCInstInBBReference &RHS) const {
-    return std::tie(BB, BBIndex) < std::tie(RHS.BB, RHS.BBIndex);
-  }
-  operator MCInst &() const {
-    assert(BB != nullptr);
-    return BB->getInstructionAtIndex(BBIndex);
-  }
-  uint64_t getAddress() const {
-    // 4 bytes per instruction on AArch64.
-    // FIXME: the assumption of 4 byte per instruction needs to be fixed before
-    // this method gets used on any non-AArch64 binaries (but should be fine for
-    // pac-ret analysis, as that is an AArch64-specific feature).
-    return BB->getFunction()->getAddress() + BB->getOffset() + BBIndex * 4;
-  }
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &);
-
-struct MCInstInBFReference {
-  BinaryFunction *BF;
-  uint64_t Offset;
-  MCInstInBFReference(BinaryFunction *BF, uint64_t Offset)
-      : BF(BF), Offset(Offset) {}
-
-  static MCInstInBFReference get(const MCInst *Inst, BinaryFunction &BF) {
-    for (auto &I : BF.instrs())
-      if (Inst == &I.second)
-        return MCInstInBFReference(&BF, I.first);
-    return {};
-  }
-
-  MCInstInBFReference() : BF(nullptr), Offset(0) {}
-  bool operator==(const MCInstInBFReference &RHS) const {
-    return BF == RHS.BF && Offset == RHS.Offset;
-  }
-  bool operator<(const MCInstInBFReference &RHS) const {
-    return std::tie(BF, Offset) < std::tie(RHS.BF, RHS.Offset);
-  }
-  operator MCInst &() const {
-    assert(BF != nullptr);
-    return *BF->getInstructionAtOffset(Offset);
-  }
-
-  uint64_t getOffset() const { return Offset; }
-
-  uint64_t getAddress() const { return BF->getAddress() + getOffset(); }
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &);
-
-struct MCInstReference {
-  enum Kind { FunctionParent, BasicBlockParent };
-  Kind ParentKind;
-  union U {
-    MCInstInBBReference BBRef;
-    MCInstInBFReference BFRef;
-    U(MCInstInBBReference BBRef) : BBRef(BBRef) {}
-    U(MCInstInBFReference BFRef) : BFRef(BFRef) {}
-  } U;
-  MCInstReference(MCInstInBBReference BBRef)
-      : ParentKind(BasicBlockParent), U(BBRef) {}
-  MCInstReference(MCInstInBFReference BFRef)
-      : ParentKind(FunctionParent), U(BFRef) {}
-  MCInstReference(BinaryBasicBlock *BB, int64_t BBIndex)
-      : MCInstReference(MCInstInBBReference(BB, BBIndex)) {}
-  MCInstReference(BinaryFunction *BF, uint32_t Offset)
-      : MCInstReference(MCInstInBFReference(BF, Offset)) {}
-
-  static MCInstReference get(const MCInst *Inst, BinaryFunction &BF) {
-    if (BF.hasCFG())
-      return MCInstInBBReference::get(Inst, BF);
-    return MCInstInBFReference::get(Inst, BF);
-  }
-
-  bool operator<(const MCInstReference &RHS) const {
-    if (ParentKind != RHS.ParentKind)
-      return ParentKind < RHS.ParentKind;
-    switch (ParentKind) {
-    case BasicBlockParent:
-      return U.BBRef < RHS.U.BBRef;
-    case FunctionParent:
-      return U.BFRef < RHS.U.BFRef;
-    }
-    llvm_unreachable("");
-  }
-
-  bool operator==(const MCInstReference &RHS) const {
-    if (ParentKind != RHS.ParentKind)
-      return false;
-    switch (ParentKind) {
-    case BasicBlockParent:
-      return U.BBRef == RHS.U.BBRef;
-    case FunctionParent:
-      return U.BFRef == RHS.U.BFRef;
-    }
-    llvm_unreachable("");
-  }
-
-  operator MCInst &() const {
-    switch (ParentKind) {
-    case BasicBlockParent:
-      return U.BBRef;
-    case FunctionParent:
-      return U.BFRef;
-    }
-    llvm_unreachable("");
-  }
-
-  operator bool() const {
-    switch (ParentKind) {
-    case BasicBlockParent:
-      return U.BBRef.BB != nullptr;
-    case FunctionParent:
-      return U.BFRef.BF != nullptr;
-    }
-    llvm_unreachable("");
-  }
-
-  uint64_t getAddress() const {
-    switch (ParentKind) {
-    case BasicBlockParent:
-      return U.BBRef.getAddress();
-    case FunctionParent:
-      return U.BFRef.getAddress();
-    }
-    llvm_unreachable("");
-  }
-
-  BinaryFunction *getFunction() const {
-    switch (ParentKind) {
-    case FunctionParent:
-      return U.BFRef.BF;
-    case BasicBlockParent:
-      return U.BBRef.BB->getFunction();
-    }
-    llvm_unreachable("");
-  }
-
-  BinaryBasicBlock *getBasicBlock() const {
-    switch (ParentKind) {
-    case FunctionParent:
-      return nullptr;
-    case BasicBlockParent:
-      return U.BBRef.BB;
-    }
-    llvm_unreachable("");
-  }
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &);
-
 namespace PAuthGadgetScanner {
 
 // The report classes are designed to be used in an immutable manner.
diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt
index fc72dc023c590..58cfcab370f16 100644
--- a/bolt/lib/Core/CMakeLists.txt
+++ b/bolt/lib/Core/CMakeLists.txt
@@ -32,6 +32,7 @@ add_llvm_library(LLVMBOLTCore
   GDBIndex.cpp
   HashUtilities.cpp
   JumpTable.cpp
+  MCInstUtils.cpp
   MCPlusBuilder.cpp
   ParallelUtilities.cpp
   Relocation.cpp
diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp
new file mode 100644
index 0000000000000..3cdb9673d4dc0
--- /dev/null
+++ b/bolt/lib/Core/MCInstUtils.cpp
@@ -0,0 +1,56 @@
+//===- bolt/Passes/MCInstUtils.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/MCInstUtils.h"
+#include "bolt/Core/BinaryBasicBlock.h"
+#include "bolt/Core/BinaryFunction.h"
+
+#include <iterator>
+
+using namespace llvm;
+using namespace llvm::bolt;
+
+MCInstReference MCInstReference::get(const MCInst *Inst,
+                                     const BinaryFunction &BF) {
+  if (BF.hasCFG()) {
+    for (BinaryBasicBlock &BB : BF)
+      for (MCInst &MI : BB)
+        if (&MI == Inst)
+          return MCInstReference(&BB, Inst);
+    return {};
+  }
+
+  for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) {
+    if (&I->second == Inst)
+      return MCInstReference(&BF, I);
+  }
+  return {};
+}
+
+raw_ostream &MCInstReference::print(raw_ostream &OS) const {
+  if (const RefInBB *Ref = tryGetRefInBB()) {
+    OS << "MCInstBBRef<";
+    if (Ref->BB == nullptr) {
+      OS << "BB:(null)";
+    } else {
+      unsigned IndexInBB = std::distance(Ref->BB->begin(), Ref->It);
+      OS << "BB:" << Ref->BB->getName() << ":" << IndexInBB;
+    }
+    OS << ">";
+    return OS;
+  }
+
+  const RefInBF &Ref = getRefInBF();
+  OS << "MCInstBFRef<";
+  if (Ref.BF == nullptr)
+    OS << "BF:(null)";
+  else
+    OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.It->first;
+  OS << ">";
+  return OS;
+}
diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 65c84ebc8c4f4..5d884e2d37354 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -24,39 +24,6 @@
 
 namespace llvm {
 namespace bolt {
-
-raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &Ref) {
-  OS << "MCInstBBRef<";
-  if (Ref.BB == nullptr)
-    OS << "BB:(null)";
-  else
-    OS << "BB:" << Ref.BB->getName() << ":" << Ref.BBIndex;
-  OS << ">";
-  return OS;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &Ref) {
-  OS << "MCInstBFRef<";
-  if (Ref.BF == nullptr)
-    OS << "BF:(null)";
-  else
-    OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.getOffset();
-  OS << ">";
-  return OS;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &Ref) {
-  switch (Ref.ParentKind) {
-  case MCInstReference::BasicBlockParent:
-    OS << Ref.U.BBRef;
-    return OS;
-  case MCInstReference::FunctionParent:
-    OS << Ref.U.BFRef;
-    return OS;
-  }
-  llvm_unreachable("");
-}
-
 namespace PAuthGadgetScanner {
 
 [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label,
@@ -91,10 +58,10 @@ template <typename T> static void iterateOverInstrs(BinaryFunction &BF, T Fn) {
   if (BF.hasCFG()) {
     for (BinaryBasicBlock &BB : BF)
       for (int64_t I = 0, E = BB.size(); I < E; ++I)
-        Fn(MCInstInBBReference(&BB, I));
+        Fn(MCInstReference(&BB, I));
   } else {
-    for (auto I : BF.instrs())
-      Fn(MCInstInBFReference(&BF, I.first));
+    for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I)
+      Fn(MCInstReference(&BF, I));
   }
 }
 
@@ -566,7 +533,7 @@ class SrcSafetyAnalysis {
     std::vector<MCInstReference> Result;
     for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) {
       MCInstReference Ref = MCInstReference::get(Inst, BF);
-      assert(Ref && "Expected Inst to be found");
+      assert(!Ref.empty() && "Expected Inst to be found");
       Result.push_back(Ref);
     }
     return Result;
@@ -1138,7 +1105,7 @@ class DstSafetyAnalysis {
     std::vector<MCInstReference> Result;
     for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) {
       MCInstReference Ref = MCInstReference::get(Inst, BF);
-      assert(Ref && "Expected Inst to be found");
+      assert(!Ref.empty() && "Expected Inst to be found");
       Result.push_back(Ref);
     }
     return Result;
@@ -1345,8 +1312,7 @@ static bool shouldAnalyzeTailCallInst(const BinaryContext &BC,
   // (such as isBranch at the time of writing this comment), some don't (such
   // as isCall). For that reason, call MCInstrDesc's methods explicitly when
   // it is important.
-  const MCInstrDesc &Desc =
-      BC.MII->get(static_cast<const MCInst &>(Inst).getOpcode());
+  const MCInstrDesc &Desc = BC.MII->get(Inst.getMCInst().getOpcode());
   // Tail call should be a branch (but not necessarily an indirect one).
   if (!Desc.isBranch())
     return false;
@@ -1541,7 +1507,7 @@ void FunctionAnalysisContext::findUnsafeUses(
       // This is printed as "[message] in function [name], basic block ...,
       // at address ..." when the issue is reported to the user.
       Reports.push_back(make_generic_report(
-          MCInstReference::get(FirstInst, BF),
+          MCInstReference(&BB, FirstInst),
           "Warning: possibly imprecise CFG, the analysis quality may be "
           "degraded in this function. According to BOLT, unreachable code is "
           "found" /* in function [name]... */));
@@ -1705,31 +1671,49 @@ void Analysis::runOnFunction(BinaryFunction &BF,
   }
 }
 
+// Compute the instruction address for printing (may be slow).
+static uint64_t getAddress(const MCInstReference &Inst) {
+  const BinaryFunction *BF = Inst.getFunction();
+
+  if (Inst.hasCFG()) {
+    const BinaryBasicBlock *BB = Inst.getBasicBlock();
+
+    auto It = static_cast<BinaryBasicBlock::const_iterator>(&Inst.getMCInst());
+    unsigned IndexInBB = std::distance(BB->begin(), It);
+
+    // FIXME: this assumes all instructions are 4 bytes in size. This is true
+    // for AArch64, but it might be good to extract this function so it can be
+    // used elsewhere and for other targets too.
+    return BF->getAddress() + BB->getOffset() + IndexInBB * 4;
+  }
+
+  for (auto I = BF->instrs().begin(), E = BF->instrs().end(); I != E; ++I) {
+    if (&I->second == &Inst.getMCInst())
+      return BF->getAddress() + I->first;
+  }
+  llvm_unreachable("Instruction not found in function");
+}
+
 static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB,
                     size_t StartIndex = 0, size_t EndIndex = -1) {
   if (EndIndex == (size_t)-1)
     EndIndex = BB->size() - 1;
   const BinaryFunction *BF = BB->getFunction();
   for (unsigned I = StartIndex; I <= EndIndex; ++I) {
-    // FIXME: this assumes all instructions are 4 bytes in size. This is true
-    // for AArch64, but it might be good to extract this function so it can be
-    // used elsewhere and for other targets too.
-    uint64_t Address = BB->getOffset() + BF->getAddress() + 4 * I;
-    const MCInst &Inst = BB->getInstructionAtIndex(I);
+    MCInstReference Inst(BB, I);
     if (BC.MIB->isCFI(Inst))
       continue;
-    BC.printInstruction(outs(), Inst, Address, BF);
+    BC.printInstruction(outs(), Inst, getAddress(Inst), BF);
   }
 }
 
 static void reportFoundGadgetInSingleBBSingleRelatedInst(
     raw_ostream &OS, const BinaryContext &BC, const MCInstReference RelatedInst,
     const MCInstReference Location) {
-  BinaryBasicBlock *BB = Location.getBasicBlock();
-  assert(RelatedInst.ParentKind == MCInstReference::BasicBlockParent);
-  assert(Location.ParentKind == MCInstReference::BasicBlockParent);
-  MCInstInBBReference RelatedInstBB = RelatedInst.U.BBRef;
-  if (BB == RelatedInstBB.BB) {
+  const BinaryBasicBlock *BB = Location.getBasicBlock();
+  assert(RelatedInst.hasCFG());
+  assert(Location.hasCFG());
+  if (BB == RelatedInst.getBasicBlock()) {
     OS << "  This happens in the following basic block:\n";
     printBB(BC, BB);
   }
@@ -1737,16 +1721,16 @@ static void reportFoundGadgetInSingleBBSingleRelatedInst(
 
 void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC,
                                 StringRef IssueKind) const {
-  BinaryFunction *BF = Location.getFunction();
-  BinaryBasicBlock *BB = Location.getBasicBlock();
+  const BinaryBasicBlock *BB = Location.getBasicBlock();
+  const BinaryFunction *BF = Location.getFunction();
 
   OS << "\nGS-PAUTH: " << IssueKind;
   OS << " in function " << BF->getPrintName();
   if (BB)
     OS << ", basic block " << BB->getName();
-  OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n";
+  OS << ", at address " << llvm::format("%x", getAddress(Location)) << "\n";
   OS << "  The instruction is ";
-  BC.printInstruction(OS, Location, Location.getAddress(), BF);
+  BC.printInstruction(OS, Location, getAddress(Location), BF);
 }
 
 void GadgetDiagnostic::generateReport(raw_ostream &OS,
@@ -1762,19 +1746,19 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location,
   // Sort by address to ensure output is deterministic.
   SmallVector<MCInstReference> RI(RelatedInstrs);
   llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) {
-    return A.getAddress() < B.getAddress();
+    return getAddress(A) < getAddress(B);
   });
   for (unsigned I = 0; I < RI.size(); ++I) {
     MCInstReference InstRef = RI[I];
     OS << "  " << (I + 1) << ". ";
-    BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF);
+    BC.printInstruction(OS, InstRef, getAddress(InstRef), &BF);
   };
   if (RelatedInstrs.size() == 1) {
     const MCInstReference RelatedInst = RelatedInstrs[0];
     // Printing the details for the MCInstReference::FunctionParent case
     // is not implemented not to overcomplicate the code, as most functions
     // are expected to have CFG information.
-    if (RelatedInst.ParentKind == MCInstReference::BasicBlockParent)
+    if (RelatedInst.hasCFG())
       reportFoundGadgetInSingleBBSingleRelatedInst(OS, BC, RelatedInst,
                                                    Location);
   }

>From eb32eb8a1c3e6163d4efa606ce8bdd96dc73dd5a Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko <atrosinenko at accesssoftek.com>
Date: Thu, 28 Aug 2025 15:46:05 +0300
Subject: [PATCH 2/3] Move public members to the top of MCInstReference, make
 nocfg_const_iterator public

---
 bolt/include/bolt/Core/MCInstUtils.h | 121 ++++++++++++++-------------
 1 file changed, 61 insertions(+), 60 deletions(-)

diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h
index 2e93dfaf4c275..82660573afaf3 100644
--- a/bolt/include/bolt/Core/MCInstUtils.h
+++ b/bolt/include/bolt/Core/MCInstUtils.h
@@ -22,68 +22,9 @@ class BinaryFunction;
 /// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock
 /// (after a CFG is created).
 class MCInstReference {
+public:
   using nocfg_const_iterator = std::map<uint32_t, MCInst>::const_iterator;
 
-  // Two cases are possible:
-  // * functions with CFG reconstructed - a function stores a collection of
-  //   basic blocks, each basic block stores a contiguous vector of MCInst
-  // * functions without CFG - there are no basic blocks created,
-  //   the instructions are directly stored in std::map in BinaryFunction
-  //
-  // In both cases, the direct parent of MCInst is stored together with an
-  // iterator pointing to the instruction.
-
-  // Helper struct: CFG is available, the direct parent is a basic block,
-  // iterator's type is `MCInst *`.
-  struct RefInBB {
-    RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst)
-        : BB(BB), It(Inst) {}
-    RefInBB(const RefInBB &Other) = default;
-    RefInBB &operator=(const RefInBB &Other) = default;
-
-    const BinaryBasicBlock *BB;
-    BinaryBasicBlock::const_iterator It;
-
-    bool operator==(const RefInBB &Other) const {
-      return BB == Other.BB && It == Other.It;
-    }
-  };
-
-  // Helper struct: CFG is *not* available, the direct parent is a function,
-  // iterator's type is std::map<uint32_t, MCInst>::iterator (the mapped value
-  // is an instruction's offset).
-  struct RefInBF {
-    RefInBF(const BinaryFunction *BF, nocfg_const_iterator It)
-        : BF(BF), It(It) {}
-    RefInBF(const RefInBF &Other) = default;
-    RefInBF &operator=(const RefInBF &Other) = default;
-
-    const BinaryFunction *BF;
-    nocfg_const_iterator It;
-
-    bool operator==(const RefInBF &Other) const {
-      return BF == Other.BF && It->first == Other.It->first;
-    }
-  };
-
-  std::variant<RefInBB, RefInBF> Reference;
-
-  // Utility methods to be used like this:
-  //
-  //     if (auto *Ref = tryGetRefInBB())
-  //       return Ref->doSomething(...);
-  //     return getRefInBF().doSomethingElse(...);
-  const RefInBB *tryGetRefInBB() const {
-    assert(std::get_if<RefInBB>(&Reference) ||
-           std::get_if<RefInBF>(&Reference));
-    return std::get_if<RefInBB>(&Reference);
-  }
-  const RefInBF &getRefInBF() const {
-    assert(std::get_if<RefInBF>(&Reference));
-    return *std::get_if<RefInBF>(&Reference);
-  }
-
-public:
   /// Constructs an empty reference.
   MCInstReference() : Reference(RefInBB(nullptr, nullptr)) {}
   /// Constructs a reference to the instruction inside the basic block.
@@ -142,6 +83,66 @@ class MCInstReference {
   }
 
   raw_ostream &print(raw_ostream &OS) const;
+
+private:
+  // Two cases are possible:
+  // * functions with CFG reconstructed - a function stores a collection of
+  //   basic blocks, each basic block stores a contiguous vector of MCInst
+  // * functions without CFG - there are no basic blocks created,
+  //   the instructions are directly stored in std::map in BinaryFunction
+  //
+  // In both cases, the direct parent of MCInst is stored together with an
+  // iterator pointing to the instruction.
+
+  // Helper struct: CFG is available, the direct parent is a basic block,
+  // iterator's type is `MCInst *`.
+  struct RefInBB {
+    RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst)
+        : BB(BB), It(Inst) {}
+    RefInBB(const RefInBB &Other) = default;
+    RefInBB &operator=(const RefInBB &Other) = default;
+
+    const BinaryBasicBlock *BB;
+    BinaryBasicBlock::const_iterator It;
+
+    bool operator==(const RefInBB &Other) const {
+      return BB == Other.BB && It == Other.It;
+    }
+  };
+
+  // Helper struct: CFG is *not* available, the direct parent is a function,
+  // iterator's type is std::map<uint32_t, MCInst>::iterator (the mapped value
+  // is an instruction's offset).
+  struct RefInBF {
+    RefInBF(const BinaryFunction *BF, nocfg_const_iterator It)
+        : BF(BF), It(It) {}
+    RefInBF(const RefInBF &Other) = default;
+    RefInBF &operator=(const RefInBF &Other) = default;
+
+    const BinaryFunction *BF;
+    nocfg_const_iterator It;
+
+    bool operator==(const RefInBF &Other) const {
+      return BF == Other.BF && It->first == Other.It->first;
+    }
+  };
+
+  std::variant<RefInBB, RefInBF> Reference;
+
+  // Utility methods to be used like this:
+  //
+  //     if (auto *Ref = tryGetRefInBB())
+  //       return Ref->doSomething(...);
+  //     return getRefInBF().doSomethingElse(...);
+  const RefInBB *tryGetRefInBB() const {
+    assert(std::get_if<RefInBB>(&Reference) ||
+           std::get_if<RefInBF>(&Reference));
+    return std::get_if<RefInBB>(&Reference);
+  }
+  const RefInBF &getRefInBF() const {
+    assert(std::get_if<RefInBF>(&Reference));
+    return *std::get_if<RefInBF>(&Reference);
+  }
 };
 
 static inline raw_ostream &operator<<(raw_ostream &OS,

>From 3d8ac8c8cd8d5015b4f5b4a1dff6c7455e1401fd Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko <atrosinenko at accesssoftek.com>
Date: Thu, 28 Aug 2025 16:19:56 +0300
Subject: [PATCH 3/3] Fix iterating over the instructions of BinaryBasicBlock

---
 bolt/include/bolt/Core/MCInstUtils.h   | 15 ++++++--
 bolt/lib/Core/MCInstUtils.cpp          | 52 +++++++++++++++++++++++++-
 bolt/lib/Passes/PAuthGadgetScanner.cpp | 33 +++-------------
 3 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h
index 82660573afaf3..a2797c6b31aa2 100644
--- a/bolt/include/bolt/Core/MCInstUtils.h
+++ b/bolt/include/bolt/Core/MCInstUtils.h
@@ -13,6 +13,10 @@
 #include <map>
 #include <variant>
 
+namespace llvm {
+class MCCodeEmitter;
+}
+
 namespace llvm {
 namespace bolt {
 
@@ -54,7 +58,7 @@ class MCInstReference {
   const MCInst &getMCInst() const {
     assert(!empty() && "Empty reference");
     if (auto *Ref = tryGetRefInBB())
-      return *Ref->It;
+      return *Ref->Inst;
     return getRefInBF().It->second;
   }
 
@@ -82,6 +86,9 @@ class MCInstReference {
     return nullptr;
   }
 
+  // MCCodeEmitter is not thread safe.
+  uint64_t getAddress(const MCCodeEmitter *Emitter = nullptr) const;
+
   raw_ostream &print(raw_ostream &OS) const;
 
 private:
@@ -98,15 +105,15 @@ class MCInstReference {
   // iterator's type is `MCInst *`.
   struct RefInBB {
     RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst)
-        : BB(BB), It(Inst) {}
+        : BB(BB), Inst(Inst) {}
     RefInBB(const RefInBB &Other) = default;
     RefInBB &operator=(const RefInBB &Other) = default;
 
     const BinaryBasicBlock *BB;
-    BinaryBasicBlock::const_iterator It;
+    const MCInst *Inst;
 
     bool operator==(const RefInBB &Other) const {
-      return BB == Other.BB && It == Other.It;
+      return BB == Other.BB && Inst == Other.Inst;
     }
   };
 
diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp
index 3cdb9673d4dc0..f721c259e9b28 100644
--- a/bolt/lib/Core/MCInstUtils.cpp
+++ b/bolt/lib/Core/MCInstUtils.cpp
@@ -9,12 +9,33 @@
 #include "bolt/Core/MCInstUtils.h"
 #include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/BinaryFunction.h"
+#include "llvm/ADT/iterator.h"
 
-#include <iterator>
+#include <type_traits>
 
 using namespace llvm;
 using namespace llvm::bolt;
 
+// It is assumed in a few places that BinaryBasicBlock stores its instructions
+// in a contiguous vector. Give this assumption a name to simplify marking the
+// particular places with static_assert.
+using BasicBlockStorageIsVector =
+    std::is_same<BinaryBasicBlock::const_iterator,
+                 std::vector<MCInst>::const_iterator>;
+
+namespace {
+// Cannot reuse MCPlusBuilder::InstructionIterator because it has to be
+// constructed from a non-const std::map iterator.
+class mapped_mcinst_iterator
+    : public iterator_adaptor_base<mapped_mcinst_iterator,
+                                   MCInstReference::nocfg_const_iterator> {
+public:
+  mapped_mcinst_iterator(MCInstReference::nocfg_const_iterator It)
+      : iterator_adaptor_base(It) {}
+  const MCInst &operator*() const { return this->I->second; }
+};
+} // anonymous namespace
+
 MCInstReference MCInstReference::get(const MCInst *Inst,
                                      const BinaryFunction &BF) {
   if (BF.hasCFG()) {
@@ -32,13 +53,40 @@ MCInstReference MCInstReference::get(const MCInst *Inst,
   return {};
 }
 
+uint64_t MCInstReference::getAddress(const MCCodeEmitter *Emitter) const {
+  assert(!empty() && "Taking instruction address by empty reference");
+
+  const BinaryContext &BC = getFunction()->getBinaryContext();
+  if (auto *Ref = tryGetRefInBB()) {
+    static_assert(BasicBlockStorageIsVector::value,
+                  "Cannot use 'const MCInst *' as iterator type");
+    uint64_t AddressOfBB = getFunction()->getAddress() + Ref->BB->getOffset();
+    const MCInst *FirstInstInBB = &*Ref->BB->begin();
+
+    uint64_t OffsetInBB = BC.computeCodeSize(FirstInstInBB, Ref->Inst, Emitter);
+
+    return AddressOfBB + OffsetInBB;
+  }
+
+  auto &Ref = getRefInBF();
+  mapped_mcinst_iterator FirstInstInBF(Ref.BF->instrs().begin());
+  mapped_mcinst_iterator ThisInst(Ref.It);
+
+  uint64_t OffsetInBF = BC.computeCodeSize(FirstInstInBF, ThisInst, Emitter);
+
+  return getFunction()->getAddress() + OffsetInBF;
+}
+
 raw_ostream &MCInstReference::print(raw_ostream &OS) const {
   if (const RefInBB *Ref = tryGetRefInBB()) {
     OS << "MCInstBBRef<";
     if (Ref->BB == nullptr) {
       OS << "BB:(null)";
     } else {
-      unsigned IndexInBB = std::distance(Ref->BB->begin(), Ref->It);
+      static_assert(BasicBlockStorageIsVector::value,
+                    "Cannot use pointer arithmetic on 'const MCInst *'");
+      const MCInst *FirstInstInBB = &*Ref->BB->begin();
+      unsigned IndexInBB = Ref->Inst - FirstInstInBB;
       OS << "BB:" << Ref->BB->getName() << ":" << IndexInBB;
     }
     OS << ">";
diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 5d884e2d37354..287cbe31df3af 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -1671,29 +1671,6 @@ void Analysis::runOnFunction(BinaryFunction &BF,
   }
 }
 
-// Compute the instruction address for printing (may be slow).
-static uint64_t getAddress(const MCInstReference &Inst) {
-  const BinaryFunction *BF = Inst.getFunction();
-
-  if (Inst.hasCFG()) {
-    const BinaryBasicBlock *BB = Inst.getBasicBlock();
-
-    auto It = static_cast<BinaryBasicBlock::const_iterator>(&Inst.getMCInst());
-    unsigned IndexInBB = std::distance(BB->begin(), It);
-
-    // FIXME: this assumes all instructions are 4 bytes in size. This is true
-    // for AArch64, but it might be good to extract this function so it can be
-    // used elsewhere and for other targets too.
-    return BF->getAddress() + BB->getOffset() + IndexInBB * 4;
-  }
-
-  for (auto I = BF->instrs().begin(), E = BF->instrs().end(); I != E; ++I) {
-    if (&I->second == &Inst.getMCInst())
-      return BF->getAddress() + I->first;
-  }
-  llvm_unreachable("Instruction not found in function");
-}
-
 static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB,
                     size_t StartIndex = 0, size_t EndIndex = -1) {
   if (EndIndex == (size_t)-1)
@@ -1703,7 +1680,7 @@ static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB,
     MCInstReference Inst(BB, I);
     if (BC.MIB->isCFI(Inst))
       continue;
-    BC.printInstruction(outs(), Inst, getAddress(Inst), BF);
+    BC.printInstruction(outs(), Inst, Inst.getAddress(), BF);
   }
 }
 
@@ -1728,9 +1705,9 @@ void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC,
   OS << " in function " << BF->getPrintName();
   if (BB)
     OS << ", basic block " << BB->getName();
-  OS << ", at address " << llvm::format("%x", getAddress(Location)) << "\n";
+  OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n";
   OS << "  The instruction is ";
-  BC.printInstruction(OS, Location, getAddress(Location), BF);
+  BC.printInstruction(OS, Location, Location.getAddress(), BF);
 }
 
 void GadgetDiagnostic::generateReport(raw_ostream &OS,
@@ -1746,12 +1723,12 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location,
   // Sort by address to ensure output is deterministic.
   SmallVector<MCInstReference> RI(RelatedInstrs);
   llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) {
-    return getAddress(A) < getAddress(B);
+    return A.getAddress() < B.getAddress();
   });
   for (unsigned I = 0; I < RI.size(); ++I) {
     MCInstReference InstRef = RI[I];
     OS << "  " << (I + 1) << ". ";
-    BC.printInstruction(OS, InstRef, getAddress(InstRef), &BF);
+    BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF);
   };
   if (RelatedInstrs.size() == 1) {
     const MCInstReference RelatedInst = RelatedInstrs[0];



More information about the llvm-commits mailing list