[llvm] c82d09c - [PDB] Add public symbol lookup by address (#157361)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 10 08:38:41 PDT 2025


Author: nerix
Date: 2025-09-10T17:38:37+02:00
New Revision: c82d09c96a4c0c5454143cc35c935557528bb86b

URL: https://github.com/llvm/llvm-project/commit/c82d09c96a4c0c5454143cc35c935557528bb86b
DIFF: https://github.com/llvm/llvm-project/commit/c82d09c96a4c0c5454143cc35c935557528bb86b.diff

LOG: [PDB] Add public symbol lookup by address (#157361)

This adds a method on the `PublicsStream` to look up symbols using their
address (segment + offset).
It's largely a reimplementation of
[`NearestSym`](https://github.com/microsoft/microsoft-pdb/blob/805655a28bd8198004be2ac27e6e0290121a5e89/PDB/dbi/gsi.cpp#L1492-L1581)
from the reference. However, we don't return the nearest symbol, but the
exact symbol.
Still, in case of ICF, we return the symbol that's first in the address
map. Users can then use the returned offset to read the next records to
check if multiple symbols overlap, if desired.

From #149701.

Added: 
    llvm/unittests/DebugInfo/PDB/PublicsStreamTest.cpp

Modified: 
    llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
    llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
    llvm/unittests/DebugInfo/PDB/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h b/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
index 2cb4bee8ca5df..c5fdad057e867 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
@@ -18,9 +18,13 @@ namespace llvm {
 namespace msf {
 class MappedBlockStream;
 }
+namespace codeview {
+class PublicSym32;
+}
 namespace pdb {
 struct PublicsStreamHeader;
 struct SectionOffset;
+class SymbolStream;
 
 class PublicsStream {
 public:
@@ -42,6 +46,20 @@ class PublicsStream {
     return SectionOffsets;
   }
 
+  /// Find a public symbol by a segment and offset.
+  ///
+  /// In case there is more than one symbol (for example due to ICF), the first
+  /// one is returned.
+  ///
+  /// \return If a symbol was found, the symbol at the provided address is
+  ///     returned as well as the index of this symbol in the address map. If
+  ///     the binary was linked with ICF, there might be more symbols with the
+  ///     same address after the returned one. If no symbol is found,
+  ///     `std::nullopt` is returned.
+  LLVM_ABI std::optional<std::pair<codeview::PublicSym32, size_t>>
+  findByAddress(const SymbolStream &Symbols, uint16_t Segment,
+                uint32_t Offset) const;
+
 private:
   std::unique_ptr<msf::MappedBlockStream> Stream;
   GSIHashTable PublicsTable;

diff  --git a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
index c350e0e0b3e19..0453eea26605b 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
@@ -22,9 +22,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
+#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
@@ -96,3 +99,50 @@ Error PublicsStream::reload() {
                                 "Corrupted publics stream.");
   return Error::success();
 }
+
+// This is a reimplementation of NearestSym:
+// https://github.com/microsoft/microsoft-pdb/blob/805655a28bd8198004be2ac27e6e0290121a5e89/PDB/dbi/gsi.cpp#L1492-L1581
+std::optional<std::pair<codeview::PublicSym32, size_t>>
+PublicsStream::findByAddress(const SymbolStream &Symbols, uint16_t Segment,
+                             uint32_t Offset) const {
+  // The address map is sorted by address, so we can use lower_bound to find the
+  // position. Each element is an offset into the symbols for a public symbol.
+  auto It = llvm::lower_bound(
+      AddressMap, std::tuple(Segment, Offset),
+      [&](support::ulittle32_t Cur, auto Addr) {
+        auto Sym = Symbols.readRecord(Cur.value());
+        if (Sym.kind() != codeview::S_PUB32)
+          return false; // stop here, this is most likely corrupted debug info
+
+        auto Psym =
+            codeview::SymbolDeserializer::deserializeAs<codeview::PublicSym32>(
+                Sym);
+        if (!Psym) {
+          consumeError(Psym.takeError());
+          return false;
+        }
+
+        return std::tie(Psym->Segment, Psym->Offset) < Addr;
+      });
+
+  if (It == AddressMap.end())
+    return std::nullopt;
+
+  auto Sym = Symbols.readRecord(It->value());
+  if (Sym.kind() != codeview::S_PUB32)
+    return std::nullopt; // this is most likely corrupted debug info
+
+  auto MaybePsym =
+      codeview::SymbolDeserializer::deserializeAs<codeview::PublicSym32>(Sym);
+  if (!MaybePsym) {
+    consumeError(MaybePsym.takeError());
+    return std::nullopt;
+  }
+  codeview::PublicSym32 Psym = std::move(*MaybePsym);
+
+  if (std::tuple(Segment, Offset) != std::tuple(Psym.Segment, Psym.Offset))
+    return std::nullopt;
+
+  std::ptrdiff_t IterOffset = It - AddressMap.begin();
+  return std::pair{Psym, static_cast<size_t>(IterOffset)};
+}

diff  --git a/llvm/unittests/DebugInfo/PDB/CMakeLists.txt b/llvm/unittests/DebugInfo/PDB/CMakeLists.txt
index ba2a732848f4d..b1b9d2d98c944 100644
--- a/llvm/unittests/DebugInfo/PDB/CMakeLists.txt
+++ b/llvm/unittests/DebugInfo/PDB/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_unittest_with_input_files(DebugInfoPDBTests
   StringTableBuilderTest.cpp
   PDBApiTest.cpp
   PDBVariantTest.cpp
+  PublicsStreamTest.cpp
   )
 
 target_link_libraries(DebugInfoPDBTests PRIVATE LLVMTestingSupport)

diff  --git a/llvm/unittests/DebugInfo/PDB/PublicsStreamTest.cpp b/llvm/unittests/DebugInfo/PDB/PublicsStreamTest.cpp
new file mode 100644
index 0000000000000..4b89280cbdb93
--- /dev/null
+++ b/llvm/unittests/DebugInfo/PDB/PublicsStreamTest.cpp
@@ -0,0 +1,226 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
+#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+namespace {
+struct PublicSym {
+  llvm::StringRef Name;
+  uint16_t Segment;
+  uint32_t Offset;
+};
+
+class MockPublics {
+public:
+  MockPublics(size_t StreamSize, BumpPtrAllocator &Alloc,
+              msf::MSFBuilder Builder);
+  static Expected<std::unique_ptr<MockPublics>>
+  create(BumpPtrAllocator &Allocator, size_t StreamSize);
+
+  void addPublics(ArrayRef<PublicSym> Syms);
+  Error finish();
+
+  PublicsStream *publicsStream();
+  SymbolStream *symbolStream();
+
+  MutableBinaryByteStream &stream() { return Stream; }
+
+private:
+  MutableBinaryByteStream Stream;
+
+  msf::MSFBuilder MsfBuilder;
+  std::optional<msf::MSFLayout> MsfLayout;
+
+  GSIStreamBuilder Gsi;
+
+  std::unique_ptr<PublicsStream> Publics;
+  std::unique_ptr<SymbolStream> Symbols;
+};
+
+MockPublics::MockPublics(size_t StreamSize, BumpPtrAllocator &Allocator,
+                         msf::MSFBuilder Builder)
+    : Stream({Allocator.Allocate<uint8_t>(StreamSize), StreamSize},
+             llvm::endianness::little),
+      MsfBuilder(std::move(Builder)), Gsi(this->MsfBuilder) {}
+
+Expected<std::unique_ptr<MockPublics>>
+MockPublics::create(BumpPtrAllocator &Allocator, size_t StreamSize) {
+  auto ExpectedMsf = msf::MSFBuilder::create(Allocator, 4096);
+  if (!ExpectedMsf)
+    return ExpectedMsf.takeError();
+  return std::make_unique<MockPublics>(StreamSize, Allocator,
+                                       std::move(*ExpectedMsf));
+}
+
+void MockPublics::addPublics(ArrayRef<PublicSym> Publics) {
+  std::vector<BulkPublic> Bulks;
+  for (const auto &Sym : Publics) {
+    BulkPublic BP;
+    BP.Name = Sym.Name.data();
+    BP.NameLen = Sym.Name.size();
+    BP.Offset = Sym.Offset;
+    BP.Segment = Sym.Segment;
+    Bulks.emplace_back(BP);
+  }
+  Gsi.addPublicSymbols(std::move(Bulks));
+}
+
+Error MockPublics::finish() {
+  auto Err = Gsi.finalizeMsfLayout();
+  if (Err)
+    return Err;
+
+  auto ExpectedLayout = MsfBuilder.generateLayout();
+  if (!ExpectedLayout)
+    return ExpectedLayout.takeError();
+  MsfLayout = std::move(*ExpectedLayout);
+
+  return Gsi.commit(*MsfLayout, Stream);
+}
+
+PublicsStream *MockPublics::publicsStream() {
+  if (!Publics) {
+    Publics = std::make_unique<PublicsStream>(
+        msf::MappedBlockStream::createIndexedStream(*MsfLayout, Stream,
+                                                    Gsi.getPublicsStreamIndex(),
+                                                    MsfBuilder.getAllocator()));
+  }
+  return Publics.get();
+}
+
+SymbolStream *MockPublics::symbolStream() {
+  if (!Symbols) {
+    Symbols = std::make_unique<SymbolStream>(
+        msf::MappedBlockStream::createIndexedStream(*MsfLayout, Stream,
+                                                    Gsi.getRecordStreamIndex(),
+                                                    MsfBuilder.getAllocator()));
+  }
+  return Symbols.get();
+}
+
+std::array GSymbols{
+    PublicSym{"??0Base@@QEAA@XZ", /*Segment=*/1, /*Offset=*/0},
+    PublicSym{"??0Derived@@QEAA@XZ", /*Segment=*/1, /*Offset=*/32},
+    PublicSym{"??0Derived2@@QEAA@XZ", /*Segment=*/1, /*Offset=*/32},
+    PublicSym{"??0Derived3@@QEAA@XZ", /*Segment=*/1, /*Offset=*/80},
+    PublicSym{"??1Base@@UEAA@XZ", /*Segment=*/1, /*Offset=*/160},
+    PublicSym{"??1Derived@@UEAA@XZ", /*Segment=*/1, /*Offset=*/176},
+    PublicSym{"??1Derived2@@UEAA@XZ", /*Segment=*/1, /*Offset=*/176},
+    PublicSym{"??1Derived3@@UEAA@XZ", /*Segment=*/1, /*Offset=*/208},
+    PublicSym{"??3@YAXPEAX_K@Z", /*Segment=*/1, /*Offset=*/256},
+    PublicSym{"??_EDerived3@@W7EAAPEAXI@Z", /*Segment=*/1, /*Offset=*/268},
+    PublicSym{"??_GBase@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/288},
+    PublicSym{"??_EBase@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/288},
+    PublicSym{"??_EDerived2@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/352},
+    PublicSym{"??_EDerived@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/352},
+    PublicSym{"??_GDerived@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/352},
+    PublicSym{"??_GDerived2@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/352},
+    PublicSym{"??_EDerived3@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/416},
+    PublicSym{"??_GDerived3@@UEAAPEAXI@Z", /*Segment=*/1, /*Offset=*/416},
+    PublicSym{"?AMethod@AClass@@QEAAXHPEAD@Z", /*Segment=*/1, /*Offset=*/480},
+    PublicSym{"?Something@AClass@@SA_ND@Z", /*Segment=*/1, /*Offset=*/496},
+    PublicSym{"?dup1@@YAHH@Z", /*Segment=*/1, /*Offset=*/544},
+    PublicSym{"?dup3@@YAHH@Z", /*Segment=*/1, /*Offset=*/544},
+    PublicSym{"?dup2@@YAHH@Z", /*Segment=*/1, /*Offset=*/544},
+    PublicSym{"?foobar@@YAHH@Z", /*Segment=*/1, /*Offset=*/560},
+    PublicSym{"main", /*Segment=*/1, /*Offset=*/576},
+    PublicSym{"??_7Base@@6B@", /*Segment=*/2, /*Offset=*/0},
+    PublicSym{"??_7Derived@@6B@", /*Segment=*/2, /*Offset=*/8},
+    PublicSym{"??_7Derived2@@6B@", /*Segment=*/2, /*Offset=*/8},
+    PublicSym{"??_7Derived3@@6BDerived2@@@", /*Segment=*/2, /*Offset=*/16},
+    PublicSym{"??_7Derived3@@6BDerived@@@", /*Segment=*/2, /*Offset=*/24},
+    PublicSym{"?AGlobal@@3HA", /*Segment=*/3, /*Offset=*/0},
+};
+
+} // namespace
+
+static std::pair<uint32_t, uint32_t>
+nthSymbolAddress(PublicsStream *Publics, SymbolStream *Symbols, size_t N) {
+  auto Index = Publics->getAddressMap()[N].value();
+  codeview::CVSymbol Sym = Symbols->readRecord(Index);
+  auto ExpectedPub =
+      codeview::SymbolDeserializer::deserializeAs<codeview::PublicSym32>(Sym);
+  if (!ExpectedPub)
+    return std::pair(0, 0);
+  return std::pair(ExpectedPub->Segment, ExpectedPub->Offset);
+}
+
+TEST(PublicsStreamTest, FindByAddress) {
+  BumpPtrAllocator Allocator;
+  auto ExpectedMock = MockPublics::create(Allocator, 1 << 20);
+  ASSERT_TRUE(bool(ExpectedMock));
+  std::unique_ptr<MockPublics> Mock = std::move(*ExpectedMock);
+
+  Mock->addPublics(GSymbols);
+  Error Err = Mock->finish();
+  ASSERT_FALSE(Err) << Err;
+
+  auto *Publics = Mock->publicsStream();
+  ASSERT_NE(Publics, nullptr);
+  Err = Publics->reload();
+  ASSERT_FALSE(Err) << Err;
+
+  auto *Symbols = Mock->symbolStream();
+  ASSERT_NE(Symbols, nullptr);
+  Err = Symbols->reload();
+  ASSERT_FALSE(Err) << Err;
+
+  auto VTableDerived = Publics->findByAddress(*Symbols, 2, 8);
+  ASSERT_TRUE(VTableDerived.has_value());
+  // both derived and derived2 have their vftables there - but derived2 is first
+  // (due to ICF)
+  ASSERT_EQ(VTableDerived->first.Name, "??_7Derived2@@6B@");
+  ASSERT_EQ(VTableDerived->second, 26u);
+
+  // Again, make sure that we find the first symbol
+  auto VectorDtorDerived = Publics->findByAddress(*Symbols, 1, 352);
+  ASSERT_TRUE(VectorDtorDerived.has_value());
+  ASSERT_EQ(VectorDtorDerived->first.Name, "??_EDerived2@@UEAAPEAXI@Z");
+  ASSERT_EQ(VectorDtorDerived->second, 12u);
+  ASSERT_EQ(nthSymbolAddress(Publics, Symbols, 13), std::pair(1u, 352u));
+  ASSERT_EQ(nthSymbolAddress(Publics, Symbols, 14), std::pair(1u, 352u));
+  ASSERT_EQ(nthSymbolAddress(Publics, Symbols, 15), std::pair(1u, 352u));
+  ASSERT_EQ(nthSymbolAddress(Publics, Symbols, 16), std::pair(1u, 416u));
+
+  ASSERT_FALSE(Publics->findByAddress(*Symbols, 2, 7).has_value());
+  ASSERT_FALSE(Publics->findByAddress(*Symbols, 2, 9).has_value());
+
+  auto GlobalSym = Publics->findByAddress(*Symbols, 3, 0);
+  ASSERT_TRUE(GlobalSym.has_value());
+  ASSERT_EQ(GlobalSym->first.Name, "?AGlobal@@3HA");
+  ASSERT_EQ(GlobalSym->second, 30u);
+
+  // test corrupt debug info
+  codeview::CVSymbol GlobalCVSym =
+      Symbols->readRecord(Publics->getAddressMap()[30]);
+  ASSERT_EQ(GlobalCVSym.kind(), codeview::S_PUB32);
+  // CVSymbol::data returns a pointer to const data, so we modify the backing
+  // data
+  uint8_t *PDBData = Mock->stream().data().data();
+  auto Offset = GlobalCVSym.data().data() - PDBData;
+  reinterpret_cast<codeview::RecordPrefix *>(PDBData + Offset)->RecordKind =
+      codeview::S_GDATA32;
+  ASSERT_EQ(GlobalCVSym.kind(), codeview::S_GDATA32);
+
+  GlobalSym = Publics->findByAddress(*Symbols, 3, 0);
+  ASSERT_FALSE(GlobalSym.has_value());
+}


        


More information about the llvm-commits mailing list