[llvm] [symbolizer] Support symbol+offset lookup (PR #75067)

Serge Pavlov via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 12 04:40:54 PST 2023


https://github.com/spavloff updated https://github.com/llvm/llvm-project/pull/75067

>From a6ebb53756c7621af8f289c68da410c0ff0f7f55 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff at gmail.com>
Date: Tue, 21 Nov 2023 01:31:33 +0700
Subject: [PATCH 1/2] [symbolizer] Support symbol+offset lookup

GNU addr2line supports lookup by symbol name in addition to the existing
address lookup. llvm-symbolizer has supported lookup by symbol name since
e144ae54dcb96838a6176fd9eef21028935ccd4f. This change extends that lookup
with the possibility of specifying an optional offset.

Now the address for which source information is searched can be
specified as a symbol with an offset:

    llvm-symbolizer --obj=abc.so "SYMBOL func_22+0x12"

This narrows the feature gap between llvm-symbolizer and GNU addr2line.
Currently, this lookup is supported for code only.

Differential Revision: https://reviews.llvm.org/D139859
---
 .../DebugInfo/Symbolize/SymbolizableModule.h  |  2 +-
 .../Symbolize/SymbolizableObjectFile.h        |  2 +-
 .../llvm/DebugInfo/Symbolize/Symbolize.h      | 16 +++---
 .../Symbolize/SymbolizableObjectFile.cpp      |  8 +--
 llvm/lib/DebugInfo/Symbolize/Symbolize.cpp    | 20 +++++---
 .../tools/llvm-symbolizer/symbol-search.test  | 34 +++++++++++++
 .../tools/llvm-symbolizer/llvm-symbolizer.cpp | 50 ++++++++++++++-----
 llvm/unittests/ProfileData/MemProfTest.cpp    |  3 +-
 8 files changed, 101 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h
index 255932d35cda11..8be2c22a93a9b8 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h
@@ -37,7 +37,7 @@ class SymbolizableModule {
   symbolizeFrame(object::SectionedAddress ModuleOffset) const = 0;
 
   virtual std::vector<object::SectionedAddress>
-  findSymbol(StringRef Symbol) const = 0;
+  findSymbol(StringRef Symbol, uint64_t Offset) const = 0;
 
   // Return true if this is a 32-bit x86 PE COFF module.
   virtual bool isWin32Module() const = 0;
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
index 311fa201d900e4..5ef513f570b03f 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -44,7 +44,7 @@ class SymbolizableObjectFile : public SymbolizableModule {
   std::vector<DILocal>
   symbolizeFrame(object::SectionedAddress ModuleOffset) const override;
   std::vector<object::SectionedAddress>
-  findSymbol(StringRef Symbol) const override;
+  findSymbol(StringRef Symbol, uint64_t Offset) const override;
 
   // Return true if this is a 32-bit x86 PE COFF module.
   bool isWin32Module() const override;
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
index bc4aa74073a655..11a169cfc20a69 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
@@ -105,12 +105,12 @@ class LLVMSymbolizer {
   symbolizeFrame(ArrayRef<uint8_t> BuildID,
                  object::SectionedAddress ModuleOffset);
 
-  Expected<std::vector<DILineInfo>> findSymbol(const ObjectFile &Obj,
-                                               StringRef Symbol);
-  Expected<std::vector<DILineInfo>> findSymbol(StringRef ModuleName,
-                                               StringRef Symbol);
-  Expected<std::vector<DILineInfo>> findSymbol(ArrayRef<uint8_t> BuildID,
-                                               StringRef Symbol);
+  Expected<std::vector<DILineInfo>>
+  findSymbol(const ObjectFile &Obj, StringRef Symbol, uint64_t Offset);
+  Expected<std::vector<DILineInfo>>
+  findSymbol(const std::string &ModuleName, StringRef Symbol, uint64_t Offset);
+  Expected<std::vector<DILineInfo>>
+  findSymbol(ArrayRef<uint8_t> BuildID, StringRef Symbol, uint64_t Offset);
 
   void flush();
 
@@ -155,8 +155,8 @@ class LLVMSymbolizer {
   symbolizeFrameCommon(const T &ModuleSpecifier,
                        object::SectionedAddress ModuleOffset);
   template <typename T>
-  Expected<std::vector<DILineInfo>> findSymbolCommon(const T &ModuleSpecifier,
-                                                     StringRef Symbol);
+  Expected<std::vector<DILineInfo>>
+  findSymbolCommon(const T &ModuleSpecifier, StringRef Symbol, uint64_t Offset);
 
   Expected<SymbolizableModule *> getOrCreateModuleInfo(const ObjectFile &Obj);
 
diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index 697303038507a9..0c404327c693da 100644
--- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -352,12 +352,14 @@ std::vector<DILocal> SymbolizableObjectFile::symbolizeFrame(
 }
 
 std::vector<object::SectionedAddress>
-SymbolizableObjectFile::findSymbol(StringRef Symbol) const {
+SymbolizableObjectFile::findSymbol(StringRef Symbol, uint64_t Offset) const {
   std::vector<object::SectionedAddress> Result;
   for (const SymbolDesc &Sym : Symbols) {
     if (Sym.Name.equals(Symbol)) {
-      object::SectionedAddress A{Sym.Addr,
-                                 getModuleSectionIndexForAddress(Sym.Addr)};
+      uint64_t Addr = Sym.Addr;
+      if (Offset < Sym.Size)
+        Addr += Offset;
+      object::SectionedAddress A{Addr, getModuleSectionIndexForAddress(Addr)};
       Result.push_back(A);
     }
   }
diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 15f2a6ece8b897..43e5c9e9329e00 100644
--- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -233,7 +233,8 @@ LLVMSymbolizer::symbolizeFrame(ArrayRef<uint8_t> BuildID,
 
 template <typename T>
 Expected<std::vector<DILineInfo>>
-LLVMSymbolizer::findSymbolCommon(const T &ModuleSpecifier, StringRef Symbol) {
+LLVMSymbolizer::findSymbolCommon(const T &ModuleSpecifier, StringRef Symbol,
+                                 uint64_t Offset) {
   auto InfoOrErr = getOrCreateModuleInfo(ModuleSpecifier);
   if (!InfoOrErr)
     return InfoOrErr.takeError();
@@ -246,7 +247,7 @@ LLVMSymbolizer::findSymbolCommon(const T &ModuleSpecifier, StringRef Symbol) {
   if (!Info)
     return Result;
 
-  for (object::SectionedAddress A : Info->findSymbol(Symbol)) {
+  for (object::SectionedAddress A : Info->findSymbol(Symbol, Offset)) {
     DILineInfo LineInfo = Info->symbolizeCode(
         A, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
         Opts.UseSymbolTable);
@@ -261,18 +262,21 @@ LLVMSymbolizer::findSymbolCommon(const T &ModuleSpecifier, StringRef Symbol) {
 }
 
 Expected<std::vector<DILineInfo>>
-LLVMSymbolizer::findSymbol(const ObjectFile &Obj, StringRef Symbol) {
-  return findSymbolCommon(Obj, Symbol);
+LLVMSymbolizer::findSymbol(const ObjectFile &Obj, StringRef Symbol,
+                           uint64_t Offset) {
+  return findSymbolCommon(Obj, Symbol, Offset);
 }
 
 Expected<std::vector<DILineInfo>>
-LLVMSymbolizer::findSymbol(StringRef ModuleName, StringRef Symbol) {
-  return findSymbolCommon(ModuleName.str(), Symbol);
+LLVMSymbolizer::findSymbol(const std::string &ModuleName, StringRef Symbol,
+                           uint64_t Offset) {
+  return findSymbolCommon(ModuleName, Symbol, Offset);
 }
 
 Expected<std::vector<DILineInfo>>
-LLVMSymbolizer::findSymbol(ArrayRef<uint8_t> BuildID, StringRef Symbol) {
-  return findSymbolCommon(BuildID, Symbol);
+LLVMSymbolizer::findSymbol(ArrayRef<uint8_t> BuildID, StringRef Symbol,
+                           uint64_t Offset) {
+  return findSymbolCommon(BuildID, Symbol, Offset);
 }
 
 void LLVMSymbolizer::flush() {
diff --git a/llvm/test/tools/llvm-symbolizer/symbol-search.test b/llvm/test/tools/llvm-symbolizer/symbol-search.test
index 634229c2e74c00..b978198ce12af0 100644
--- a/llvm/test/tools/llvm-symbolizer/symbol-search.test
+++ b/llvm/test/tools/llvm-symbolizer/symbol-search.test
@@ -34,6 +34,40 @@ RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01 func_02 | FileCheck --ch
 FUNCS:  /tmp/dbginfo{{[/\]+}}symbols.part1.cpp:12
 FUNCS:  /tmp/dbginfo{{[/\]+}}symbols.part2.cpp:10
 
+# Symbol may be combined with offset.
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+0 | FileCheck --check-prefix=SYMLINE0 %s
+RUN: llvm-addr2line -e %p/Inputs/symbols.so func_01+0 | FileCheck --check-prefix=SYMLINE0 %s
+SYMLINE0: /tmp/dbginfo{{[/\]+}}symbols.part1.cpp:12
+
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+12 | FileCheck --check-prefix=SYMLINE1 %s
+RUN: llvm-addr2line -e %p/Inputs/symbols.so func_01+12 | FileCheck --check-prefix=SYMLINE1 %s
+SYMLINE1: /tmp/dbginfo{{[/\]+}}symbols.part1.cpp:13
+
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+19 | FileCheck --check-prefix=SYMLINE2 %s
+RUN: llvm-addr2line -e %p/Inputs/symbols.so func_01+19 | FileCheck --check-prefix=SYMLINE2 %s
+SYMLINE2: /tmp/dbginfo{{[/\]+}}symbols.part1.cpp:14
+
+# Offset can be specified with various bases.
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+0x0C | FileCheck --check-prefix=SYMLINE1 %s
+RUN: llvm-addr2line -e %p/Inputs/symbols.so func_01+0x0C | FileCheck --check-prefix=SYMLINE1 %s
+
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+014 | FileCheck --check-prefix=SYMLINE1 %s
+RUN: llvm-addr2line -e %p/Inputs/symbols.so func_01+014 | FileCheck --check-prefix=SYMLINE1 %s
+
+# If '+' is not followed by a number, it is a part of symbol, not an offset separator.
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+ | FileCheck --check-prefix=NONEXISTENT %s
+RUN: llvm-addr2line --obj=%p/Inputs/symbols.so func_01+ | FileCheck --check-prefix=NONEXISTENT %s
+
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+abc | FileCheck --check-prefix=NONEXISTENT %s
+RUN: llvm-addr2line --obj=%p/Inputs/symbols.so func_01+abc | FileCheck --check-prefix=NONEXISTENT %s
+
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+0A | FileCheck --check-prefix=NONEXISTENT %s
+RUN: llvm-addr2line --obj=%p/Inputs/symbols.so func_01+0A | FileCheck --check-prefix=NONEXISTENT %s
+
+# If '+' is not preceded by a symbol, it is a part of symbol, not an offset separator.
+RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so +0x1138 | FileCheck --check-prefix=NONEXISTENT %s
+RUN: llvm-addr2line --obj=%p/Inputs/symbols.so +0x1138 | FileCheck --check-prefix=NONEXISTENT %s
+
 # Show that C++ mangled names may be specified.
 RUN: llvm-addr2line --obj=%p/Inputs/symbols.so _ZL14static_func_01i | FileCheck --check-prefix=MULTI-CXX %s
 RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so _ZL14static_func_01i | FileCheck --check-prefix=MULTI-CXX %s
diff --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
index 447c18abadc174..19fef183d808d7 100644
--- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -159,7 +159,7 @@ static Error makeStringError(StringRef Msg) {
 static Error parseCommand(StringRef BinaryName, bool IsAddr2Line,
                           StringRef InputString, Command &Cmd,
                           std::string &ModuleName, object::BuildID &BuildID,
-                          StringRef &Symbol, uint64_t &ModuleOffset) {
+                          StringRef &Symbol, uint64_t &Offset) {
   ModuleName = BinaryName;
   if (InputString.consume_front("CODE ")) {
     Cmd = Command::Code;
@@ -224,25 +224,51 @@ static Error parseCommand(StringRef BinaryName, bool IsAddr2Line,
       return makeStringError("no input filename has been specified");
   }
 
-  // Parse module offset, which can be specified as a number or as a symbol.
-  InputString = InputString.ltrim();
+  // Parse address specification, which can be an offset in module or a
+  // symbol with optional offset.
+  InputString = InputString.trim();
   if (InputString.empty())
     return makeStringError("no module offset has been specified");
 
   // If input string contains a space, ignore everything after it. This behavior
   // is consistent with GNU addr2line.
-  int OffsetLength = InputString.find_first_of(" \n\r");
-  StringRef Offset = InputString.substr(0, OffsetLength);
+  int AddrSpecLength = InputString.find_first_of(" \n\r");
+  StringRef AddrSpec = InputString.substr(0, AddrSpecLength);
+  bool StartsWithDigit = std::isdigit(AddrSpec.front());
 
-  // GNU addr2line assumes the offset is hexadecimal and allows a redundant
+  // GNU addr2line assumes the address is hexadecimal and allows a redundant
   // "0x" or "0X" prefix; do the same for compatibility.
   if (IsAddr2Line)
-    Offset.consume_front("0x") || Offset.consume_front("0X");
+    AddrSpec.consume_front("0x") || AddrSpec.consume_front("0X");
 
-  // If the input is not a number, treat it is a symbol.
-  if (Offset.getAsInteger(IsAddr2Line ? 16 : 0, ModuleOffset)) {
-    Symbol = Offset;
-    ModuleOffset = 0;
+  // If address specification is a number, treat it as a module offset.
+  if (!AddrSpec.getAsInteger(IsAddr2Line ? 16 : 0, Offset)) {
+    // Module offset is an address.
+    Symbol = StringRef();
+    return Error::success();
+  }
+
+  // If address specification starts with a digit, but is not a number, consider
+  // it as invalid.
+  if (StartsWithDigit || AddrSpec.empty())
+    return makeStringError("expected a number as module offset");
+
+  // Otherwise it is a symbol name, potentially with an offset.
+  Symbol = AddrSpec;
+  Offset = 0;
+
+  // If the address specification contains '+', try treating it as
+  // "symbol + offset".
+  size_t Plus = AddrSpec.rfind('+');
+  if (Plus != StringRef::npos) {
+    StringRef SymbolStr = AddrSpec.take_front(Plus);
+    StringRef OffsetStr = AddrSpec.substr(Plus + 1);
+    if (!SymbolStr.empty() && !OffsetStr.empty() &&
+        !OffsetStr.getAsInteger(0, Offset)) {
+      Symbol = SymbolStr;
+      return Error::success();
+    }
+    // The found '+' is not an offset delimiter.
   }
 
   return Error::success();
@@ -268,7 +294,7 @@ void executeCommand(StringRef ModuleName, const T &ModuleSpec, Command Cmd,
     print(SymRequest, ResOrErr, Printer);
   } else if (!Symbol.empty()) {
     Expected<std::vector<DILineInfo>> ResOrErr =
-        Symbolizer.findSymbol(ModuleSpec, Symbol);
+        Symbolizer.findSymbol(ModuleSpec, Symbol, Offset);
     print(SymRequest, ResOrErr, Printer);
   } else if (ShouldInline) {
     Expected<DIInliningInfo> ResOrErr =
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 682f79a540cdc5..f5e4a4aff2ed17 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -54,7 +54,8 @@ class MockSymbolizer : public SymbolizableModule {
   virtual std::vector<DILocal> symbolizeFrame(SectionedAddress) const {
     llvm_unreachable("unused");
   }
-  virtual std::vector<SectionedAddress> findSymbol(StringRef Symbol) const {
+  virtual std::vector<SectionedAddress> findSymbol(StringRef Symbol,
+                                                   uint64_t Offset) const {
     llvm_unreachable("unused");
   }
   virtual bool isWin32Module() const { llvm_unreachable("unused"); }

>From 9ce9b58582b8bfb745238236509b1d75f58c07c5 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff at gmail.com>
Date: Tue, 12 Dec 2023 19:38:09 +0700
Subject: [PATCH 2/2] Apply the suggested wordings

---
 llvm/test/tools/llvm-symbolizer/symbol-search.test | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/tools/llvm-symbolizer/symbol-search.test b/llvm/test/tools/llvm-symbolizer/symbol-search.test
index b978198ce12af0..e86efe4c6dd12d 100644
--- a/llvm/test/tools/llvm-symbolizer/symbol-search.test
+++ b/llvm/test/tools/llvm-symbolizer/symbol-search.test
@@ -54,7 +54,7 @@ RUN: llvm-addr2line -e %p/Inputs/symbols.so func_01+0x0C | FileCheck --check-pre
 RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+014 | FileCheck --check-prefix=SYMLINE1 %s
 RUN: llvm-addr2line -e %p/Inputs/symbols.so func_01+014 | FileCheck --check-prefix=SYMLINE1 %s
 
-# If '+' is not followed by a number, it is a part of symbol, not an offset separator.
+# If '+' is not followed by a number, it is a part of the symbol name, not an offset separator.
 RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+ | FileCheck --check-prefix=NONEXISTENT %s
 RUN: llvm-addr2line --obj=%p/Inputs/symbols.so func_01+ | FileCheck --check-prefix=NONEXISTENT %s
 
@@ -64,7 +64,7 @@ RUN: llvm-addr2line --obj=%p/Inputs/symbols.so func_01+abc | FileCheck --check-p
 RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so func_01+0A | FileCheck --check-prefix=NONEXISTENT %s
 RUN: llvm-addr2line --obj=%p/Inputs/symbols.so func_01+0A | FileCheck --check-prefix=NONEXISTENT %s
 
-# If '+' is not preceded by a symbol, it is a part of symbol, not an offset separator.
+# If '+' is not preceded by a symbol, it is a part of a symbol name, not an offset separator.
 RUN: llvm-symbolizer --obj=%p/Inputs/symbols.so +0x1138 | FileCheck --check-prefix=NONEXISTENT %s
 RUN: llvm-addr2line --obj=%p/Inputs/symbols.so +0x1138 | FileCheck --check-prefix=NONEXISTENT %s
 



More information about the llvm-commits mailing list