[llvm] 04a2e12 - DebugInfo/Symbolize: Retrieve filename from the preceding STT_FILE for .symtab symbolization

Fangrui Song via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 10 09:47:18 PST 2021


Author: Fangrui Song
Date: 2021-02-10T09:47:10-08:00
New Revision: 04a2e12612f49e65a7dc14fa45f11f0632d427f2

URL: https://github.com/llvm/llvm-project/commit/04a2e12612f49e65a7dc14fa45f11f0632d427f2
DIFF: https://github.com/llvm/llvm-project/commit/04a2e12612f49e65a7dc14fa45f11f0632d427f2.diff

LOG: DebugInfo/Symbolize: Retrieve filename from the preceding STT_FILE for .symtab symbolization

The ELF spec says:

> STT_FILE: Conventionally, the symbol's name gives the name of the source file associated with the object file. A file symbol has STB_LOCAL binding, its section index is SHN_ABS, and it precedes the other STB_LOCAL symbols for the file, if it is present.

For a local symbol, the preceding STT_FILE symbol is almost always in the same
file[1]. GNU addr2line uses this heuristic to retrieve the filename associated
with a local symbol (e.g. internal linkage functions in C/C++).

GNU addr2line can assign STT_FILE filename to a non-local symbol, too, but the trick
only works if no regular symbol precede STT_FILE. This patch does not implement this corner case
(not useful for most executables which have more than one files).

In case of filename mismatch between .debug_line & .symtab, arbitrarily make .debug_line win.

[1]: LLD does not synthesize STT_FILE symbols
(https://bugs.llvm.org/show_bug.cgi?id=48023 see also
https://sourceware.org/bugzilla/show_bug.cgi?id=26822).  An assembly file
without `.file` directives can cause mis-attribution. This is an edge case.

Differential Revision: https://reviews.llvm.org/D95927

Added: 
    llvm/test/DebugInfo/Symbolize/ELF/symtab-file-conflict.s
    llvm/test/DebugInfo/Symbolize/ELF/symtab-file2.yaml

Modified: 
    llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
    llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
    llvm/test/DebugInfo/Symbolize/ELF/symtab-file.s
    llvm/test/DebugInfo/Symbolize/ELF/symtab-ignored.s
    llvm/test/DebugInfo/Symbolize/ELF/symtab-notype.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index b260cd9da485..2c6a96a973d1 100644
--- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -16,6 +16,7 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/SymbolSize.h"
 #include "llvm/Support/Casting.h"
@@ -69,9 +70,8 @@ SymbolizableObjectFile::create(const object::ObjectFile *Obj,
         return std::move(E);
   }
 
-  std::vector<std::pair<SymbolDesc, StringRef>> &Fs = res->Functions,
-                                                &Os = res->Objects;
-  auto Uniquify = [](std::vector<std::pair<SymbolDesc, StringRef>> &S) {
+  std::vector<SymbolDesc> &Fs = res->Functions, &Os = res->Objects;
+  auto Uniquify = [](std::vector<SymbolDesc> &S) {
     // Sort by (Addr,Size,Name). If several SymbolDescs share the same Addr,
     // pick the one with the largest Size. This helps us avoid symbols with no
     // size information (Size=0).
@@ -79,7 +79,7 @@ SymbolizableObjectFile::create(const object::ObjectFile *Obj,
     auto I = S.begin(), E = S.end(), J = S.begin();
     while (I != E) {
       auto OI = I;
-      while (++I != E && OI->first.Addr == I->first.Addr) {
+      while (++I != E && OI->Addr == I->Addr) {
       }
       *J++ = I[-1];
     }
@@ -138,8 +138,7 @@ Error SymbolizableObjectFile::addCoffExportSymbols(
     uint32_t NextOffset = I != E ? I->Offset : Export.Offset + 1;
     uint64_t SymbolStart = ImageBase + Export.Offset;
     uint64_t SymbolSize = NextOffset - Export.Offset;
-    SymbolDesc SD = {SymbolStart, SymbolSize};
-    Functions.emplace_back(SD, Export.Name);
+    Functions.push_back({SymbolStart, SymbolSize, Export.Name, 0});
   }
   return Error::success();
 }
@@ -150,9 +149,23 @@ Error SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
                                         uint64_t OpdAddress) {
   // Avoid adding symbols from an unknown/undefined section.
   const ObjectFile &Obj = *Symbol.getObject();
+  Expected<StringRef> SymbolNameOrErr = Symbol.getName();
+  if (!SymbolNameOrErr)
+    return SymbolNameOrErr.takeError();
+  StringRef SymbolName = *SymbolNameOrErr;
+
+  uint32_t ELFSymIdx =
+      Obj.isELF() ? ELFSymbolRef(Symbol).getRawDataRefImpl().d.b : 0;
   Expected<section_iterator> Sec = Symbol.getSection();
-  if (!Sec || Obj.section_end() == *Sec)
+  if (!Sec || Obj.section_end() == *Sec) {
+    if (Obj.isELF()) {
+      // Store the (index, filename) pair for a file symbol.
+      ELFSymbolRef ESym(Symbol);
+      if (ESym.getELFType() == ELF::STT_FILE)
+        FileSymbols.emplace_back(ELFSymIdx, SymbolName);
+    }
     return Error::success();
+  }
 
   Expected<SymbolRef::Type> SymbolTypeOrErr = Symbol.getType();
   if (!SymbolTypeOrErr)
@@ -190,24 +203,21 @@ Error SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
     if (OpdExtractor->isValidOffsetForAddress(OpdOffset))
       SymbolAddress = OpdExtractor->getAddress(&OpdOffset);
   }
-  Expected<StringRef> SymbolNameOrErr = Symbol.getName();
-  if (!SymbolNameOrErr)
-    return SymbolNameOrErr.takeError();
-  StringRef SymbolName = *SymbolNameOrErr;
   // Mach-O symbol table names have leading underscore, skip it.
   if (Module->isMachO() && !SymbolName.empty() && SymbolName[0] == '_')
     SymbolName = SymbolName.drop_front();
 
-  SymbolDesc SD = {SymbolAddress, SymbolSize};
-
+  if (Obj.isELF() && ELFSymbolRef(Symbol).getBinding() != ELF::STB_LOCAL)
+    ELFSymIdx = 0;
+  SymbolDesc SD = {SymbolAddress, SymbolSize, SymbolName, ELFSymIdx};
   // DATA command symbolizes just ST_Data (ELF STT_OBJECT) symbols as an
   // optimization. Treat everything else (e.g. ELF STT_NOTYPE, STT_FUNC and
   // STT_GNU_IFUNC) as function symbols which can be used to symbolize
   // addresses.
   if (SymbolType == SymbolRef::ST_Data)
-    Objects.emplace_back(SD, SymbolName);
+    Objects.push_back(SD);
   else
-    Functions.emplace_back(SD, SymbolName);
+    Functions.push_back(SD);
   return Error::success();
 }
 
@@ -223,23 +233,33 @@ uint64_t SymbolizableObjectFile::getModulePreferredBase() const {
   return 0;
 }
 
-bool SymbolizableObjectFile::getNameFromSymbolTable(SymbolRef::Type Type,
-                                                    uint64_t Address,
-                                                    std::string &Name,
-                                                    uint64_t &Addr,
-                                                    uint64_t &Size) const {
+bool SymbolizableObjectFile::getNameFromSymbolTable(
+    SymbolRef::Type Type, uint64_t Address, std::string &Name, uint64_t &Addr,
+    uint64_t &Size, std::string &FileName) const {
   const auto &Symbols = Type == SymbolRef::ST_Function ? Functions : Objects;
-  std::pair<SymbolDesc, StringRef> SD{{Address, UINT64_C(-1)}, StringRef()};
+  SymbolDesc SD{Address, UINT64_C(-1), StringRef(), 0};
   auto SymbolIterator = llvm::upper_bound(Symbols, SD);
   if (SymbolIterator == Symbols.begin())
     return false;
   --SymbolIterator;
-  if (SymbolIterator->first.Size != 0 &&
-      SymbolIterator->first.Addr + SymbolIterator->first.Size <= Address)
+  if (SymbolIterator->Size != 0 &&
+      SymbolIterator->Addr + SymbolIterator->Size <= Address)
     return false;
-  Name = SymbolIterator->second.str();
-  Addr = SymbolIterator->first.Addr;
-  Size = SymbolIterator->first.Size;
+  Name = SymbolIterator->Name.str();
+  Addr = SymbolIterator->Addr;
+  Size = SymbolIterator->Size;
+
+  if (SymbolIterator->ELFLocalSymIdx != 0) {
+    // If this is an ELF local symbol, find the STT_FILE symbol preceding
+    // SymbolIterator to get the filename. The ELF spec requires the STT_FILE
+    // symbol (if present) precedes the other STB_LOCAL symbols for the file.
+    assert(Module->isELF());
+    auto It = llvm::upper_bound(
+        FileSymbols,
+        std::make_pair(SymbolIterator->ELFLocalSymIdx, StringRef()));
+    if (It != FileSymbols.begin())
+      FileName = It[-1].second.str();
+  }
   return true;
 }
 
@@ -265,11 +285,13 @@ SymbolizableObjectFile::symbolizeCode(object::SectionedAddress ModuleOffset,
 
   // Override function name from symbol table if necessary.
   if (shouldOverrideWithSymbolTable(LineInfoSpecifier.FNKind, UseSymbolTable)) {
-    std::string FunctionName;
+    std::string FunctionName, FileName;
     uint64_t Start, Size;
     if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset.Address,
-                               FunctionName, Start, Size)) {
+                               FunctionName, Start, Size, FileName)) {
       LineInfo.FunctionName = FunctionName;
+      if (LineInfo.FileName == DILineInfo::BadString && !FileName.empty())
+        LineInfo.FileName = FileName;
     }
   }
   return LineInfo;
@@ -290,12 +312,15 @@ DIInliningInfo SymbolizableObjectFile::symbolizeInlinedCode(
 
   // Override the function name in lower frame with name from symbol table.
   if (shouldOverrideWithSymbolTable(LineInfoSpecifier.FNKind, UseSymbolTable)) {
-    std::string FunctionName;
+    std::string FunctionName, FileName;
     uint64_t Start, Size;
     if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset.Address,
-                               FunctionName, Start, Size)) {
-      InlinedContext.getMutableFrame(InlinedContext.getNumberOfFrames() - 1)
-          ->FunctionName = FunctionName;
+                               FunctionName, Start, Size, FileName)) {
+      DILineInfo *LI = InlinedContext.getMutableFrame(
+          InlinedContext.getNumberOfFrames() - 1);
+      LI->FunctionName = FunctionName;
+      if (LI->FileName == DILineInfo::BadString && !FileName.empty())
+        LI->FileName = FileName;
     }
   }
 
@@ -305,8 +330,9 @@ DIInliningInfo SymbolizableObjectFile::symbolizeInlinedCode(
 DIGlobal SymbolizableObjectFile::symbolizeData(
     object::SectionedAddress ModuleOffset) const {
   DIGlobal Res;
+  std::string FileName;
   getNameFromSymbolTable(SymbolRef::ST_Data, ModuleOffset.Address, Res.Name,
-                         Res.Start, Res.Size);
+                         Res.Start, Res.Size, FileName);
   return Res;
 }
 

diff  --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
index be3c66df056f..616096cb9db0 100644
--- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
+++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -56,8 +56,8 @@ class SymbolizableObjectFile : public SymbolizableModule {
                                      bool UseSymbolTable) const;
 
   bool getNameFromSymbolTable(object::SymbolRef::Type Type, uint64_t Address,
-                              std::string &Name, uint64_t &Addr,
-                              uint64_t &Size) const;
+                              std::string &Name, uint64_t &Addr, uint64_t &Size,
+                              std::string &FileName) const;
   // For big-endian PowerPC64 ELF, OpdAddress is the address of the .opd
   // (function descriptor) section and OpdExtractor refers to its contents.
   Error addSymbol(const object::SymbolRef &Symbol, uint64_t SymbolSize,
@@ -78,12 +78,19 @@ class SymbolizableObjectFile : public SymbolizableModule {
     // the following symbol.
     uint64_t Size;
 
+    StringRef Name;
+    // Non-zero if this is an ELF local symbol. See the comment in
+    // getNameFromSymbolTable.
+    uint32_t ELFLocalSymIdx;
+
     bool operator<(const SymbolDesc &RHS) const {
       return Addr != RHS.Addr ? Addr < RHS.Addr : Size < RHS.Size;
     }
   };
-  std::vector<std::pair<SymbolDesc, StringRef>> Functions;
-  std::vector<std::pair<SymbolDesc, StringRef>> Objects;
+  std::vector<SymbolDesc> Functions;
+  std::vector<SymbolDesc> Objects;
+  // (index, filename) pairs of ELF STT_FILE symbols.
+  std::vector<std::pair<uint32_t, StringRef>> FileSymbols;
 
   SymbolizableObjectFile(const object::ObjectFile *Obj,
                          std::unique_ptr<DIContext> DICtx,

diff  --git a/llvm/test/DebugInfo/Symbolize/ELF/symtab-file-conflict.s b/llvm/test/DebugInfo/Symbolize/ELF/symtab-file-conflict.s
new file mode 100644
index 000000000000..280175dcf753
--- /dev/null
+++ b/llvm/test/DebugInfo/Symbolize/ELF/symtab-file-conflict.s
@@ -0,0 +1,14 @@
+# REQUIRES: x86-registered-target
+## If the filename from the preceding STT_FILE does not match .debug_line,
+## STT_FILE wins. Compilers should not emit such bogus information.
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
+# RUN: llvm-symbolizer --obj=%t 0 | FileCheck %s
+
+# CHECK:       foo
+# CHECK-NEXT:  1.c:0:0
+
+.file "1.c"
+.file 0 "/tmp" "0.c"
+foo:
+  .loc 0 1 0
+  nop

diff  --git a/llvm/test/DebugInfo/Symbolize/ELF/symtab-file.s b/llvm/test/DebugInfo/Symbolize/ELF/symtab-file.s
index 04761635828b..64679a952197 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/symtab-file.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/symtab-file.s
@@ -4,15 +4,14 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
 # RUN: llvm-symbolizer --obj=%t 0 1 2 | FileCheck %s
 
-## TODO Find the preceding STT_FILE symbol as the filename of a local symbol.
 # CHECK:       local1
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  1.c:0:0
 # CHECK-EMPTY:
 # CHECK-NEXT:  local2
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  2.c:0:0
 # CHECK-EMPTY:
 # CHECK-NEXT:  local3
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  3.c:0:0
 # CHECK-EMPTY:
 
 .file "1.c"

diff  --git a/llvm/test/DebugInfo/Symbolize/ELF/symtab-file2.yaml b/llvm/test/DebugInfo/Symbolize/ELF/symtab-file2.yaml
new file mode 100644
index 000000000000..2d68eaa1e5ba
--- /dev/null
+++ b/llvm/test/DebugInfo/Symbolize/ELF/symtab-file2.yaml
@@ -0,0 +1,75 @@
+# RUN: yaml2obj --docnum=1 %s -o %t1
+# RUN: llvm-symbolizer --obj=%t1 0 1 2 | FileCheck %s
+
+## The local symbol has no preceding STT_FILE. Its filename is unavailable.
+# CHECK:       local
+# CHECK-NEXT:  ??:0:0
+# CHECK-EMPTY:
+
+## All local symbols precede all non-local symbols. When there are multiple
+## STT_FILE symbols, we cannot tell which file defines the non-local symbol in
+## question. We could tell if there is only one STT_FILE but in reality there
+## are always more than one file, so implementing the special case is not useful.
+# CHECK-NEXT:  global
+# CHECK-NEXT:  ??:0:0
+# CHECK-EMPTY:
+# CHECK-NEXT:  weak
+# CHECK-NEXT:  ??:0:0
+# CHECK-EMPTY:
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_DYN
+  Machine: EM_X86_64
+Sections:
+  - Name:  .text
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Size:  3
+Symbols:
+  - Name:    local
+    Section: .text
+    Value:   0
+  - Name:    1.c
+    Type:    STT_FILE
+    Index:   SHN_ABS
+  - Name:    global
+    Binding: STB_GLOBAL
+    Section: .text
+    Value:   1
+  - Name:    weak
+    Binding: STB_WEAK
+    Section: .text
+    Value:   2
+
+## If st_name of the STT_FILE symbols is invalid, the symbol name is lost as well.
+## TODO Keep the symbol name.
+# RUN: yaml2obj --docnum=2 %s -o %t2
+# RUN: llvm-symbolizer --obj=%t2 0 0 2>&1 | FileCheck %s --check-prefix=CHECK2
+
+# CHECK2:      error reading file: st_name (0xffff) is past the end of the string table of size
+# CHECK2-NEXT: ??
+# CHECK2-NEXT: ??:0:0
+# CHECK2-EMPTY:
+# CHECK2-NEXT: ??
+# CHECK2-NEXT: ??:0:0
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_DYN
+  Machine: EM_X86_64
+Sections:
+  - Name:  .text
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Size:  1
+Symbols:
+  - StName:  0xffff
+    Type:    STT_FILE
+    Index:   SHN_ABS
+  - Name:    local
+    Section: .text

diff  --git a/llvm/test/DebugInfo/Symbolize/ELF/symtab-ignored.s b/llvm/test/DebugInfo/Symbolize/ELF/symtab-ignored.s
index 37ece2b390a1..dc9ac2857dcd 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/symtab-ignored.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/symtab-ignored.s
@@ -4,7 +4,7 @@
 # RUN: llvm-symbolizer --obj=%t 0 | FileCheck %s
 
 # CHECK:       b
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  1.c:0:0
 # CHECK-EMPTY:
 
 .file "1.c"

diff  --git a/llvm/test/DebugInfo/Symbolize/ELF/symtab-notype.s b/llvm/test/DebugInfo/Symbolize/ELF/symtab-notype.s
index ff6727560eff..d3f1d3757083 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/symtab-notype.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/symtab-notype.s
@@ -20,21 +20,20 @@
 # CHECK-EMPTY:
 
 # CHECK-NEXT:  l_notype
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  symtab-notype.s:0:0
 # CHECK-EMPTY:
 
 ## TODO addr2line does not symbolize the last two out-of-bounds addresses.
 # CHECK-NEXT:  l_notype_nosize
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  symtab-notype.s:0:0
 # CHECK-EMPTY:
 # CHECK-NEXT:  l_notype_nosize
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  symtab-notype.s:0:0
 # CHECK-EMPTY:
 # CHECK-NEXT:  l_notype_nosize
-# CHECK-NEXT:  ??:0:0
+# CHECK-NEXT:  symtab-notype.s:0:0
 # CHECK-EMPTY:
 
-## TODO Find the preceding STT_FILE symbol as the filename of a local symbol.
 .file "symtab-notype.s"
 
 .globl _start, g_notype


        


More information about the llvm-commits mailing list