[lld] fb14129 - [ELF] --gdb-index: skip SHF_GROUP .debug_info

Thu Aug 13 09:11:19 PDT 2020

Author: Fangrui Song
Date: 2020-08-13T09:11:01-07:00
New Revision: fb141292f4411448af41fc454c07f3903acb84dd

URL: https://github.com/llvm/llvm-project/commit/fb141292f4411448af41fc454c07f3903acb84dd
DIFF: https://github.com/llvm/llvm-project/commit/fb141292f4411448af41fc454c07f3903acb84dd.diff

LOG: [ELF] --gdb-index: skip SHF_GROUP .debug_info

-gdwarf-5 -fdebug-types-section may produce multiple .debug_info sections.  All
except one are type units (.debug_types before DWARF v5). When constructing
.gdb_index, we should ignore these type units. We use a simple heuristic: the
compile unit does not have the SHF_GROUP flag. (This needs to be revisited if
people place compile unit .debug_info in COMDAT groups.)

This issue manifests as a data race: because an object file may have multiple
.debug_info sections, we may concurrently construct `LLDDwarfObj` for the same
file in multiple threads. The threads may access `InputSectionBase::data()`
concurrently on the same input section. `InputSectionBase::data()` does a lazy
uncompress() and rewrites the member variable `rawData`. A thread running zlib
`inflate()` (transitively called by uncompress()) on a buffer with `rawData`
tampered by another thread may fail with `uncompress failed: zlib error: Z_DATA_ERROR`.

Even if no data race occurred in an optimistic run, if there are N .debug_info,
one CU entry and its address ranges will be replicated N times. The result
.gdb_index can be much larger than a correct one.

The new test gdb-index-dwarf5-type-unit.s actually has two compile units. This
cannot be produced with regular approaches (it can be produced with -r
--unique). This is used to demonstrate that the .gdb_index construction code
only considers the last non-SHF_GROUP .debug_info

Reviewed By: grimar

Differential Revision: https://reviews.llvm.org/D85579

Added: 
    lld/test/ELF/gdb-index-dwarf5-type-unit.s

Modified: 
    lld/ELF/DWARF.cpp
    lld/ELF/DWARF.h
    lld/ELF/SyntheticSections.cpp

Removed: 
    


################################################################################
diff  --git a/lld/ELF/DWARF.cpp b/lld/ELF/DWARF.cpp
index 24c44730bf64..5767f6020f93 100644

--- a/lld/ELF/DWARF.cpp
+++ b/lld/ELF/DWARF.cpp
@@ -26,7 +26,12 @@ using namespace lld;
 using namespace lld::elf;
 
 template <class ELFT> LLDDwarfObj<ELFT>::LLDDwarfObj(ObjFile<ELFT> *obj) {
-  for (InputSectionBase *sec : obj->getSections()) {
+  // Get the ELF sections to retrieve sh_flags. See the SHF_GROUP comment below.
+  ArrayRef<typename ELFT::Shdr> objSections =
+      CHECK(obj->getObj().sections(), obj);
+  assert(objSections.size() == obj->getSections().size());
+  for (auto it : llvm::enumerate(obj->getSections())) {
+    InputSectionBase *sec = it.value();
     if (!sec)
       continue;
 
@@ -35,7 +40,6 @@ template <class ELFT> LLDDwarfObj<ELFT>::LLDDwarfObj(ObjFile<ELFT> *obj) {
                 .Case(".debug_addr", &addrSection)
                 .Case(".debug_gnu_pubnames", &gnuPubnamesSection)
                 .Case(".debug_gnu_pubtypes", &gnuPubtypesSection)
-                .Case(".debug_info", &infoSection)
                 .Case(".debug_loclists", &loclistsSection)
                 .Case(".debug_ranges", &rangesSection)
                 .Case(".debug_rnglists", &rnglistsSection)
@@ -53,6 +57,20 @@ template <class ELFT> LLDDwarfObj<ELFT>::LLDDwarfObj(ObjFile<ELFT> *obj) {
       strSection = toStringRef(sec->data());
     else if (sec->name == ".debug_line_str")
       lineStrSection = toStringRef(sec->data());
+    else if (sec->name == ".debug_info" &&
+             !(objSections[it.index()].sh_flags & ELF::SHF_GROUP)) {
+      // In DWARF v5, -fdebug-types-section places type units in .debug_info
+      // sections in COMDAT groups. They are not compile units and thus should
+      // be ignored for .gdb_index/diagnostics purposes.
+      //
+      // We use a simple heuristic: the compile unit does not have the SHF_GROUP
+      // flag. If we place compile units in COMDAT groups in the future, we may
+      // need to perform a lightweight parsing. We drop the SHF_GROUP flag when
+      // the InputSection was created, so we need to retrieve sh_flags from the
+      // associated ELF section header.
+      infoSection.Data = toStringRef(sec->data());
+      infoSection.sec = sec;
+    }
   }
 }
 

diff  --git a/lld/ELF/DWARF.h b/lld/ELF/DWARF.h
index a12dae6e9960..900c63de26ff 100644
--- a/lld/ELF/DWARF.h
+++ b/lld/ELF/DWARF.h
@@ -32,6 +32,10 @@ template <class ELFT> class LLDDwarfObj final : public llvm::DWARFObject {
     f(infoSection);
   }
 
+  InputSection *getInfoSection() const {
+    return cast<InputSection>(infoSection.sec);
+  }
+
   const llvm::DWARFSection &getLoclistsSection() const override {
     return loclistsSection;
   }

diff  --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 09c50c22b10c..fd3d4797bb51 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -2695,15 +2695,6 @@ void GdbIndexSection::initOutputSize() {
   }
 }
 
-static std::vector<InputSection *> getDebugInfoSections() {
-  std::vector<InputSection *> ret;
-  for (InputSectionBase *s : inputSections)
-    if (InputSection *isec = dyn_cast<InputSection>(s))
-      if (isec->name == ".debug_info")
-        ret.push_back(isec);
-  return ret;
-}
-
 static std::vector<GdbIndexSection::CuEntry> readCuList(DWARFContext &dwarf) {
   std::vector<GdbIndexSection::CuEntry> ret;
   for (std::unique_ptr<DWARFUnit> &cu : dwarf.compile_units())
@@ -2857,30 +2848,40 @@ createSymbols(ArrayRef<std::vector<GdbIndexSection::NameAttrEntry>> nameAttrs,
 
 // Returns a newly-created .gdb_index section.
 template <class ELFT> GdbIndexSection *GdbIndexSection::create() {
-  std::vector<InputSection *> sections = getDebugInfoSections();
-
-  // .debug_gnu_pub{names,types} are useless in executables.
-  // They are present in input object files solely for creating
-  // a .gdb_index. So we can remove them from the output.
-  for (InputSectionBase *s : inputSections)
+  // Collect InputFiles with .debug_info. See the comment in
+  // LLDDwarfObj<ELFT>::LLDDwarfObj. If we do lightweight parsing in the future,
+  // note that isec->data() may uncompress the full content, which should be
+  // parallelized.
+  SetVector<InputFile *> files;
+  for (InputSectionBase *s : inputSections) {
+    InputSection *isec = dyn_cast<InputSection>(s);
+    if (!isec)
+      continue;
+    // .debug_gnu_pub{names,types} are useless in executables.
+    // They are present in input object files solely for creating
+    // a .gdb_index. So we can remove them from the output.
     if (s->name == ".debug_gnu_pubnames" || s->name == ".debug_gnu_pubtypes")
       s->markDead();
+    else if (isec->name == ".debug_info")
+      files.insert(isec->file);
+  }
 
-  std::vector<GdbChunk> chunks(sections.size());
-  std::vector<std::vector<NameAttrEntry>> nameAttrs(sections.size());
+  std::vector<GdbChunk> chunks(files.size());
+  std::vector<std::vector<NameAttrEntry>> nameAttrs(files.size());
 
-  parallelForEachN(0, sections.size(), [&](size_t i) {
+  parallelForEachN(0, files.size(), [&](size_t i) {
     // To keep memory usage low, we don't want to keep cached DWARFContext, so
     // avoid getDwarf() here.
-    ObjFile<ELFT> *file = sections[i]->getFile<ELFT>();
+    ObjFile<ELFT> *file = cast<ObjFile<ELFT>>(files[i]);
     DWARFContext dwarf(std::make_unique<LLDDwarfObj<ELFT>>(file));
+    auto &dobj = static_cast<const LLDDwarfObj<ELFT> &>(dwarf.getDWARFObj());
 
-    chunks[i].sec = sections[i];
+    // If the are multiple compile units .debug_info (very rare ld -r --unique),
+    // this only picks the last one. Other address ranges are lost.
+    chunks[i].sec = dobj.getInfoSection();
     chunks[i].compilationUnits = readCuList(dwarf);
-    chunks[i].addressAreas = readAddressAreas(dwarf, sections[i]);
-    nameAttrs[i] = readPubNamesAndTypes<ELFT>(
-        static_cast<const LLDDwarfObj<ELFT> &>(dwarf.getDWARFObj()),
-        chunks[i].compilationUnits);
+    chunks[i].addressAreas = readAddressAreas(dwarf, chunks[i].sec);
+    nameAttrs[i] = readPubNamesAndTypes<ELFT>(dobj, chunks[i].compilationUnits);
   });
 
   auto *ret = make<GdbIndexSection>();

diff  --git a/lld/test/ELF/gdb-index-dwarf5-type-unit.s b/lld/test/ELF/gdb-index-dwarf5-type-unit.s
new file mode 100644
index 000000000000..5cd6778fe7e4
--- /dev/null
+++ b/lld/test/ELF/gdb-index-dwarf5-type-unit.s
@@ -0,0 +1,93 @@
+# REQUIRES: x86, zlib
+## -gdwarf-5 -fdebug-types-section may produce multiple .debug_info sections.
+## All except one are type units. Test we can locate the compile unit, add it to
+## the index, and not erroneously duplicate it (which would happen if we
+## consider every .debug_info a compile unit).
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld --gdb-index -Ttext=0x1000 %t.o -o %t
+# RUN: llvm-dwarfdump --gdb-index %t | FileCheck %s
+
+## Test we don't uncompress a section while another thread is concurrently
+## accessing it. This would be detected by tsan as a data race.
+# RUN: llvm-objcopy --compress-debug-sections %t.o
+# RUN: ld.lld --gdb-index -Ttext=0x1000 %t.o -o %t1
+# RUN: llvm-dwarfdump --gdb-index %t1 | FileCheck %s
+
+## In this test, there are actually two compile unit .debug_info (very uncommon;
+## -r --unique). Currently we only handle the last compile unit.
+# CHECK:      CU list offset = 0x18, has 1 entries:
+# CHECK-NEXT:   0: Offset = 0x32, Length = 0x19
+
+# CHECK:      Address area offset = 0x28, has 1 entries:
+# CHECK-NEXT:   Low/High address = [0x1001, 0x1002) (Size: 0x1), CU id = 0
+
+.Lfunc_begin0:
+  ret
+.Lfunc_end0:
+.Lfunc_begin1:
+  ret
+.Lfunc_end1:
+
+.section  .debug_abbrev,"", at progbits
+  .byte  1                         # Abbreviation Code
+  .byte  65                        # DW_TAG_type_unit
+  .byte  0                         # DW_CHILDREN_no
+  .byte  0                         # EOM(1)
+  .byte  0                         # EOM(2)
+
+  .byte  2                         # Abbreviation Code
+  .byte  17                        # DW_TAG_compile_unit
+  .byte  0                         # DW_CHILDREN_no
+  .byte  17                        # DW_AT_low_pc
+  .byte  1                         # DW_FORM_addr
+  .byte  18                        # DW_AT_high_pc
+  .byte  6                         # DW_FORM_data4
+  .byte  0                         # EOM(1)
+  .byte  0                         # EOM(2)
+
+  .byte  0                         # EOM(3)
+
+.macro TYPE_UNIT id signature
+.section  .debug_info,"G", at progbits,\signature
+  .long  .Ldebug_info_end\id-.Ldebug_info_start\id # Length of Unit
+.Ldebug_info_start\id:
+  .short 5                         # DWARF version number
+  .byte  2                         # DWARF Unit Type
+  .byte  8                         # Address Size
+  .long  .debug_abbrev             # Offset Into Abbrev. Section
+  .quad  \signature                # Type Signature
+  .long  .Ldebug_info_end\id       # Type DIE Offset
+  .byte  1                         # Abbrev [1] DW_TAG_type_unit
+.Ldebug_info_end\id:
+.endm
+
+## We place compile units between two type units (rare). A naive approach will
+## take either the first or the last .debug_info
+TYPE_UNIT 0, 123
+
+.section  .debug_info,"", at progbits,unique,0
+.Lcu_begin0:
+  .long .Lcu_end0-.Lcu_begin0-4    # Length of Unit
+  .short 5                         # DWARF version number
+  .byte  1                         # DWARF Unit Type
+  .byte  8                         # Address Size
+  .long  .debug_abbrev             # Offset Into Abbrev. Section
+  .byte  2                         # Abbrev [2] DW_TAG_compile_unit
+  .quad  .Lfunc_begin0             # DW_AT_low_pc
+  .long  .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+.Lcu_end0:
+
+.section  .debug_info,"", at progbits,unique,1
+.Lcu_begin1:
+  .long .Lcu_end1-.Lcu_begin1-4    # Length of Unit
+  .short 5                         # DWARF version number
+  .byte  1                         # DWARF Unit Type
+  .byte  8                         # Address Size
+  .long  .debug_abbrev             # Offset Into Abbrev. Section
+  .byte  2                         # Abbrev [2] DW_TAG_compile_unit
+  .quad  .Lfunc_begin1             # DW_AT_low_pc
+  .long  .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+.Lcu_end1:
+
+TYPE_UNIT 1, 456