[lld] r336672 - Reduce memory usage when creating .gdb_index. NFC.

Rui Ueyama via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 10 06:49:14 PDT 2018


Author: ruiu
Date: Tue Jul 10 06:49:13 2018
New Revision: 336672

URL: http://llvm.org/viewvc/llvm-project?rev=336672&view=rev
Log:
Reduce memory usage when creating .gdb_index. NFC.

.gdb_index sections can be very large. When you are compiling
multi-gibibyte executables, they can be larger than 1 GiB. The previous
implementation of .gdb_index seems to consume too much memory.

This patch reduces memory consumption by eliminating temporary objects.
In one experiment, memory consumption of GdbIndexSection class is
reduced from 962 MiB to 228 MiB when creating a .gdb_index of 1350 MiB.

Differential Revision: https://reviews.llvm.org/D49094

Modified:
    lld/trunk/ELF/SyntheticSections.cpp
    lld/trunk/ELF/SyntheticSections.h

Modified: lld/trunk/ELF/SyntheticSections.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/SyntheticSections.cpp?rev=336672&r1=336671&r2=336672&view=diff
==============================================================================
--- lld/trunk/ELF/SyntheticSections.cpp (original)
+++ lld/trunk/ELF/SyntheticSections.cpp Tue Jul 10 06:49:13 2018
@@ -52,6 +52,7 @@ using namespace llvm::support;
 using namespace lld;
 using namespace lld::elf;
 
+using llvm::support::endian::read32le;
 using llvm::support::endian::write32le;
 using llvm::support::endian::write64le;
 
@@ -2311,19 +2312,17 @@ readAddressAreas(DWARFContext &Dwarf, In
 }
 
 static std::vector<GdbIndexChunk::NameTypeEntry>
-readPubNamesAndTypes(DWARFContext &Dwarf) {
+readPubNamesAndTypes(DWARFContext &Dwarf, uint32_t Idx) {
   StringRef Sec1 = Dwarf.getDWARFObj().getGnuPubNamesSection();
   StringRef Sec2 = Dwarf.getDWARFObj().getGnuPubTypesSection();
 
   std::vector<GdbIndexChunk::NameTypeEntry> Ret;
   for (StringRef Sec : {Sec1, Sec2}) {
     DWARFDebugPubTable Table(Sec, Config->IsLE, true);
-    for (const DWARFDebugPubTable::Set &Set : Table.getData()) {
-      for (const DWARFDebugPubTable::Entry &Ent : Set.Entries) {
-        CachedHashStringRef S(Ent.Name, computeGdbHash(Ent.Name));
-        Ret.push_back({S, Ent.Descriptor.toBits()});
-      }
-    }
+    for (const DWARFDebugPubTable::Set &Set : Table.getData())
+      for (const DWARFDebugPubTable::Entry &Ent : Set.Entries)
+        Ret.push_back({{Ent.Name, computeGdbHash(Ent.Name)},
+                       (Ent.Descriptor.toBits() << 24) | Idx});
   }
   return Ret;
 }
@@ -2337,43 +2336,6 @@ static std::vector<InputSection *> getDe
   return Ret;
 }
 
-void GdbIndexSection::fixCuIndex() {
-  uint32_t Idx = 0;
-  for (GdbIndexChunk &Chunk : Chunks) {
-    for (GdbIndexChunk::AddressEntry &Ent : Chunk.AddressAreas)
-      Ent.CuIndex += Idx;
-    Idx += Chunk.CompilationUnits.size();
-  }
-}
-
-std::vector<std::vector<uint32_t>> GdbIndexSection::createCuVectors() {
-  std::vector<std::vector<uint32_t>> Ret;
-  uint32_t Idx = 0;
-  uint32_t Off = 0;
-
-  for (GdbIndexChunk &Chunk : Chunks) {
-    for (GdbIndexChunk::NameTypeEntry &Ent : Chunk.NamesAndTypes) {
-      GdbSymbol *&Sym = Symbols[Ent.Name];
-      if (!Sym) {
-        Sym = make<GdbSymbol>(GdbSymbol{Ent.Name.hash(), Off, Ret.size()});
-        Off += Ent.Name.size() + 1;
-        Ret.push_back({});
-      }
-
-      // gcc 5.4.1 produces a buggy .debug_gnu_pubnames that contains
-      // duplicate entries, so we want to dedup them.
-      std::vector<uint32_t> &Vec = Ret[Sym->CuVectorIndex];
-      uint32_t Val = (Ent.Type << 24) | Idx;
-      if (Vec.empty() || Vec.back() != Val)
-        Vec.push_back(Val);
-    }
-    Idx += Chunk.CompilationUnits.size();
-  }
-
-  StringPoolSize = Off;
-  return Ret;
-}
-
 template <class ELFT> GdbIndexSection *elf::createGdbIndex() {
   // Gather debug info to create a .gdb_index section.
   std::vector<InputSection *> Sections = getDebugInfoSections();
@@ -2386,7 +2348,7 @@ template <class ELFT> GdbIndexSection *e
     Chunks[I].DebugInfoSec = Sections[I];
     Chunks[I].CompilationUnits = readCuList(Dwarf);
     Chunks[I].AddressAreas = readAddressAreas(Dwarf, Sections[I]);
-    Chunks[I].NamesAndTypes = readPubNamesAndTypes(Dwarf);
+    Chunks[I].NamesAndTypes = readPubNamesAndTypes(Dwarf, I);
   });
 
   // .debug_gnu_pub{names,types} are useless in executables.
@@ -2414,37 +2376,48 @@ static size_t getAddressAreaSize(ArrayRe
   return Ret;
 }
 
-std::vector<GdbSymbol *> GdbIndexSection::createGdbSymtab() {
-  uint32_t Size = NextPowerOf2(Symbols.size() * 4 / 3);
-  if (Size < 1024)
-    Size = 1024;
-
-  uint32_t Mask = Size - 1;
-  std::vector<GdbSymbol *> Ret(Size);
-
-  for (auto &KV : Symbols) {
-    GdbSymbol *Sym = KV.second;
-    uint32_t I = Sym->NameHash & Mask;
-    uint32_t Step = ((Sym->NameHash * 17) & Mask) | 1;
-
-    while (Ret[I])
-      I = (I + Step) & Mask;
-    Ret[I] = Sym;
-  }
-  return Ret;
+// Returns the desired size of an on-disk hash table for a .gdb_index section.
+// There's a tradeoff between size and collision rate. We aim 75% utilization.
+static size_t getSymtabSize(size_t NumSymbols) {
+  return std::max<size_t>(NextPowerOf2(NumSymbols * 4 / 3), 1024);
 }
 
 GdbIndexSection::GdbIndexSection(std::vector<GdbIndexChunk> &&C)
     : SyntheticSection(0, SHT_PROGBITS, 1, ".gdb_index"), Chunks(std::move(C)) {
-  fixCuIndex();
-  CuVectors = createCuVectors();
-  GdbSymtab = createGdbSymtab();
+  // A map to identify duplicate symbols.
+  DenseMap<CachedHashStringRef, size_t> Map;
+
+  // Initialize Symbols and CuVectors while deduplicating symbols by name.
+  for (GdbIndexChunk &Chunk : Chunks) {
+    for (GdbIndexChunk::NameTypeEntry &Ent : Chunk.NamesAndTypes) {
+      CachedHashStringRef S = Ent.Name;
+      size_t &Idx = Map[S];
+
+      if (!Idx) {
+        Idx = Symbols.size() + 1;
+        Symbols.push_back({S, static_cast<uint32_t>(StringPoolSize),
+                           static_cast<uint32_t>(Symbols.size())});
+        StringPoolSize += S.size() + 1;
+        CuVectors.push_back({});
+      }
+
+      // gcc 5.4.1 produces a buggy .debug_gnu_pubnames that contains
+      // duplicate entries, so we want to dedup them.
+      std::vector<uint32_t> &Vec = CuVectors[Symbols[Idx - 1].CuVectorIdx];
+      if (Vec.empty() || Vec.back() != Ent.Type)
+        Vec.push_back(Ent.Type);
+    }
+
+    // NamesAndTypes is useless beyond this point, so clear it to save memory.
+    Chunk.NamesAndTypes = {};
+  }
 
   // Compute offsets early to know the section size.
   // Each chunk size needs to be in sync with what we write in writeTo.
   CuTypesOffset = CuListOffset + getCuSize(Chunks) * 16;
   SymtabOffset = CuTypesOffset + getAddressAreaSize(Chunks) * 20;
-  ConstantPoolOffset = SymtabOffset + GdbSymtab.size() * 8;
+  SymtabSize = getSymtabSize(Symbols.size());
+  ConstantPoolOffset = SymtabOffset + SymtabSize * 8;
 
   for (ArrayRef<uint32_t> Vec : CuVectors) {
     CuVectorOffsets.push_back(CuVectorsPoolSize);
@@ -2480,25 +2453,35 @@ void GdbIndexSection::writeTo(uint8_t *B
   }
 
   // Write the address area.
-  for (GdbIndexChunk &D : Chunks) {
-    for (GdbIndexChunk::AddressEntry &E : D.AddressAreas) {
+  uint32_t Idx = 0;
+  for (GdbIndexChunk &Chunk : Chunks) {
+    for (GdbIndexChunk::AddressEntry &E : Chunk.AddressAreas) {
       uint64_t BaseAddr = E.Section->getVA(0);
       write64le(Buf, BaseAddr + E.LowAddress);
       write64le(Buf + 8, BaseAddr + E.HighAddress);
-      write32le(Buf + 16, E.CuIndex);
+      write32le(Buf + 16, E.CuIndex + Idx);
       Buf += 20;
     }
+    Idx += Chunk.CompilationUnits.size();
   }
 
-  // Write the symbol table.
-  for (GdbSymbol *Sym : GdbSymtab) {
-    if (Sym) {
-      write32le(Buf, CuVectorsPoolSize + Sym->NameOffset);
-      write32le(Buf + 4, CuVectorOffsets[Sym->CuVectorIndex]);
-    }
-    Buf += 8;
+  // Write the on-disk open-addressing hash table containing symbols.
+  for (GdbSymbol &Sym : Symbols) {
+    uint32_t Mask = SymtabSize - 1;
+    uint32_t H = Sym.Name.hash();
+    uint32_t I = H & Mask;
+    uint32_t Step = ((H * 17) & Mask) | 1;
+
+    while (read32le(Buf + I * 8))
+      I = (I + Step) & Mask;
+
+    uint8_t *P = Buf + I * 8;
+    write32le(P, CuVectorsPoolSize + Sym.OutputOff);
+    write32le(P + 4, CuVectorOffsets[Sym.CuVectorIdx]);
   }
 
+  Buf += SymtabSize * 8;
+
   // Write the CU vectors.
   for (ArrayRef<uint32_t> Vec : CuVectors) {
     write32le(Buf, Vec.size());
@@ -2510,13 +2493,8 @@ void GdbIndexSection::writeTo(uint8_t *B
   }
 
   // Write the string pool.
-  for (auto &KV : Symbols) {
-    CachedHashStringRef S = KV.first;
-    GdbSymbol *Sym = KV.second;
-    size_t Off = Sym->NameOffset;
-    memcpy(Buf + Off, S.val().data(), S.size());
-    Buf[Off + S.size()] = '\0';
-  }
+  for (GdbSymbol &Sym : Symbols)
+    memcpy(Buf + Sym.OutputOff, Sym.Name.val().data(), Sym.Name.size());
 }
 
 bool GdbIndexSection::empty() const { return !Out::DebugInfo; }

Modified: lld/trunk/ELF/SyntheticSections.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/SyntheticSections.h?rev=336672&r1=336671&r2=336672&view=diff
==============================================================================
--- lld/trunk/ELF/SyntheticSections.h (original)
+++ lld/trunk/ELF/SyntheticSections.h Tue Jul 10 06:49:13 2018
@@ -668,7 +668,7 @@ struct GdbIndexChunk {
 
   struct NameTypeEntry {
     llvm::CachedHashStringRef Name;
-    uint8_t Type;
+    uint32_t Type;
   };
 
   InputSection *DebugInfoSec;
@@ -679,9 +679,9 @@ struct GdbIndexChunk {
 
 // The symbol type for the .gdb_index section.
 struct GdbSymbol {
-  uint32_t NameHash;
-  size_t NameOffset;
-  size_t CuVectorIndex;
+  llvm::CachedHashStringRef Name;
+  uint32_t OutputOff;
+  uint32_t CuVectorIdx;
 };
 
 class GdbIndexSection final : public SyntheticSection {
@@ -692,19 +692,12 @@ public:
   bool empty() const override;
 
 private:
-  void fixCuIndex();
-  std::vector<std::vector<uint32_t>> createCuVectors();
-  std::vector<GdbSymbol *> createGdbSymtab();
-
   // A symbol table for this .gdb_index section.
-  std::vector<GdbSymbol *> GdbSymtab;
+  std::vector<GdbSymbol> Symbols;
 
   // CU vector is a part of constant pool area of section.
   std::vector<std::vector<uint32_t>> CuVectors;
 
-  // Symbol table contents.
-  llvm::DenseMap<llvm::CachedHashStringRef, GdbSymbol *> Symbols;
-
   // Each chunk contains information gathered from a debug sections of single
   // object and used to build different areas of gdb index.
   std::vector<GdbIndexChunk> Chunks;
@@ -712,12 +705,13 @@ private:
   uint64_t CuListOffset = 24;
   uint64_t CuTypesOffset;
   uint64_t SymtabOffset;
+  uint64_t SymtabSize = 0;
   uint64_t ConstantPoolOffset;
   uint64_t CuVectorsPoolSize = 0;
   uint64_t StringPoolSize;
   uint64_t TotalSize;
 
-  std::vector<size_t> CuVectorOffsets;
+  std::vector<uint32_t> CuVectorOffsets;
 };
 
 template <class ELFT> GdbIndexSection *createGdbIndex();




More information about the llvm-commits mailing list