[lld] r296570 - Partially rewrite .gnu.hash synthetic section and add comments.

Tue Feb 28 18:51:42 PST 2017

Author: ruiu
Date: Tue Feb 28 20:51:42 2017
New Revision: 296570

URL: http://llvm.org/viewvc/llvm-project?rev=296570&view=rev
Log:
Partially rewrite .gnu.hash synthetic section and add comments.

This implementation is probably slightly inefficient than before,
but that should be negligible because this is not a performance-
critical pass.

Modified:
    lld/trunk/ELF/SyntheticSections.cpp
    lld/trunk/ELF/SyntheticSections.h

Modified: lld/trunk/ELF/SyntheticSections.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/SyntheticSections.cpp?rev=296570&r1=296569&r2=296570&view=diff
==============================================================================

--- lld/trunk/ELF/SyntheticSections.cpp (original)
+++ lld/trunk/ELF/SyntheticSections.cpp Tue Feb 28 20:51:42 2017
@@ -1450,87 +1450,49 @@ GnuHashTableSection<ELFT>::GnuHashTableS
   this->Entsize = ELFT::Is64Bits ? 0 : 4;
 }
 
-template <class ELFT>
-unsigned GnuHashTableSection<ELFT>::calcNBuckets(unsigned NumHashed) {
-  if (!NumHashed)
-    return 0;
-
-  // These values are prime numbers which are not greater than 2^(N-1) + 1.
-  // In result, for any particular NumHashed we return a prime number
-  // which is not greater than NumHashed.
-  static const unsigned Primes[] = {
-      1,   1,    3,    3,    7,    13,    31,    61,    127,   251,
-      509, 1021, 2039, 4093, 8191, 16381, 32749, 65521, 131071};
-
-  return Primes[std::min<unsigned>(Log2_32_Ceil(NumHashed),
-                                   array_lengthof(Primes) - 1)];
-}
-
-// Bloom filter estimation: at least 8 bits for each hashed symbol.
-// GNU Hash table requirement: it should be a power of 2,
-//   the minimum value is 1, even for an empty table.
-// Expected results for a 32-bit target:
-//   calcMaskWords(0..4)   = 1
-//   calcMaskWords(5..8)   = 2
-//   calcMaskWords(9..16)  = 4
-// For a 64-bit target:
-//   calcMaskWords(0..8)   = 1
-//   calcMaskWords(9..16)  = 2
-//   calcMaskWords(17..32) = 4
-template <class ELFT>
-unsigned GnuHashTableSection<ELFT>::calcMaskWords(unsigned NumHashed) {
-  if (!NumHashed)
-    return 1;
-  return NextPowerOf2((NumHashed - 1) / sizeof(uintX_t));
-}
-
 template <class ELFT> void GnuHashTableSection<ELFT>::finalizeContents() {
-  unsigned NumHashed = Symbols.size();
-  NBuckets = calcNBuckets(NumHashed);
-  MaskWords = calcMaskWords(NumHashed);
-  // Second hash shift estimation: just predefined values.
-  Shift2 = ELFT::Is64Bits ? 6 : 5;
-
   this->OutSec->Entsize = this->Entsize;
   this->OutSec->Link = In<ELFT>::DynSymTab->OutSec->SectionIndex;
-  this->Size = sizeof(uint32_t) * 4            // Header
-               + sizeof(uintX_t) * MaskWords   // Bloom Filter
-               + sizeof(uint32_t) * NBuckets   // Hash Buckets
-               + sizeof(uint32_t) * NumHashed; // Hash Values
-}
 
-template <class ELFT> void GnuHashTableSection<ELFT>::writeTo(uint8_t *Buf) {
-  Buf = writeHeader(Buf);
+  // Computes bloom filter size in word size. We want to allocate 8
+  // bits for each symbol. It must be a power of two.
   if (Symbols.empty())
-    return;
-  Buf = writeBloomFilter(Buf);
-  writeHashTable(Buf);
+    MaskWords = 1;
+  else
+    MaskWords = NextPowerOf2((Symbols.size() - 1) / sizeof(uintX_t));
+
+  Size = 16;                           // Header
+  Size += sizeof(uintX_t) * MaskWords; // Bloom filter
+  Size += NBuckets * 4;                // Hash buckets
+  Size += Symbols.size() * 4;          // Hash values
 }
 
-template <class ELFT>
-uint8_t *GnuHashTableSection<ELFT>::writeHeader(uint8_t *Buf) {
+template <class ELFT> void GnuHashTableSection<ELFT>::writeTo(uint8_t *Buf) {
+  // Write a header.
   const endianness E = ELFT::TargetEndianness;
   write32<E>(Buf, NBuckets);
   write32<E>(Buf + 4, In<ELFT>::DynSymTab->getNumSymbols() - Symbols.size());
   write32<E>(Buf + 8, MaskWords);
-  write32<E>(Buf + 12, Shift2);
-  return Buf + 16;
+  write32<E>(Buf + 12, getShift2());
+  Buf += 16;
+
+  writeBloomFilter(Buf);
+  Buf += sizeof(uintX_t) * MaskWords;
+
+  writeHashTable(Buf);
 }
 
 template <class ELFT>
-uint8_t *GnuHashTableSection<ELFT>::writeBloomFilter(uint8_t *Buf) {
+void GnuHashTableSection<ELFT>::writeBloomFilter(uint8_t *Buf) {
   typedef typename ELFT::Off Elf_Off;
-
   const unsigned C = sizeof(uintX_t) * 8;
 
-  auto *Masks = reinterpret_cast<Elf_Off *>(Buf);
-  for (const SymbolData &Sym : Symbols) {
-    size_t Pos = (Sym.Hash / C) & (MaskWords - 1);
-    uintX_t V = (uintX_t(1) << (Sym.Hash % C)) |
-                (uintX_t(1) << ((Sym.Hash >> Shift2) % C));
-    Masks[Pos] |= V;
+  auto *Filter = reinterpret_cast<Elf_Off *>(Buf);
+  for (const Entry &Sym : Symbols) {
+    size_t I = (Sym.Hash / C) & (MaskWords - 1);
+    Filter[I] |= uintX_t(1) << (Sym.Hash % C);
+    Filter[I] |= uintX_t(1) << ((Sym.Hash >> getShift2()) % C);
   }
-  return Buf + sizeof(uintX_t) * MaskWords;
 }
 
 template <class ELFT>
@@ -1538,25 +1500,30 @@ void GnuHashTableSection<ELFT>::writeHas
   // A 32-bit integer type in the target endianness.
   typedef typename ELFT::Word Elf_Word;
 
+  // Group symbols by hash value.
+  std::vector<std::vector<Entry>> Syms(NBuckets);
+  for (const Entry &Ent : Symbols)
+    Syms[Ent.Hash % NBuckets].push_back(Ent);
+
+  // Write hash buckets. Hash buckets contain indices in the following
+  // hash value table.
   Elf_Word *Buckets = reinterpret_cast<Elf_Word *>(Buf);
+  for (size_t I = 0; I < NBuckets; ++I)
+    if (!Syms[I].empty())
+      Buckets[I] = Syms[I][0].Body->DynsymIndex;
+
+  // Write a hash value table. It represents a sequence of chains that
+  // share the same hash modulo value. The last element of each chain
+  // is terminated by LSB 1.
   Elf_Word *Values = Buckets + NBuckets;
-
-  int PrevBucket = -1;
-  int I = 0;
-  for (const SymbolData &Sym : Symbols) {
-    int Bucket = Sym.Hash % NBuckets;
-    assert(PrevBucket <= Bucket);
-    if (Bucket != PrevBucket) {
-      Buckets[Bucket] = Sym.Body->DynsymIndex;
-      PrevBucket = Bucket;
-      if (I > 0)
-        Values[I - 1] |= 1;
-    }
-    Values[I] = Sym.Hash & ~1;
-    ++I;
+  size_t I = 0;
+  for (std::vector<Entry> &Vec : Syms) {
+    if (Vec.empty())
+      continue;
+    for (const Entry &Ent : makeArrayRef(Vec).drop_back())
+      Values[I++] = Ent.Hash & ~1;
+    Values[I++] = Vec.back().Hash | 1;
   }
-  if (I > 0)
-    Values[I - 1] |= 1;
 }
 
 static uint32_t hashGnu(StringRef Name) {
@@ -1566,34 +1533,50 @@ static uint32_t hashGnu(StringRef Name)
   return H;
 }
 
+// Returns a number of hash buckets to accomodate given number of elements.
+// We want to choose a moderate number that is not too small (which
+// causes too many hash collisions) and not too large (which wastes
+// disk space.)
+//
+// We return a prime number because it (is believed to) achieve good
+// hash distribution.
+static size_t getBucketSize(size_t NumSymbols) {
+  // List of largest prime numbers that are not greater than 2^n + 1.
+  for (size_t N : {131071, 65521, 32749, 16381, 8191, 4093, 2039, 1021, 509,
+                   251, 127, 61, 31, 13, 7, 3, 1})
+    if (N <= NumSymbols)
+      return N;
+  return 0;
+}
+
 // Add symbols to this symbol hash table. Note that this function
 // destructively sort a given vector -- which is needed because
 // GNU-style hash table places some sorting requirements.
 template <class ELFT>
 void GnuHashTableSection<ELFT>::addSymbols(std::vector<SymbolTableEntry> &V) {
-  // Ideally this will just be 'auto' but GCC 6.1 is not able
-  // to deduce it correctly.
+  // We cannot use 'auto' for Mid because GCC 6.1 cannot deduce
+  // its type correctly.
   std::vector<SymbolTableEntry>::iterator Mid =
       std::stable_partition(V.begin(), V.end(), [](const SymbolTableEntry &S) {
         return S.Symbol->isUndefined();
       });
   if (Mid == V.end())
     return;
-  for (auto I = Mid, E = V.end(); I != E; ++I) {
-    SymbolBody *B = I->Symbol;
-    size_t StrOff = I->StrTabOffset;
-    Symbols.push_back({B, StrOff, hashGnu(B->getName())});
+
+  for (SymbolTableEntry &Ent : llvm::make_range(Mid, V.end())) {
+    SymbolBody *B = Ent.Symbol;
+    Symbols.push_back({B, Ent.StrTabOffset, hashGnu(B->getName())});
   }
 
-  unsigned NBuckets = calcNBuckets(Symbols.size());
+  NBuckets = getBucketSize(Symbols.size());
   std::stable_sort(Symbols.begin(), Symbols.end(),
-                   [&](const SymbolData &L, const SymbolData &R) {
+                   [&](const Entry &L, const Entry &R) {
                      return L.Hash % NBuckets < R.Hash % NBuckets;
                    });
 
   V.erase(Mid, V.end());
-  for (const SymbolData &Sym : Symbols)
-    V.push_back({Sym.Body, Sym.STName});
+  for (const Entry &Ent : Symbols)
+    V.push_back({Ent.Body, Ent.StrTabOffset});
 }
 
 template <class ELFT>

Modified: lld/trunk/ELF/SyntheticSections.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/SyntheticSections.h?rev=296570&r1=296569&r2=296570&view=diff
==============================================================================
--- lld/trunk/ELF/SyntheticSections.h (original)
+++ lld/trunk/ELF/SyntheticSections.h Tue Feb 28 20:51:42 2017
@@ -445,25 +445,21 @@ public:
   void addSymbols(std::vector<SymbolTableEntry> &Symbols);
 
 private:
-  static unsigned calcNBuckets(unsigned NumHashed);
-  static unsigned calcMaskWords(unsigned NumHashed);
+  size_t getShift2() const { return ELFT::Is64Bits ? 6 : 5; }
 
-  uint8_t *writeHeader(uint8_t *Buf);
-  uint8_t *writeBloomFilter(uint8_t *Buf);
+  void writeBloomFilter(uint8_t *Buf);
   void writeHashTable(uint8_t *Buf);
 
-  struct SymbolData {
+  struct Entry {
     SymbolBody *Body;
-    size_t STName;
+    size_t StrTabOffset;
     uint32_t Hash;
   };
 
-  std::vector<SymbolData> Symbols;
-
-  unsigned MaskWords;
-  unsigned NBuckets;
-  unsigned Shift2;
-  uintX_t Size = 0;
+  std::vector<Entry> Symbols;
+  size_t MaskWords;
+  size_t NBuckets;
+  size_t Size = 0;
 };
 
 template <class ELFT> class HashTableSection final : public SyntheticSection {