[PATCH] D38266: Parallelize string merging.

Fri Sep 29 10:16:45 PDT 2017

I am starting another benchmark run and will report.

Note that this includes the idea of using the most significant bits of
the hash for selecting the shard.

If we then sort by hash in each shard we can get an output that doesn't
depend on the shard.

An extreme version of that would be to just use a parallel sort.

I will try to benchmark that too.

Cheers,
Rafael

Rui Ueyama via Phabricator <reviews at reviews.llvm.org> writes:

> ruiu updated this revision to Diff 117084.
> ruiu added a comment.
>
> - Fix a few minor errors.
>
>
> https://reviews.llvm.org/D38266
>
> Files:
>   lld/ELF/InputSection.h
>   lld/ELF/SyntheticSections.cpp
>   lld/ELF/SyntheticSections.h
>   lld/test/ELF/comment-gc.s
>   lld/test/ELF/compress-debug-sections.s
>   lld/test/ELF/compressed-debug-input.s
>   lld/test/ELF/debug-gc.s
>   lld/test/ELF/merge-string.s
>   lld/test/ELF/merge.s
>   lld/test/ELF/string-gc.s
>
> Index: lld/test/ELF/string-gc.s
> ===================================================================
> --- lld/test/ELF/string-gc.s
> +++ lld/test/ELF/string-gc.s
> @@ -14,16 +14,16 @@
>  // CHECK-NEXT:   }
>  // CHECK-NEXT:   Symbol {
>  // CHECK-NEXT:     Name: s3
> -// CHECK-NEXT:     Value: 0x200125
> +// CHECK-NEXT:     Value: 0x200120
>  // CHECK-NEXT:     Size: 0
>  // CHECK-NEXT:     Binding: Local (0x0)
>  // CHECK-NEXT:     Type: Object (0x1)
>  // CHECK-NEXT:     Other: 0
>  // CHECK-NEXT:     Section: .rodata (0x1)
>  // CHECK-NEXT:   }
>  // CHECK-NEXT:   Symbol {
>  // CHECK-NEXT:     Name: s1
> -// CHECK-NEXT:     Value: 0x200120
> +// CHECK-NEXT:     Value: 0x200125
>  // CHECK-NEXT:     Size: 0
>  // CHECK-NEXT:     Binding: Local (0x0)
>  // CHECK-NEXT:     Type: Object (0x1)
> Index: lld/test/ELF/merge.s
> ===================================================================
> --- lld/test/ELF/merge.s
> +++ lld/test/ELF/merge.s
> @@ -31,33 +31,33 @@
>  // CHECK-NEXT:    AddressAlignment: 4
>  // CHECK-NEXT:    EntrySize: 0
>  // CHECK-NEXT:    SectionData (
> -// CHECK-NEXT:      0000: 10000000 42000000
> +// CHECK-NEXT:      0000: 42000000 10000000
>  // CHECK-NEXT:    )
>  
>  
> -// Address of the constant 0x10 = 0x200120 = 2097440
> -// Address of the constant 0x42 = 0x200124 = 2097444
> +// Address of the constant 0x42 = 0x200120 = 2097440
> +// Address of the constant 0x10 = 0x200124 = 2097444
>  
>  // CHECK:      Symbols [
>  
>  // CHECK:        Name: bar
> -// CHECK-NEXT:   Value: 0x200124
> +// CHECK-NEXT:   Value: 0x200120
>  // CHECK-NEXT:   Size: 0
>  // CHECK-NEXT:   Binding: Loca
>  // CHECK-NEXT:   Type: None
>  // CHECK-NEXT:   Other: 0
>  // CHECK-NEXT:   Section: .mysec
>  
>  // CHECK:        Name: zed
> -// CHECK-NEXT:   Value: 0x200124
> +// CHECK-NEXT:   Value: 0x200120
>  // CHECK-NEXT:   Size: 0
>  // CHECK-NEXT:   Binding: Local
>  // CHECK-NEXT:   Type: None
>  // CHECK-NEXT:   Other: 0
>  // CHECK-NEXT:   Section: .mysec
>  
>  // CHECK:        Name: foo
> -// CHECK-NEXT:   Value: 0x200124
> +// CHECK-NEXT:   Value: 0x200120
>  // CHECK-NEXT:   Size: 0
>  // CHECK-NEXT:   Binding: Local
>  // CHECK-NEXT:   Type: None
> @@ -75,37 +75,37 @@
>  // DISASM-NEXT: _start:
>  
>          movl .mysec, %eax
> -// addr(0x10) = 2097440
> -// DISASM-NEXT:   movl    2097440, %eax
> +// addr(0x10) = 2097444
> +// DISASM-NEXT:   movl    2097444, %eax
>  
>          movl .mysec+7, %eax
> -// addr(0x42) + 3 = 2097444 + 3 = 2097447
> -// DISASM-NEXT:   movl    2097447, %eax
> +// addr(0x42) + 3 = 2097440 + 3
> +// DISASM-NEXT:   movl    2097443, %eax
>  
>          movl .mysec+8, %eax
> -// addr(0x42) = 2097444
> -// DISASM-NEXT:   movl    2097444, %eax
> +// addr(0x42) = 2097440
> +// DISASM-NEXT:   movl    2097440, %eax
>  
>          movl bar+7, %eax
> -// addr(0x42) + 7 = 2097444 + 7 = 2097451
> -// DISASM-NEXT:   movl    2097451, %eax
> +// addr(0x42) + 7 = 2097440 + 7
> +// DISASM-NEXT:   movl    2097447, %eax
>  
>          movl bar+8, %eax
> -// addr(0x42) + 8 = 2097444 + 8 = 2097452
> -// DISASM-NEXT:   movl    2097452, %eax
> +// addr(0x42) + 8 = 2097440 + 8
> +// DISASM-NEXT:   movl    2097448, %eax
>  
>          movl foo, %eax
> -// addr(0x42) = 2097444
> -// DISASM-NEXT:   movl    2097444, %eax
> +// addr(0x42) = 2097440
> +// DISASM-NEXT:   movl    2097440, %eax
>  
>          movl foo+7, %eax
> -// addr(0x42) + 7 =  = 2097444 + 7 = 2097451
> -// DISASM-NEXT:   movl    2097451, %eax
> +// addr(0x42) + 7 =  = 2097440 + 7
> +// DISASM-NEXT:   movl    2097447, %eax
>  
>          movl foo+8, %eax
> -// addr(0x42) + 8 =  = 2097444 + 8 = 2097452
> -// DISASM-NEXT:   movl    2097452, %eax
> +// addr(0x42) + 8 =  = 2097440 + 8
> +// DISASM-NEXT:   movl    2097448, %eax
>  
>  //  From the other file:  movl .mysec, %eax
> -// addr(0x42) = 2097444
> -// DISASM-NEXT:   movl    2097444, %eax
> +// addr(0x42) = 2097440
> +// DISASM-NEXT:   movl    2097440, %eax
> Index: lld/test/ELF/merge-string.s
> ===================================================================
> --- lld/test/ELF/merge-string.s
> +++ lld/test/ELF/merge-string.s
> @@ -54,7 +54,7 @@
>  // NOTAIL-NEXT: AddressAlignment: 1
>  // NOTAIL-NEXT: EntrySize: 0
>  // NOTAIL-NEXT: SectionData (
> -// NOTAIL-NEXT:   0000: 61626300 626300                     |abc.bc.|
> +// NOTAIL-NEXT:   0000: 62630061 626300                     |bc.abc.|
>  // NOTAIL-NEXT: )
>  
>  // NOMERGE:      Name:    .rodata1
> Index: lld/test/ELF/debug-gc.s
> ===================================================================
> --- lld/test/ELF/debug-gc.s
> +++ lld/test/ELF/debug-gc.s
> @@ -4,11 +4,11 @@
>  # RUN: llvm-objdump -s %t1 | FileCheck %s
>  
>  # CHECK:      Contents of section .debug_str:
> -# CHECK-NEXT:  0000 41414100 42424200 43434300           AAA.BBB.CCC.
> +# CHECK-NEXT:  0000 42424200 43434300 41414100           BBB.CCC.AAA.
>  # CHECK:      Contents of section .foo:
>  # CHECK-NEXT:  0000 2a000000
>  # CHECK:      Contents of section .debug_info:
> -# CHECK-NEXT:  0000 00000000 04000000
> +# CHECK-NEXT:  0000 08000000 00000000
>  
>  .globl _start
>  _start:
> Index: lld/test/ELF/compressed-debug-input.s
> ===================================================================
> --- lld/test/ELF/compressed-debug-input.s
> +++ lld/test/ELF/compressed-debug-input.s
> @@ -61,11 +61,11 @@
>  # DATA-NEXT:   AddressAlignment: 1
>  # DATA-NEXT:   EntrySize: 0
>  # DATA-NEXT:   SectionData (
> -# DATA-NEXT:     0000: 73686F72 7420756E 7369676E 65642069  |short unsigned i|
> -# DATA-NEXT:     0010: 6E740075 6E736967 6E656420 696E7400  |nt.unsigned int.|
> -# DATA-NEXT:     0020: 6C6F6E67 20756E73 69676E65 6420696E  |long unsigned in|
> -# DATA-NEXT:     0030: 74006368 61720075 6E736967 6E656420  |t.char.unsigned |
> -# DATA-NEXT:     0040: 63686172 00                          |char.|
> +# DATA-NEXT:     0000: 756E7369 676E6564 20636861 72006368  |unsigned char.ch|
> +# DATA-NEXT:     0010: 61720073 686F7274 20756E73 69676E65  |ar.short unsigne|
> +# DATA-NEXT:     0020: 6420696E 74006C6F 6E672075 6E736967  |d int.long unsig|
> +# DATA-NEXT:     0030: 6E656420 696E7400 756E7369 676E6564  |ned int.unsigned|
> +# DATA-NEXT:     0040: 20696E74 00                          | int.|
>  # DATA-NEXT:   )
>  # DATA-NEXT: }
>  
> Index: lld/test/ELF/compress-debug-sections.s
> ===================================================================
> --- lld/test/ELF/compress-debug-sections.s
> +++ lld/test/ELF/compress-debug-sections.s
> @@ -18,8 +18,8 @@
>  # RUN: llvm-dwarfdump %t1 -debug-str | \
>  # RUN:   FileCheck %s --check-prefix=DEBUGSTR
>  # DEBUGSTR:     .debug_str contents:
> -# DEBUGSTR-NEXT:  AAAAAAAAAAAAAAAAAAAAAAAAAAA
>  # DEBUGSTR-NEXT:  BBBBBBBBBBBBBBBBBBBBBBBBBBB
> +# DEBUGSTR-NEXT:  AAAAAAAAAAAAAAAAAAAAAAAAAAA
>  
>  ## Test alias.
>  # RUN: ld.lld %t.o -o %t2 --compress-debug-sections zlib
> Index: lld/test/ELF/comment-gc.s
> ===================================================================
> --- lld/test/ELF/comment-gc.s
> +++ lld/test/ELF/comment-gc.s
> @@ -5,8 +5,7 @@
>  # RUN: llvm-objdump -s %t1 | FileCheck %s
>  
>  # CHECK:      Contents of section .comment:
> -# CHECK-NEXT:  0000 00666f6f 00626172 004c4c44 20312e30 .foo.bar.LLD 1.0
> -# CHECK-NEXT:  0010 00 .
> +# CHECK-NEXT: .foo.LLD 1.0.bar
>  
>  .ident "foo"
>  
> Index: lld/ELF/SyntheticSections.h
> ===================================================================
> --- lld/ELF/SyntheticSections.h
> +++ lld/ELF/SyntheticSections.h
> @@ -668,33 +668,55 @@
>  class MergeSyntheticSection : public SyntheticSection {
>  public:
>    void addSection(MergeInputSection *MS);
> -  size_t getSize() const override;
> -  void writeTo(uint8_t *Buf) override;
>  
>  protected:
>    MergeSyntheticSection(StringRef Name, uint32_t Type, uint64_t Flags,
> -                        uint32_t Alignment);
> +                        uint32_t Alignment)
> +      : SyntheticSection(Flags, Type, Alignment, Name) {}
>  
>    std::vector<MergeInputSection *> Sections;
> -  llvm::StringTableBuilder Builder;
>  };
>  
>  class MergeTailSection final : public MergeSyntheticSection {
>  public:
>    MergeTailSection(StringRef Name, uint32_t Type, uint64_t Flags,
> -                   uint32_t Alignment)
> -      : MergeSyntheticSection(Name, Type, Flags, Alignment) {}
> +                   uint32_t Alignment);
>  
> +  size_t getSize() const override;
> +  void writeTo(uint8_t *Buf) override;
>    void finalizeContents() override;
> +
> +private:
> +  llvm::StringTableBuilder Builder;
>  };
>  
>  class MergeNoTailSection final : public MergeSyntheticSection {
>  public:
>    MergeNoTailSection(StringRef Name, uint32_t Type, uint64_t Flags,
>                       uint32_t Alignment)
>        : MergeSyntheticSection(Name, Type, Flags, Alignment) {}
>  
> +  size_t getSize() const override { return Size; }
> +  void writeTo(uint8_t *Buf) override;
>    void finalizeContents() override;
> +
> +private:
> +  // We use the most significant bits of a hash as a shard ID.
> +  // The reason why we don't want to use the least significant bits is
> +  // because DenseMap also uses lower bits to determine a bucket ID.
> +  // If we use lower bits, it significantly increases the probability of
> +  // hash collisons.
> +  size_t getShardId(size_t Hash) {
> +    return Hash >> (sizeof(size_t) * 8 - llvm::countTrailingZeros(NumShards));
> +  }
> +
> +  // Section size
> +  size_t Size;
> +
> +  // String table contents
> +  constexpr static size_t NumShards = 32;
> +  std::vector<llvm::StringTableBuilder> Shards;
> +  size_t ShardOffsets[NumShards];
>  };
>  
>  // .MIPS.abiflags section.
> Index: lld/ELF/SyntheticSections.cpp
> ===================================================================
> --- lld/ELF/SyntheticSections.cpp
> +++ lld/ELF/SyntheticSections.cpp
> @@ -37,6 +37,7 @@
>  #include "llvm/Support/SHA1.h"
>  #include "llvm/Support/xxhash.h"
>  #include <cstdlib>
> +#include <thread>
>  
>  using namespace llvm;
>  using namespace llvm::dwarf;
> @@ -48,6 +49,8 @@
>  using namespace lld;
>  using namespace lld::elf;
>  
> +const size_t MergeNoTailSection::NumShards;
> +
>  uint64_t SyntheticSection::getVA() const {
>    if (OutputSection *Sec = getParent())
>      return Sec->Addr + OutSecOff;
> @@ -2177,19 +2180,19 @@
>    return getNeedNum() == 0;
>  }
>  
> -MergeSyntheticSection::MergeSyntheticSection(StringRef Name, uint32_t Type,
> -                                             uint64_t Flags, uint32_t Alignment)
> -    : SyntheticSection(Flags, Type, Alignment, Name),
> -      Builder(StringTableBuilder::RAW, Alignment) {}
> -
>  void MergeSyntheticSection::addSection(MergeInputSection *MS) {
>    MS->Parent = this;
>    Sections.push_back(MS);
>  }
>  
> -size_t MergeSyntheticSection::getSize() const { return Builder.getSize(); }
> +MergeTailSection::MergeTailSection(StringRef Name, uint32_t Type,
> +                                   uint64_t Flags, uint32_t Alignment)
> +    : MergeSyntheticSection(Name, Type, Flags, Alignment),
> +      Builder(StringTableBuilder::RAW, Alignment) {}
> +
> +size_t MergeTailSection::getSize() const { return Builder.getSize(); }
>  
> -void MergeSyntheticSection::writeTo(uint8_t *Buf) { Builder.write(Buf); }
> +void MergeTailSection::writeTo(uint8_t *Buf) { Builder.write(Buf); }
>  
>  void MergeTailSection::finalizeContents() {
>    // Add all string pieces to the string table builder to create section
> @@ -2211,17 +2214,59 @@
>          Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I));
>  }
>  
> +void MergeNoTailSection::writeTo(uint8_t *Buf) {
> +  for (size_t I = 0; I < NumShards; ++I)
> +    Shards[I].write(Buf + ShardOffsets[I]);
> +}
> +
> +// This function is very hot (i.e. it can take several seconds to finish)
> +// because sometimes the number of inputs is in an order of magnitude of
> +// millions. So, we use multi-threading.
> +//
> +// For any strings S and T, we know S is not mergeable with T if S's hash
> +// value is different from T's. If that's the case, we can safely put S and
> +// T into different string builders without worrying about merge misses.
> +// We do it in parallel.
>  void MergeNoTailSection::finalizeContents() {
> -  // Add all string pieces to the string table builder to create section
> -  // contents. Because we are not tail-optimizing, offsets of strings are
> -  // fixed when they are added to the builder (string table builder contains
> -  // a hash table from strings to offsets).
> -  for (MergeInputSection *Sec : Sections)
> +  // Initializes string table builders.
> +  for (size_t I = 0; I < NumShards; ++I)
> +    Shards.emplace_back(StringTableBuilder::RAW, Alignment);
> +
> +  // Concurrency level. Must be a power of 2.
> +  size_t Parallelism =
> +      std::min(PowerOf2Floor(std::thread::hardware_concurrency()), NumShards);
> +
> +  // Add section pieces to the builders.
> +  parallelForEachN(0, Parallelism, [&](size_t ThreadId) {
> +    for (MergeInputSection *Sec : Sections) {
> +      for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
> +        if (!Sec->Pieces[I].Live)
> +          continue;
> +        size_t ShardId = getShardId(Sec->Hashes[I]);
> +        if ((ShardId & (Parallelism - 1)) == ThreadId)
> +          Sec->Pieces[I].OutputOff = Shards[ShardId].add(Sec->getData(I));
> +      }
> +    }
> +  });
> +
> +  // Compute an in-section offset for each shard.
> +  size_t Off = 0;
> +  for (size_t I = 0; I < NumShards; ++I) {
> +    Shards[I].finalizeInOrder();
> +    if (Shards[I].getSize() > 0)
> +      Off = alignTo(Off, Alignment);
> +    ShardOffsets[I] = Off;
> +    Off += Shards[I].getSize();
> +  }
> +  Size = Off;
> +
> +  // So far, section pieces have offsets from beginning of shards, but
> +  // we want offsets from beginning of the whole section. Fix them.
> +  parallelForEach(Sections, [&](MergeInputSection *Sec) {
>      for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
>        if (Sec->Pieces[I].Live)
> -        Sec->Pieces[I].OutputOff = Builder.add(Sec->getData(I));
> -
> -  Builder.finalizeInOrder();
> +        Sec->Pieces[I].OutputOff += ShardOffsets[getShardId(Sec->Hashes[I])];
> +  });
>  }
>  
>  static MergeSyntheticSection *createMergeSynthetic(StringRef Name,
> Index: lld/ELF/InputSection.h
> ===================================================================
> --- lld/ELF/InputSection.h
> +++ lld/ELF/InputSection.h
> @@ -240,20 +240,21 @@
>        End = Pieces[I + 1].InputOff;
>  
>      StringRef S = {(const char *)(this->Data.data() + Begin), End - Begin};
> -    return {S, Hashes[I]};
> +    return {S, (uint32_t)Hashes[I]};
>    }
>  
>    // Returns the SectionPiece at a given input section offset.
>    SectionPiece *getSectionPiece(uint64_t Offset);
>    const SectionPiece *getSectionPiece(uint64_t Offset) const;
>  
>    SyntheticSection *getParent() const;
>  
> +  std::vector<size_t> Hashes;
> +
>  private:
>    void splitStrings(ArrayRef<uint8_t> A, size_t Size);
>    void splitNonStrings(ArrayRef<uint8_t> A, size_t Size);
>  
> -  std::vector<uint32_t> Hashes;
>  
>    mutable llvm::DenseMap<uint64_t, uint64_t> OffsetMap;
>    mutable llvm::once_flag InitOffsetMap;