[lld] r327668 - COFF: Implement string tail merging.

Peter Collingbourne via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 15 14:14:02 PDT 2018


Author: pcc
Date: Thu Mar 15 14:14:02 2018
New Revision: 327668

URL: http://llvm.org/viewvc/llvm-project?rev=327668&view=rev
Log:
COFF: Implement string tail merging.

In COFF, duplicate string literals are merged by placing them in a
comdat whose leader symbol name contains a specific prefix followed
by the hash and partial contents of the string literal. This gives
us an easy way to identify sections containing string literals in
the linker: check for leader symbol names with the given prefix.

Any sections that are identified in this way as containing string
literals may be tail merged. We do so using the StringTableBuilder
class, which is also used to tail merge string literals in the ELF
linker. Tail merging is enabled only if ICF is enabled, as this
provides a signal as to whether the user cares about binary size.

Differential Revision: https://reviews.llvm.org/D44504

Added:
    lld/trunk/test/COFF/string-tail-merge.s
Modified:
    lld/trunk/COFF/Chunks.cpp
    lld/trunk/COFF/Chunks.h
    lld/trunk/COFF/ICF.cpp
    lld/trunk/COFF/InputFiles.cpp
    lld/trunk/COFF/InputFiles.h
    lld/trunk/COFF/Writer.cpp

Modified: lld/trunk/COFF/Chunks.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/COFF/Chunks.cpp?rev=327668&r1=327667&r2=327668&view=diff
==============================================================================
--- lld/trunk/COFF/Chunks.cpp (original)
+++ lld/trunk/COFF/Chunks.cpp Thu Mar 15 14:14:02 2018
@@ -571,5 +571,47 @@ uint8_t Baserel::getDefaultType() {
   }
 }
 
+std::map<uint32_t, MergeChunk *> MergeChunk::Instances;
+
+MergeChunk::MergeChunk(uint32_t Alignment)
+    : Builder(StringTableBuilder::RAW, Alignment) {
+  this->Alignment = Alignment;
+}
+
+void MergeChunk::addSection(SectionChunk *C) {
+  auto *&MC = Instances[C->Alignment];
+  if (!MC)
+    MC = make<MergeChunk>(C->Alignment);
+  MC->Sections.push_back(C);
+}
+
+void MergeChunk::finalizeContents() {
+  for (SectionChunk *C : Sections)
+    if (C->isLive())
+      Builder.add(toStringRef(C->getContents()));
+  Builder.finalize();
+
+  for (SectionChunk *C : Sections) {
+    if (!C->isLive())
+      continue;
+    size_t Off = Builder.getOffset(toStringRef(C->getContents()));
+    C->setOutputSection(Out);
+    C->setRVA(RVA + Off);
+    C->OutputSectionOff = OutputSectionOff + Off;
+  }
+}
+
+uint32_t MergeChunk::getPermissions() const {
+  return IMAGE_SCN_MEM_READ | IMAGE_SCN_CNT_INITIALIZED_DATA;
+}
+
+size_t MergeChunk::getSize() const {
+  return Builder.getSize();
+}
+
+void MergeChunk::writeTo(uint8_t *Buf) const {
+  Builder.write(Buf + OutputSectionOff);
+}
+
 } // namespace coff
 } // namespace lld

Modified: lld/trunk/COFF/Chunks.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/COFF/Chunks.h?rev=327668&r1=327667&r2=327668&view=diff
==============================================================================
--- lld/trunk/COFF/Chunks.h (original)
+++ lld/trunk/COFF/Chunks.h Thu Mar 15 14:14:02 2018
@@ -16,6 +16,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Object/COFF.h"
 #include <utility>
 #include <vector>
@@ -60,6 +61,10 @@ public:
   // before calling this function.
   virtual void writeTo(uint8_t *Buf) const {}
 
+  // Called by the writer after an RVA is assigned, but before calling
+  // getSize().
+  virtual void finalizeContents() {}
+
   // The writer sets and uses the addresses.
   uint64_t getRVA() const { return RVA; }
   void setRVA(uint64_t V) { RVA = V; }
@@ -222,6 +227,33 @@ private:
   uint32_t Class[2] = {0, 0};
 };
 
+// This class is used to implement an lld-specific feature (not implemented in
+// MSVC) that minimizes the output size by finding string literals sharing tail
+// parts and merging them.
+//
+// If string tail merging is enabled and a section is identified as containing a
+// string literal, it is added to a MergeChunk with an appropriate alignment.
+// The MergeChunk then tail merges the strings using the StringTableBuilder
+// class and assigns RVAs and section offsets to each of the member chunks based
+// on the offsets assigned by the StringTableBuilder.
+class MergeChunk : public Chunk {
+public:
+  MergeChunk(uint32_t Alignment);
+  static void addSection(SectionChunk *C);
+  void finalizeContents() override;
+
+  uint32_t getPermissions() const override;
+  StringRef getSectionName() const override { return ".rdata"; }
+  size_t getSize() const override;
+  void writeTo(uint8_t *Buf) const override;
+
+  static std::map<uint32_t, MergeChunk *> Instances;
+  std::vector<SectionChunk *> Sections;
+
+private:
+  llvm::StringTableBuilder Builder;
+};
+
 // A chunk for common symbols. Common chunks don't have actual data.
 class CommonChunk : public Chunk {
 public:

Modified: lld/trunk/COFF/ICF.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/COFF/ICF.cpp?rev=327668&r1=327667&r2=327668&view=diff
==============================================================================
--- lld/trunk/COFF/ICF.cpp (original)
+++ lld/trunk/COFF/ICF.cpp Thu Mar 15 14:14:02 2018
@@ -224,6 +224,12 @@ void ICF::run(ArrayRef<Chunk *> Vec) {
     }
   }
 
+  // Make sure that ICF doesn't merge sections that are being handled by string
+  // tail merging.
+  for (auto &P : MergeChunk::Instances)
+    for (SectionChunk *SC : P.second->Sections)
+      SC->Class[0] = NextId++;
+
   // Initially, we use hash values to partition sections.
   for_each(parallel::par, Chunks.begin(), Chunks.end(), [&](SectionChunk *SC) {
     // Set MSB to 1 to avoid collisions with non-hash classs.

Modified: lld/trunk/COFF/InputFiles.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/COFF/InputFiles.cpp?rev=327668&r1=327667&r2=327668&view=diff
==============================================================================
--- lld/trunk/COFF/InputFiles.cpp (original)
+++ lld/trunk/COFF/InputFiles.cpp Thu Mar 15 14:14:02 2018
@@ -138,12 +138,13 @@ void ObjFile::initializeChunks() {
     if (Sec->Characteristics & IMAGE_SCN_LNK_COMDAT)
       SparseChunks[I] = PendingComdat;
     else
-      SparseChunks[I] = readSection(I, nullptr);
+      SparseChunks[I] = readSection(I, nullptr, "");
   }
 }
 
 SectionChunk *ObjFile::readSection(uint32_t SectionNumber,
-                                   const coff_aux_section_definition *Def) {
+                                   const coff_aux_section_definition *Def,
+                                   StringRef LeaderName) {
   const coff_section *Sec;
   StringRef Name;
   if (auto EC = COFFObj->getSection(SectionNumber, Sec))
@@ -189,6 +190,12 @@ SectionChunk *ObjFile::readSection(uint3
     GuardLJmpChunks.push_back(C);
   else if (Name == ".sxdata")
     SXDataChunks.push_back(C);
+  else if (Config->DoICF && Sec->NumberOfRelocations == 0 && Name == ".rdata" &&
+           LeaderName.startswith("??_C@"))
+    // COFF sections that look like string literal sections (i.e. no
+    // relocations, in .rdata, leader symbol name matches the MSVC name mangling
+    // for string literals) are subject to string tail merging.
+    MergeChunk::addSection(C);
   else
     Chunks.push_back(C);
 
@@ -209,7 +216,7 @@ void ObjFile::readAssociativeDefinition(
   // the section; otherwise mark it as discarded.
   int32_t SectionNumber = Sym.getSectionNumber();
   if (Parent) {
-    SparseChunks[SectionNumber] = readSection(SectionNumber, Def);
+    SparseChunks[SectionNumber] = readSection(SectionNumber, Def, "");
     if (SparseChunks[SectionNumber])
       Parent->addAssociative(SparseChunks[SectionNumber]);
   } else {
@@ -343,7 +350,7 @@ Optional<Symbol *> ObjFile::createDefine
       Prevailing = true;
     }
     if (Prevailing) {
-      SectionChunk *C = readSection(SectionNumber, Def);
+      SectionChunk *C = readSection(SectionNumber, Def, Name);
       SparseChunks[SectionNumber] = C;
       C->Sym = cast<DefinedRegular>(Leader);
       cast<DefinedRegular>(Leader)->Data = &C->Repl;

Modified: lld/trunk/COFF/InputFiles.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/COFF/InputFiles.h?rev=327668&r1=327667&r2=327668&view=diff
==============================================================================
--- lld/trunk/COFF/InputFiles.h (original)
+++ lld/trunk/COFF/InputFiles.h Thu Mar 15 14:14:02 2018
@@ -150,7 +150,8 @@ private:
 
   SectionChunk *
   readSection(uint32_t SectionNumber,
-              const llvm::object::coff_aux_section_definition *Def);
+              const llvm::object::coff_aux_section_definition *Def,
+              StringRef LeaderName);
 
   void readAssociativeDefinition(
       COFFSymbolRef COFFSym,

Modified: lld/trunk/COFF/Writer.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/COFF/Writer.cpp?rev=327668&r1=327667&r2=327668&view=diff
==============================================================================
--- lld/trunk/COFF/Writer.cpp (original)
+++ lld/trunk/COFF/Writer.cpp Thu Mar 15 14:14:02 2018
@@ -426,6 +426,9 @@ void Writer::createSections() {
 void Writer::createMiscChunks() {
   OutputSection *RData = createSection(".rdata");
 
+  for (auto &P : MergeChunk::Instances)
+    RData->addChunk(P.second);
+
   // Create thunks for locally-dllimported symbols.
   if (!Symtab->LocalImportChunks.empty()) {
     for (Chunk *C : Symtab->LocalImportChunks)
@@ -665,6 +668,7 @@ void Writer::assignAddresses() {
       VirtualSize = alignTo(VirtualSize, C->Alignment);
       C->setRVA(RVA + VirtualSize);
       C->OutputSectionOff = VirtualSize;
+      C->finalizeContents();
       VirtualSize += C->getSize();
       if (C->hasData())
         RawSize = alignTo(VirtualSize, SectorSize);

Added: lld/trunk/test/COFF/string-tail-merge.s
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/test/COFF/string-tail-merge.s?rev=327668&view=auto
==============================================================================
--- lld/trunk/test/COFF/string-tail-merge.s (added)
+++ lld/trunk/test/COFF/string-tail-merge.s Thu Mar 15 14:14:02 2018
@@ -0,0 +1,87 @@
+# REQUIRES: x86
+# RUN: llvm-mc -triple=x86_64-windows-msvc -filetype=obj -o %t.obj %s
+# RUN: lld-link %t.obj /out:%t.exe /entry:main /subsystem:console
+# RUN: llvm-objdump -s %t.exe | FileCheck %s
+
+# CHECK: Contents of section .rdata:
+# CHECK-NEXT:  140002000 68656c6c 6f20776f 726c6400 6fa26ca4  hello world.o.l.
+# CHECK-NEXT:  140002010 0068656c 6c6f2077 6f726c64 00006865  .hello world..he
+# CHECK-NEXT:  140002020 6c6c6f20 776f726c 64006800 65006c00  llo world.h.e.l.
+# CHECK-NEXT:  140002030 6c006f00 20007700 6f007200 6c006400  l.o. .w.o.r.l.d.
+# CHECK-NEXT:  140002040 0000                                 ..
+
+# CHECK: Contents of section .text:
+.globl main
+main:
+# CHECK-NEXT: 140003000 11200040 01000000 17200040 01000000
+.8byte "??_C@_0M@LACCCNMM@hello?5world?$AA@"
+.8byte "??_C@_05MCBCHHEJ@world?$AA@"
+# CHECK-NEXT: 140003010 2a200040 01000000 36200040 01000000
+.8byte "??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
+.8byte "??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
+# CHECK-NEXT: 140003020 00200040 01000000 0c200040 01000000
+.8byte "??_D@not_a_string_literal"
+.8byte "??_C@string_literal_with_relocs"
+# CHECK-NEXT: 140003030 00100040 01000000 1e200040 01000000
+.8byte "??_C@string_literal_in_wrong_section"
+.8byte "??_C@overaligned_string_literal"
+
+.section .rdata,"dr",discard,"??_C@_0M@LACCCNMM@hello?5world?$AA@"
+.globl "??_C@_0M@LACCCNMM@hello?5world?$AA@"
+"??_C@_0M@LACCCNMM@hello?5world?$AA@":
+.asciz "hello world"
+
+.section .rdata,"dr",discard,"??_C@_05MCBCHHEJ@world?$AA@"
+.globl "??_C@_05MCBCHHEJ@world?$AA@"
+"??_C@_05MCBCHHEJ@world?$AA@":
+.asciz "world"
+
+.section .rdata,"dr",discard,"??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
+.globl "??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
+.p2align 1
+"??_C@_1BI@HHJHKLLN@?$AAh?$AAe?$AAl?$AAl?$AAo?$AA?5?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@":
+.short 104
+.short 101
+.short 108
+.short 108
+.short 111
+.short 32
+.short 119
+.short 111
+.short 114
+.short 108
+.short 100
+.short 0
+
+.section .rdata,"dr",discard,"??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
+.globl "??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@"
+.p2align 1
+"??_C@_1M@NBBDDHIO@?$AAw?$AAo?$AAr?$AAl?$AAd?$AA?$AA@":
+.short 119
+.short 111
+.short 114
+.short 108
+.short 100
+.short 0
+
+.section .data,"drw",discard,"??_C@string_literal_in_wrong_section"
+.globl "??_C@string_literal_in_wrong_section"
+"??_C@string_literal_in_wrong_section":
+.asciz "hello world"
+
+.section .rdata,"dr",discard,"??_D@not_a_string_literal"
+.globl "??_D@not_a_string_literal"
+"??_D@not_a_string_literal":
+.asciz "hello world"
+
+.section .rdata,"dr",discard,"??_C@string_literal_with_relocs"
+.globl "??_C@string_literal_with_relocs"
+"??_C@string_literal_with_relocs":
+.4byte main + 111 + (114 << 8) + (108 << 16) + (100 << 24) # main + "orld"
+.byte 0
+
+.section .rdata,"dr",discard,"??_C@overaligned_string_literal"
+.globl "??_C@overaligned_string_literal"
+.p2align 1
+"??_C@overaligned_string_literal":
+.asciz "hello world"




More information about the llvm-commits mailing list