[llvm] 3b3e28a - [PDB] Optimize public symbol processing

Fri May 8 10:23:34 PDT 2020

Author: Reid Kleckner
Date: 2020-05-08T10:23:27-07:00
New Revision: 3b3e28a07cf5482db27581ecf23f6364b912f733

URL: https://github.com/llvm/llvm-project/commit/3b3e28a07cf5482db27581ecf23f6364b912f733
DIFF: https://github.com/llvm/llvm-project/commit/3b3e28a07cf5482db27581ecf23f6364b912f733.diff

LOG: [PDB] Optimize public symbol processing

Reduces time to link PGO instrumented net_unittets.exe by 11% (9.766s ->
8.672s, best of three). Reduces peak memory by 65.7MB (2142.71MB ->
2076.95MB).

Use a more compact struct, BulkPublic, for faster sorting. Sort in
parallel. Construct the hash buckets in parallel. Try to use one vector
to hold all the publics instead of copying them from one to another.
Allocate all the memory needed to serialize publics up front, and then
serialize them in place in parallel.

Reviewed By: aganea, hans

Differential Revision: https://reviews.llvm.org/D79467

Added: 
    

Modified: 
    lld/COFF/PDB.cpp
    llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
    llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp

Removed: 
    


################################################################################
diff  --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp
index 44efff132ecd..f005d468c770 100644

--- a/lld/COFF/PDB.cpp
+++ b/lld/COFF/PDB.cpp
@@ -56,7 +56,6 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FormatAdapters.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include <memory>
@@ -75,7 +74,7 @@ static Timer totalPdbLinkTimer("PDB Emission (Cumulative)", Timer::root());
 static Timer addObjectsTimer("Add Objects", totalPdbLinkTimer);
 static Timer typeMergingTimer("Type Merging", addObjectsTimer);
 static Timer symbolMergingTimer("Symbol Merging", addObjectsTimer);
-static Timer globalsLayoutTimer("Globals Stream Layout", totalPdbLinkTimer);
+static Timer publicsLayoutTimer("Publics Stream Layout", totalPdbLinkTimer);
 static Timer tpiStreamLayoutTimer("TPI Stream Layout", totalPdbLinkTimer);
 static Timer diskCommitTimer("Commit to Disk", totalPdbLinkTimer);
 
@@ -106,6 +105,9 @@ class PDBLinker {
   /// Link CodeView from each object file in the symbol table into the PDB.
   void addObjectsToPDB();
 
+  /// Add every live, defined public symbol to the PDB.
+  void addPublicsToPDB();
+
   /// Link info for each import file in the symbol table into the PDB.
   void addImportFilesToPDB(ArrayRef<OutputSection *> outputSections);
 
@@ -1295,20 +1297,24 @@ static void createModuleDBI(pdb::PDBFileBuilder &builder) {
   }
 }
 
-static PublicSym32 createPublic(Defined *def) {
-  PublicSym32 pub(SymbolKind::S_PUB32);
-  pub.Name = def->getName();
+static pdb::BulkPublic createPublic(Defined *def) {
+  pdb::BulkPublic pub;
+  pub.Name = def->getName().data();
+  pub.NameLen = def->getName().size();
+
+  PublicSymFlags flags = PublicSymFlags::None;
   if (auto *d = dyn_cast<DefinedCOFF>(def)) {
     if (d->getCOFFSymbol().isFunctionDefinition())
-      pub.Flags = PublicSymFlags::Function;
+      flags = PublicSymFlags::Function;
   } else if (isa<DefinedImportThunk>(def)) {
-    pub.Flags = PublicSymFlags::Function;
+    flags = PublicSymFlags::Function;
   }
+  pub.Flags = static_cast<uint16_t>(flags);
 
   OutputSection *os = def->getChunk()->getOutputSection();
   assert(os && "all publics should be in final image");
   pub.Offset = def->getRVA() - os->getRVA();
-  pub.Segment = os->sectionIndex;
+  pub.U.Segment = os->sectionIndex;
   return pub;
 }
 
@@ -1330,13 +1336,16 @@ void PDBLinker::addObjectsToPDB() {
   addTypeInfo(builder.getTpiBuilder(), tMerger.getTypeTable());
   addTypeInfo(builder.getIpiBuilder(), tMerger.getIDTable());
   t2.stop();
+}
 
-  ScopedTimer t3(globalsLayoutTimer);
-  // Compute the public and global symbols.
+void PDBLinker::addPublicsToPDB() {
+  ScopedTimer t3(publicsLayoutTimer);
+  // Compute the public symbols.
   auto &gsiBuilder = builder.getGsiBuilder();
-  std::vector<PublicSym32> publics;
+  std::vector<pdb::BulkPublic> publics;
   symtab->forEachSymbol([&publics](Symbol *s) {
-    // Only emit defined, live symbols that have a chunk.
+    // Only emit external, defined, live symbols that have a chunk. Static,
+    // non-external symbols do not appear in the symbol table.
     auto *def = dyn_cast<Defined>(s);
     if (def && def->isLive() && def->getChunk())
       publics.push_back(createPublic(def));
@@ -1344,12 +1353,7 @@ void PDBLinker::addObjectsToPDB() {
 
   if (!publics.empty()) {
     publicSymbols = publics.size();
-    // Sort the public symbols and add them to the stream.
-    parallelSort(publics, [](const PublicSym32 &l, const PublicSym32 &r) {
-      return l.Name < r.Name;
-    });
-    for (const PublicSym32 &pub : publics)
-      gsiBuilder.addPublicSymbol(pub);
+    gsiBuilder.addPublicSymbols(std::move(publics));
   }
 }
 
@@ -1708,6 +1712,7 @@ void lld::coff::createPDB(SymbolTable *symtab,
   pdb.addSections(outputSections, sectionTable);
   pdb.addNatvisFiles();
   pdb.addNamedStreams();
+  pdb.addPublicsToPDB();
 
   ScopedTimer t2(diskCommitTimer);
   codeview::GUID guid;

diff  --git a/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
index c4d35438541a..f52c2f6ed3dc 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
@@ -37,6 +37,7 @@ struct MSFLayout;
 } // namespace msf
 namespace pdb {
 struct GSIHashStreamBuilder;
+struct BulkPublic;
 
 class GSIStreamBuilder {
 
@@ -55,7 +56,8 @@ class GSIStreamBuilder {
   uint32_t getGlobalsStreamIndex() const { return GlobalsStreamIndex; }
   uint32_t getRecordStreamIndex() const { return RecordStreamIndex; }
 
-  void addPublicSymbol(const codeview::PublicSym32 &Pub);
+  // Add public symbols in bulk.
+  void addPublicSymbols(std::vector<BulkPublic> &&Publics);
 
   void addGlobalSymbol(const codeview::ProcRefSym &Sym);
   void addGlobalSymbol(const codeview::DataSym &Sym);
@@ -75,7 +77,40 @@ class GSIStreamBuilder {
   msf::MSFBuilder &Msf;
   std::unique_ptr<GSIHashStreamBuilder> PSH;
   std::unique_ptr<GSIHashStreamBuilder> GSH;
+  std::vector<support::ulittle32_t> PubAddrMap;
 };
+
+/// This struct is equivalent to codeview::PublicSym32, but it has been
+/// optimized for size to speed up bulk serialization and sorting operations
+/// during PDB writing.
+struct BulkPublic {
+  const char *Name = nullptr;
+  uint32_t NameLen = 0;
+
+  // Offset of the symbol record in the publics stream.
+  uint32_t SymOffset = 0;
+
+  // Section offset of the symbol in the image.
+  uint32_t Offset = 0;
+
+  union {
+    // Section index of the section containing the symbol.
+    uint16_t Segment;
+
+    // GSI hash table bucket index.
+    uint16_t BucketIdx;
+  } U{0};
+
+  // PublicSymFlags or hash bucket index
+  uint16_t Flags = 0;
+
+  StringRef getName() const { return StringRef(Name, NameLen); }
+};
+
+static_assert(sizeof(BulkPublic) <= 24, "unexpected size increase");
+static_assert(std::is_trivially_copyable<BulkPublic>::value,
+              "should be trivial");
+
 } // namespace pdb
 } // namespace llvm
 

diff  --git a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
index d35c656e7172..dc06a8022f0e 100644
--- a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
@@ -5,6 +5,12 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// The data structures defined in this file are based on the reference
+// implementation which is available at
+// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
+//
+//===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
 
@@ -20,6 +26,7 @@
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
 #include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Parallel.h"
 #include "llvm/Support/xxhash.h"
 #include <algorithm>
 #include <vector>
@@ -57,23 +64,65 @@ struct llvm::pdb::GSIHashStreamBuilder {
   uint32_t calculateSerializedLength() const;
   uint32_t calculateRecordByteSize() const;
   Error commit(BinaryStreamWriter &Writer);
+
   void finalizeBuckets(uint32_t RecordZeroOffset);
 
+  // Finalize public symbol buckets.
+  void finalizeBuckets(uint32_t RecordZeroOffset,
+                       std::vector<BulkPublic> &&Publics);
+
   template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
     T Copy(Symbol);
     addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
                                                CodeViewContainer::Pdb));
   }
-  void addSymbol(const CVSymbol &Symbol) {
-    if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
-      auto Iter = SymbolHashes.insert(Symbol);
-      if (!Iter.second)
-        return;
-    }
+  void addSymbol(const CVSymbol &Symbol);
+};
 
-    Records.push_back(Symbol);
+void GSIHashStreamBuilder::addSymbol(const codeview::CVSymbol &Symbol) {
+  // Ignore duplicate typedefs and constants.
+  if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
+    auto Iter = SymbolHashes.insert(Symbol);
+    if (!Iter.second)
+      return;
   }
-};
+  Records.push_back(Symbol);
+}
+
+namespace {
+LLVM_PACKED(struct PublicSym32Layout {
+  RecordPrefix Prefix;
+  PublicSym32Header Pub;
+  // char Name[];
+});
+}
+
+// Calculate how much memory this public needs when serialized.
+static uint32_t sizeOfPublic(const BulkPublic &Pub) {
+  uint32_t NameLen = Pub.NameLen;
+  NameLen = std::min(NameLen,
+                     uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
+  return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
+}
+
+static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
+  // Assume the caller has allocated sizeOfPublic bytes.
+  uint32_t NameLen = std::min(
+      Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
+  size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
+  assert(Size == sizeOfPublic(Pub));
+  auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
+  FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
+  FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
+  FixedMem->Pub.Flags = Pub.Flags;
+  FixedMem->Pub.Offset = Pub.Offset;
+  FixedMem->Pub.Segment = Pub.U.Segment;
+  char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
+  memcpy(NameMem, Pub.Name, NameLen);
+  // Zero the null terminator and remaining bytes.
+  memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
+  return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
+}
 
 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
   uint32_t Size = sizeof(GSIHashHeader);
@@ -114,70 +163,97 @@ static bool isAsciiString(StringRef S) {
 }
 
 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
-static bool gsiRecordLess(StringRef S1, StringRef S2) {
+static int gsiRecordCmp(StringRef S1, StringRef S2) {
   size_t LS = S1.size();
   size_t RS = S2.size();
   // Shorter strings always compare less than longer strings.
   if (LS != RS)
-    return LS < RS;
+    return LS - RS;
 
   // If either string contains non ascii characters, memcmp them.
   if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
-    return memcmp(S1.data(), S2.data(), LS) < 0;
+    return memcmp(S1.data(), S2.data(), LS);
 
   // Both strings are ascii, perform a case-insenstive comparison.
-  return S1.compare_lower(S2.data()) < 0;
+  return S1.compare_lower(S2.data());
 }
 
 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
-  std::array<std::vector<std::pair<StringRef, PSHashRecord>>, IPHR_HASH + 1>
-      TmpBuckets;
+  // Build up a list of globals to be bucketed. This repurposes the BulkPublic
+  // struct with 
diff erent meanings for the fields to avoid reallocating a new
+  // vector during public symbol table hash construction.
+  std::vector<BulkPublic> Globals;
+  Globals.resize(Records.size());
   uint32_t SymOffset = RecordZeroOffset;
-  for (const CVSymbol &Sym : Records) {
-    PSHashRecord HR;
-    // Add one when writing symbol offsets to disk. See GSI1::fixSymRecs.
-    HR.Off = SymOffset + 1;
-    HR.CRef = 1; // Always use a refcount of 1.
-
-    // Hash the name to figure out which bucket this goes into.
-    StringRef Name = getSymbolName(Sym);
-    size_t BucketIdx = hashStringV1(Name) % IPHR_HASH;
-    TmpBuckets[BucketIdx].push_back(std::make_pair(Name, HR));
-    SymOffset += Sym.length();
+  for (size_t I = 0, E = Records.size(); I < E; ++I) {
+    StringRef Name = getSymbolName(Records[I]);
+    Globals[I].Name = Name.data();
+    Globals[I].NameLen = Name.size();
+    Globals[I].SymOffset = SymOffset;
+    SymOffset += Records[I].length();
   }
 
-  // Compute the three tables: the hash records in bucket and chain order, the
-  // bucket presence bitmap, and the bucket chain start offsets.
-  HashRecords.reserve(Records.size());
+  finalizeBuckets(RecordZeroOffset, std::move(Globals));
+}
+
+void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset,
+                                           std::vector<BulkPublic> &&Globals) {
+  // Hash every name in parallel. The Segment field is no longer needed, so
+  // store the BucketIdx in a union.
+  parallelForEachN(0, Globals.size(), [&](size_t I) {
+    Globals[I].U.BucketIdx = hashStringV1(Globals[I].Name) % IPHR_HASH;
+  });
+
+  // Parallel sort by bucket index, then name within the buckets. Within the
+  // buckets, sort each bucket by memcmp of the symbol's name.  It's important
+  // that we use the same sorting algorithm as is used by the reference
+  // implementation to ensure that the search for a record within a bucket can
+  // properly early-out when it detects the record won't be found.  The
+  // algorithm used here corredsponds to the function
+  // caseInsensitiveComparePchPchCchCch in the reference implementation.
+  auto BucketCmp = [](const BulkPublic &L, const BulkPublic &R) {
+    if (L.U.BucketIdx != R.U.BucketIdx)
+      return L.U.BucketIdx < R.U.BucketIdx;
+    int Cmp = gsiRecordCmp(L.getName(), R.getName());
+    if (Cmp != 0)
+      return Cmp < 0;
+    // This comparison is necessary to make the sorting stable in the presence
+    // of two static globals with the same name. The easiest way to observe
+    // this is with S_LDATA32 records.
+    return L.SymOffset < R.SymOffset;
+  };
+  parallelSort(Globals, BucketCmp);
+
+  // Zero out the bucket index bitmap.
   for (ulittle32_t &Word : HashBitmap)
     Word = 0;
-  for (size_t BucketIdx = 0; BucketIdx < IPHR_HASH + 1; ++BucketIdx) {
-    auto &Bucket = TmpBuckets[BucketIdx];
-    if (Bucket.empty())
-      continue;
-    HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);
-
-    // Calculate what the offset of the first hash record in the chain would
-    // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
-    // each record would be 12 bytes. See HROffsetCalc in gsi.h.
-    const int SizeOfHROffsetCalc = 12;
-    ulittle32_t ChainStartOff =
-        ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
-    HashBuckets.push_back(ChainStartOff);
-
-    // Sort each bucket by memcmp of the symbol's name.  It's important that
-    // we use the same sorting algorithm as is used by the reference
-    // implementation to ensure that the search for a record within a bucket
-    // can properly early-out when it detects the record won't be found.  The
-    // algorithm used here corredsponds to the function
-    // caseInsensitiveComparePchPchCchCch in the reference implementation.
-    llvm::sort(Bucket, [](const std::pair<StringRef, PSHashRecord> &Left,
-                          const std::pair<StringRef, PSHashRecord> &Right) {
-      return gsiRecordLess(Left.first, Right.first);
-    });
-
-    for (const auto &Entry : Bucket)
-      HashRecords.push_back(Entry.second);
+
+  // Compute the three tables: the hash records in bucket and chain order, the
+  // bucket presence bitmap, and the bucket chain start offsets.
+  HashRecords.reserve(Globals.size());
+  uint32_t LastBucketIdx = ~0U;
+  for (const BulkPublic &Global : Globals) {
+    // If this is a new bucket, add it to the bitmap and the start offset map.
+    uint32_t BucketIdx = Global.U.BucketIdx;
+    if (LastBucketIdx != BucketIdx) {
+      HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);
+
+      // Calculate what the offset of the first hash record in the chain would
+      // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
+      // each record would be 12 bytes. See HROffsetCalc in gsi.h.
+      const int SizeOfHROffsetCalc = 12;
+      ulittle32_t ChainStartOff =
+          ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
+      HashBuckets.push_back(ChainStartOff);
+      LastBucketIdx = BucketIdx;
+    }
+
+    // Create the hash record. Add one when writing symbol offsets to disk.
+    // See GSI1::fixSymRecs. Always use a refcount of 1 for now.
+    PSHashRecord HRec;
+    HRec.Off = Global.SymOffset + 1;
+    HRec.CRef = 1;
+    HashRecords.push_back(HRec);
   }
 }
 
@@ -191,7 +267,7 @@ uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
   uint32_t Size = 0;
   Size += sizeof(PublicsStreamHeader);
   Size += PSH->calculateSerializedLength();
-  Size += PSH->Records.size() * sizeof(uint32_t); // AddrMap
+  Size += PubAddrMap.size() * sizeof(uint32_t); // AddrMap
   // FIXME: Add thunk map and section offsets for incremental linking.
 
   return Size;
@@ -203,11 +279,9 @@ uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
 
 Error GSIStreamBuilder::finalizeMsfLayout() {
   // First we write public symbol records, then we write global symbol records.
-  uint32_t PSHZero = 0;
-  uint32_t GSHZero = PSH->calculateRecordByteSize();
-
-  PSH->finalizeBuckets(PSHZero);
-  GSH->finalizeBuckets(GSHZero);
+  uint32_t PublicsSize = PSH->calculateRecordByteSize();
+  uint32_t GlobalsSize = GSH->calculateRecordByteSize();
+  GSH->finalizeBuckets(PublicsSize);
 
   Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
   if (!Idx)
@@ -219,8 +293,7 @@ Error GSIStreamBuilder::finalizeMsfLayout() {
     return Idx.takeError();
   PublicsStreamIndex = *Idx;
 
-  uint32_t RecordBytes =
-      GSH->calculateRecordByteSize() + PSH->calculateRecordByteSize();
+  uint32_t RecordBytes = PublicsSize + GlobalsSize;
 
   Idx = Msf.addStream(RecordBytes);
   if (!Idx)
@@ -229,65 +302,56 @@ Error GSIStreamBuilder::finalizeMsfLayout() {
   return Error::success();
 }
 
-static StringRef extractPubSym(const CVSymbol *Sym, uint16_t &Seg,
-                               uint32_t &Offset) {
-  ArrayRef<uint8_t> Buf = Sym->content();
-  assert(Buf.size() > sizeof(PublicSym32Header));
-  const auto *Hdr = reinterpret_cast<const PublicSym32Header *>(Buf.data());
-  Buf = Buf.drop_front(sizeof(PublicSym32Header));
-  Seg = Hdr->Segment;
-  Offset = Hdr->Offset;
-  // Don't worry about finding the null terminator, since the strings will be
-  // compared later.
-  return StringRef(reinterpret_cast<const char *>(Buf.data()), Buf.size());
-}
-
-static bool comparePubSymByAddrAndName(const CVSymbol *LS, const CVSymbol *RS) {
-  uint16_t LSeg, RSeg;
-  uint32_t LOff, ROff;
-  StringRef LName, RName;
-  LName = extractPubSym(LS, LSeg, LOff);
-  RName = extractPubSym(RS, RSeg, ROff);
-  if (LSeg != RSeg)
-    return LSeg < RSeg;
-  if (LOff != ROff)
-    return LOff < ROff;
-  return LName < RName;
-}
-
-/// Compute the address map. The address map is an array of symbol offsets
-/// sorted so that it can be binary searched by address.
-static std::vector<ulittle32_t> computeAddrMap(ArrayRef<CVSymbol> Records) {
-  // Make a vector of pointers to the symbols so we can sort it by address.
-  // Also gather the symbol offsets while we're at it.
-
-  std::vector<const CVSymbol *> PublicsByAddr;
-  std::vector<uint32_t> SymOffsets;
-  PublicsByAddr.reserve(Records.size());
-  SymOffsets.reserve(Records.size());
+void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&Publics) {
+  // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
+  parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
+    return L.getName() < R.getName();
+  });
 
+  // Assign offsets and allocate one contiguous block of memory for all public
+  // symbols.
   uint32_t SymOffset = 0;
-  for (const CVSymbol &Sym : Records) {
-    assert(Sym.kind() == SymbolKind::S_PUB32);
-    PublicsByAddr.push_back(&Sym);
-    SymOffsets.push_back(SymOffset);
-    SymOffset += Sym.length();
+  for (BulkPublic &Pub : Publics) {
+    Pub.SymOffset = SymOffset;
+    SymOffset += sizeOfPublic(Pub);
   }
-  llvm::stable_sort(PublicsByAddr, comparePubSymByAddrAndName);
+  uint8_t *Mem =
+      reinterpret_cast<uint8_t *>(Msf.getAllocator().Allocate(SymOffset, 4));
+
+  // Instead of storing individual CVSymbol records, store them as one giant
+  // buffer.
+  // FIXME: This is kind of a hack. This makes Records.size() wrong, and we have
+  // to account for that elsewhere.
+  PSH->Records.push_back(CVSymbol(makeArrayRef(Mem, SymOffset)));
+
+  // Serialize them in parallel.
+  parallelForEachN(0, Publics.size(), [&](size_t I) {
+    const BulkPublic &Pub = Publics[I];
+    serializePublic(Mem + Pub.SymOffset, Pub);
+  });
+
+  // Re-sort the publics by address so we can build the address map. We no
+  // longer need the original ordering.
+  auto AddrCmp = [](const BulkPublic &L, const BulkPublic &R) {
+    if (L.U.Segment != R.U.Segment)
+      return L.U.Segment < R.U.Segment;
+    if (L.Offset != R.Offset)
+      return L.Offset < R.Offset;
+    // parallelSort is unstable, so we have to do name comparison to ensure
+    // that two names for the same location come out in a determinstic order.
+    return L.getName() < R.getName();
+  };
+  parallelSort(Publics, AddrCmp);
 
   // Fill in the symbol offsets in the appropriate order.
-  std::vector<ulittle32_t> AddrMap;
-  AddrMap.reserve(Records.size());
-  for (const CVSymbol *Sym : PublicsByAddr) {
-    ptr
diff _t Idx = std::distance(Records.data(), Sym);
-    assert(Idx >= 0 && size_t(Idx) < Records.size());
-    AddrMap.push_back(ulittle32_t(SymOffsets[Idx]));
-  }
-  return AddrMap;
-}
-
-void GSIStreamBuilder::addPublicSymbol(const PublicSym32 &Pub) {
-  PSH->addSymbol(Pub, Msf);
+  PubAddrMap.reserve(Publics.size());
+  for (const BulkPublic &Pub : Publics)
+    PubAddrMap.push_back(ulittle32_t(Pub.SymOffset));
+
+  // Finalize public symbol buckets immediately after they have been added.
+  // They should all be warm in the cache at this point, so go ahead and do it
+  // now.
+  PSH->finalizeBuckets(0, std::move(Publics));
 }
 
 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
@@ -336,7 +400,7 @@ Error GSIStreamBuilder::commitPublicsHashStream(
 
   // FIXME: Fill these in. They are for incremental linking.
   Header.SymHash = PSH->calculateSerializedLength();
-  Header.AddrMap = PSH->Records.size() * 4;
+  Header.AddrMap = PubAddrMap.size() * 4;
   Header.NumThunks = 0;
   Header.SizeOfThunk = 0;
   Header.ISectThunkTable = 0;
@@ -349,8 +413,7 @@ Error GSIStreamBuilder::commitPublicsHashStream(
   if (auto EC = PSH->commit(Writer))
     return EC;
 
-  std::vector<ulittle32_t> AddrMap = computeAddrMap(PSH->Records);
-  if (auto EC = Writer.writeArray(makeArrayRef(AddrMap)))
+  if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
     return EC;
 
   return Error::success();