[clang] 7d15657 - [clang][modules] Move `UNHASHED_CONTROL_BLOCK` up in the AST file

Jan Svoboda via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 24 09:16:13 PDT 2023


Author: Jan Svoboda
Date: 2023-08-24T09:16:04-07:00
New Revision: 7d1565727dad3acb54fe76a908630843835d7bc8

URL: https://github.com/llvm/llvm-project/commit/7d1565727dad3acb54fe76a908630843835d7bc8
DIFF: https://github.com/llvm/llvm-project/commit/7d1565727dad3acb54fe76a908630843835d7bc8.diff

LOG: [clang][modules] Move `UNHASHED_CONTROL_BLOCK` up in the AST file

When loading (transitively) imported AST file, `ModuleManager::addModule()` first checks it has the expected signature via `readASTFileSignature()`. The signature is part of `UNHASHED_CONTROL_BLOCK`, which is placed at the end of the AST file. This means that just to verify signature of an AST file, we need to skip over all top-level blocks, paging in the whole AST file from disk. This is pretty slow.

This patch moves `UNHASHED_CONTROL_BLOCK` to the start of the AST file, so that it can be read more efficiently. To achieve this, we use dummy signature when first emitting the unhashed control block, and then backpatch the real signature at the end of the serialization process.

This speeds up dependency scanning by over 9% and significantly reduces run-to-run variability of my benchmarks.

Depends on D158572.

Reviewed By: benlangmuir

Differential Revision: https://reviews.llvm.org/D158573

Added: 
    

Modified: 
    clang/include/clang/Basic/Module.h
    clang/include/clang/Serialization/ASTBitCodes.h
    clang/include/clang/Serialization/ASTWriter.h
    clang/lib/Serialization/ASTReader.cpp
    clang/lib/Serialization/ASTWriter.cpp
    clang/lib/Serialization/GlobalModuleIndex.cpp
    clang/test/Modules/ASTSignature.c

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
index a4ad8ad2f768fe..7f1eb6faecf4e7 100644
--- a/clang/include/clang/Basic/Module.h
+++ b/clang/include/clang/Basic/Module.h
@@ -81,6 +81,12 @@ struct ASTFileSignature : std::array<uint8_t, 20> {
     return Sentinel;
   }
 
+  static ASTFileSignature createDummy() {
+    ASTFileSignature Dummy;
+    Dummy.fill(0x00);
+    return Dummy;
+  }
+
   template <typename InputIt>
   static ASTFileSignature create(InputIt First, InputIt Last) {
     assert(std::distance(First, Last) == size &&

diff  --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 3ad22bbcf5d12d..e759b21474c841 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -41,7 +41,7 @@ namespace serialization {
 /// Version 4 of AST files also requires that the version control branch and
 /// revision match exactly, since there is no backward compatibility of
 /// AST files at this time.
-const unsigned VERSION_MAJOR = 28;
+const unsigned VERSION_MAJOR = 29;
 
 /// AST file minor version number supported by this version of
 /// Clang.

diff  --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index b9d197c15c1344..f2c7c03ff09360 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -128,10 +128,17 @@ class ASTWriter : public ASTDeserializationListener,
   /// The module we're currently writing, if any.
   Module *WritingModule = nullptr;
 
-  /// The offset of the first bit inside the AST_BLOCK.
+  /// The byte range representing all the UNHASHED_CONTROL_BLOCK.
+  std::pair<uint64_t, uint64_t> UnhashedControlBlockRange;
+  /// The bit offset of the AST block hash blob.
+  uint64_t ASTBlockHashOffset = 0;
+  /// The bit offset of the signature blob.
+  uint64_t SignatureOffset = 0;
+
+  /// The bit offset of the first bit inside the AST_BLOCK.
   uint64_t ASTBlockStartOffset = 0;
 
-  /// The range representing all the AST_BLOCK.
+  /// The byte range representing all the AST_BLOCK.
   std::pair<uint64_t, uint64_t> ASTBlockRange;
 
   /// The base directory for any relative paths we emit.
@@ -495,12 +502,11 @@ class ASTWriter : public ASTDeserializationListener,
                          StringRef isysroot);
 
   /// Write out the signature and diagnostic options, and return the signature.
-  ASTFileSignature writeUnhashedControlBlock(Preprocessor &PP,
-                                             ASTContext &Context);
+  void writeUnhashedControlBlock(Preprocessor &PP, ASTContext &Context);
+  ASTFileSignature backpatchSignature();
 
   /// Calculate hash of the pcm content.
-  static std::pair<ASTFileSignature, ASTFileSignature>
-  createSignature(StringRef AllBytes, StringRef ASTBlockBytes);
+  std::pair<ASTFileSignature, ASTFileSignature> createSignature() const;
 
   void WriteInputFiles(SourceManager &SourceMgr, HeaderSearchOptions &HSOpts);
   void WriteSourceManagerBlock(SourceManager &SourceMgr,

diff  --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 82dac5fbfc888d..e707f346a2c652 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -4734,12 +4734,6 @@ ASTReader::ReadASTCore(StringRef FileName,
       ShouldFinalizePCM = true;
       return Success;
 
-    case UNHASHED_CONTROL_BLOCK_ID:
-      // This block is handled using look-ahead during ReadControlBlock.  We
-      // shouldn't get here!
-      Error("malformed block record in AST file");
-      return Failure;
-
     default:
       if (llvm::Error Err = Stream.SkipBlock()) {
         Error(std::move(Err));
@@ -4859,13 +4853,18 @@ ASTReader::ASTReadResult ASTReader::readUnhashedControlBlockImpl(
     }
     switch ((UnhashedControlBlockRecordTypes)MaybeRecordType.get()) {
     case SIGNATURE:
-      if (F)
-        F->Signature = ASTFileSignature::create(Record.begin(), Record.end());
+      if (F) {
+        F->Signature = ASTFileSignature::create(Blob.begin(), Blob.end());
+        assert(F->Signature != ASTFileSignature::createDummy() &&
+               "Dummy AST file signature not backpatched in ASTWriter.");
+      }
       break;
     case AST_BLOCK_HASH:
-      if (F)
-        F->ASTBlockHash =
-            ASTFileSignature::create(Record.begin(), Record.end());
+      if (F) {
+        F->ASTBlockHash = ASTFileSignature::create(Blob.begin(), Blob.end());
+        assert(F->ASTBlockHash != ASTFileSignature::createDummy() &&
+               "Dummy AST block hash not backpatched in ASTWriter.");
+      }
       break;
     case DIAGNOSTIC_OPTIONS: {
       bool Complain = (ClientLoadCapabilities & ARR_OutOfDate) == 0;
@@ -5167,9 +5166,12 @@ static ASTFileSignature readASTFileSignature(StringRef PCH) {
       consumeError(MaybeRecord.takeError());
       return ASTFileSignature();
     }
-    if (SIGNATURE == MaybeRecord.get())
-      return ASTFileSignature::create(Record.begin(),
-                                      Record.begin() + ASTFileSignature::size);
+    if (SIGNATURE == MaybeRecord.get()) {
+      auto Signature = ASTFileSignature::create(Blob.begin(), Blob.end());
+      assert(Signature != ASTFileSignature::createDummy() &&
+             "Dummy AST file signature not backpatched in ASTWriter.");
+      return Signature;
+    }
   }
 }
 

diff  --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 2a12779b0d7d12..bad6723840f37d 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -1120,50 +1120,97 @@ adjustFilenameForRelocatableAST(const char *Filename, StringRef BaseDir) {
 }
 
 std::pair<ASTFileSignature, ASTFileSignature>
-ASTWriter::createSignature(StringRef AllBytes, StringRef ASTBlockBytes) {
+ASTWriter::createSignature() const {
+  StringRef AllBytes(Buffer.data(), Buffer.size());
+
   llvm::SHA1 Hasher;
-  Hasher.update(ASTBlockBytes);
+  Hasher.update(AllBytes.slice(ASTBlockRange.first, ASTBlockRange.second));
   ASTFileSignature ASTBlockHash = ASTFileSignature::create(Hasher.result());
 
-  // Add the remaining bytes (i.e. bytes before the unhashed control block that
-  // are not part of the AST block).
-  Hasher.update(
-      AllBytes.take_front(ASTBlockBytes.bytes_end() - AllBytes.bytes_begin()));
+  // Add the remaining bytes:
+  //  1. Before the unhashed control block.
+  Hasher.update(AllBytes.slice(0, UnhashedControlBlockRange.first));
+  //  2. Between the unhashed control block and the AST block.
   Hasher.update(
-      AllBytes.take_back(AllBytes.bytes_end() - ASTBlockBytes.bytes_end()));
+      AllBytes.slice(UnhashedControlBlockRange.second, ASTBlockRange.first));
+  //  3. After the AST block.
+  Hasher.update(AllBytes.slice(ASTBlockRange.second, StringRef::npos));
   ASTFileSignature Signature = ASTFileSignature::create(Hasher.result());
 
   return std::make_pair(ASTBlockHash, Signature);
 }
 
-ASTFileSignature ASTWriter::writeUnhashedControlBlock(Preprocessor &PP,
-                                                      ASTContext &Context) {
+ASTFileSignature ASTWriter::backpatchSignature() {
+  if (!WritingModule ||
+      !PP->getHeaderSearchInfo().getHeaderSearchOpts().ModulesHashContent)
+    return {};
+
+  // For implicit modules, write the hash of the PCM as its signature.
+
+  auto BackpatchSignatureAt = [&](const ASTFileSignature &S, uint64_t BitNo) {
+    using WordT = unsigned;
+    std::array<WordT, sizeof(ASTFileSignature) / sizeof(WordT)> Words;
+    static_assert(sizeof(Words) == sizeof(S));
+    std::memcpy(Words.data(), S.data(), sizeof(ASTFileSignature));
+    for (WordT Word : Words) {
+      Stream.BackpatchWord(BitNo, Word);
+      BitNo += sizeof(WordT) * 8;
+    }
+  };
+
+  ASTFileSignature ASTBlockHash;
+  ASTFileSignature Signature;
+  std::tie(ASTBlockHash, Signature) = createSignature();
+
+  BackpatchSignatureAt(ASTBlockHash, ASTBlockHashOffset);
+  BackpatchSignatureAt(Signature, SignatureOffset);
+
+  return Signature;
+}
+
+void ASTWriter::writeUnhashedControlBlock(Preprocessor &PP,
+                                          ASTContext &Context) {
   using namespace llvm;
 
   // Flush first to prepare the PCM hash (signature).
   Stream.FlushToWord();
-  auto StartOfUnhashedControl = Stream.GetCurrentBitNo() >> 3;
+  UnhashedControlBlockRange.first = Stream.GetCurrentBitNo() >> 3;
 
   // Enter the block and prepare to write records.
   RecordData Record;
   Stream.EnterSubblock(UNHASHED_CONTROL_BLOCK_ID, 5);
 
   // For implicit modules, write the hash of the PCM as its signature.
-  ASTFileSignature Signature;
   if (WritingModule &&
       PP.getHeaderSearchInfo().getHeaderSearchOpts().ModulesHashContent) {
-    ASTFileSignature ASTBlockHash;
-    auto ASTBlockStartByte = ASTBlockRange.first >> 3;
-    auto ASTBlockByteLength = (ASTBlockRange.second >> 3) - ASTBlockStartByte;
-    std::tie(ASTBlockHash, Signature) = createSignature(
-        StringRef(Buffer.begin(), StartOfUnhashedControl),
-        StringRef(Buffer.begin() + ASTBlockStartByte, ASTBlockByteLength));
-
-    Record.append(ASTBlockHash.begin(), ASTBlockHash.end());
-    Stream.EmitRecord(AST_BLOCK_HASH, Record);
+    // At this point, we don't know the actual signature of the file or the AST
+    // block - we're only able to compute those at the end of the serialization
+    // process. Let's store dummy signatures for now, and replace them with the
+    // real ones later on.
+    // The bitstream VBR-encodes record elements, which makes backpatching them
+    // really 
diff icult. Let's store the signatures as blobs instead - they are
+    // guaranteed to be word-aligned, and we control their format/encoding.
+    auto Dummy = ASTFileSignature::createDummy();
+    SmallString<128> Blob{Dummy.begin(), Dummy.end()};
+
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
+    Abbrev->Add(BitCodeAbbrevOp(AST_BLOCK_HASH));
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+    unsigned ASTBlockHashAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
+
+    Abbrev = std::make_shared<BitCodeAbbrev>();
+    Abbrev->Add(BitCodeAbbrevOp(SIGNATURE));
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+    unsigned SignatureAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
+
+    Record.push_back(AST_BLOCK_HASH);
+    Stream.EmitRecordWithBlob(ASTBlockHashAbbrev, Record, Blob);
+    ASTBlockHashOffset = Stream.GetCurrentBitNo() - Blob.size() * 8;
     Record.clear();
-    Record.append(Signature.begin(), Signature.end());
-    Stream.EmitRecord(SIGNATURE, Record);
+
+    Record.push_back(SIGNATURE);
+    Stream.EmitRecordWithBlob(SignatureAbbrev, Record, Blob);
+    SignatureOffset = Stream.GetCurrentBitNo() - Blob.size() * 8;
     Record.clear();
   }
 
@@ -1232,7 +1279,7 @@ ASTFileSignature ASTWriter::writeUnhashedControlBlock(Preprocessor &PP,
 
   // Leave the options block.
   Stream.ExitBlock();
-  return Signature;
+  UnhashedControlBlockRange.second = Stream.GetCurrentBitNo() >> 3;
 }
 
 /// Write the control block.
@@ -4690,6 +4737,7 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot,
 
   ASTContext &Context = SemaRef.Context;
   Preprocessor &PP = SemaRef.PP;
+  writeUnhashedControlBlock(PP, Context);
 
   collectNonAffectingInputFiles();
 
@@ -4838,7 +4886,7 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot,
 
   // Write the remaining AST contents.
   Stream.FlushToWord();
-  ASTBlockRange.first = Stream.GetCurrentBitNo();
+  ASTBlockRange.first = Stream.GetCurrentBitNo() >> 3;
   Stream.EnterSubblock(AST_BLOCK_ID, 5);
   ASTBlockStartOffset = Stream.GetCurrentBitNo();
 
@@ -5191,13 +5239,13 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot,
   Stream.EmitRecord(STATISTICS, Record);
   Stream.ExitBlock();
   Stream.FlushToWord();
-  ASTBlockRange.second = Stream.GetCurrentBitNo();
+  ASTBlockRange.second = Stream.GetCurrentBitNo() >> 3;
 
   // Write the module file extension blocks.
   for (const auto &ExtWriter : ModuleFileExtensionWriters)
     WriteModuleFileExtension(SemaRef, *ExtWriter);
 
-  return writeUnhashedControlBlock(PP, Context);
+  return backpatchSignature();
 }
 
 void ASTWriter::WriteDeclUpdatesBlocks(RecordDataImpl &OffsetsRecord) {

diff  --git a/clang/lib/Serialization/GlobalModuleIndex.cpp b/clang/lib/Serialization/GlobalModuleIndex.cpp
index d57f4cec2eab6f..92417c7bf1d524 100644
--- a/clang/lib/Serialization/GlobalModuleIndex.cpp
+++ b/clang/lib/Serialization/GlobalModuleIndex.cpp
@@ -697,9 +697,12 @@ llvm::Error GlobalModuleIndexBuilder::loadModuleFile(const FileEntry *File) {
     }
 
     // Get Signature.
-    if (State == DiagnosticOptionsBlock && Code == SIGNATURE)
-      getModuleFileInfo(File).Signature = ASTFileSignature::create(
-          Record.begin(), Record.begin() + ASTFileSignature::size);
+    if (State == DiagnosticOptionsBlock && Code == SIGNATURE) {
+      auto Signature = ASTFileSignature::create(Blob.begin(), Blob.end());
+      assert(Signature != ASTFileSignature::createDummy() &&
+             "Dummy AST file signature not backpatched in ASTWriter.");
+      getModuleFileInfo(File).Signature = Signature;
+    }
 
     // We don't care about this record.
   }

diff  --git a/clang/test/Modules/ASTSignature.c b/clang/test/Modules/ASTSignature.c
index f0df1e4e78236f..11d7c5c9d43d53 100644
--- a/clang/test/Modules/ASTSignature.c
+++ b/clang/test/Modules/ASTSignature.c
@@ -1,6 +1,6 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -iquote %S/Inputs/ASTHash/ -fsyntax-only -fmodules \
-// RUN:   -fimplicit-module-maps -fmodules-strict-context-hash \
+// RUN: %clang_cc1             -iquote %S/Inputs/ASTHash/ -fsyntax-only \
+// RUN:   -fmodules -fimplicit-module-maps -fmodules-strict-context-hash \
 // RUN:   -fmodules-cache-path=%t -fdisable-module-hash %s
 // RUN: cp %t/MyHeader2.pcm %t1.pcm
 // RUN: rm -rf %t
@@ -8,17 +8,18 @@
 // RUN:   -fmodules -fimplicit-module-maps -fmodules-strict-context-hash \
 // RUN:   -fmodules-cache-path=%t -fdisable-module-hash %s
 // RUN: cp %t/MyHeader2.pcm %t2.pcm
-// RUN: llvm-bcanalyzer --dump --disable-histogram %t1.pcm > %t1.dump
-// RUN: llvm-bcanalyzer --dump --disable-histogram %t2.pcm > %t2.dump
+// RUN: llvm-bcanalyzer --dump --disable-histogram --show-binary-blobs %t1.pcm > %t1.dump
+// RUN: llvm-bcanalyzer --dump --disable-histogram --show-binary-blobs %t2.pcm > %t2.dump
 // RUN: cat %t1.dump %t2.dump | FileCheck %s
 
 #include "my_header_2.h"
 
 my_int var = 42;
 
-// CHECK: [[AST_BLOCK_HASH:<AST_BLOCK_HASH .*>]]
-// CHECK: [[SIGNATURE:<SIGNATURE .*>]]
-// CHECK: [[AST_BLOCK_HASH]]
-// CHECK-NOT: [[SIGNATURE]]
-// The modules built by this test are designed to yield the same AST. If this
-// test fails, it means that the AST block is has become non-relocatable.
+// CHECK:     <AST_BLOCK_HASH abbrevid={{[0-9]+}}/> blob data = '[[AST_BLOCK_HASH:.*]]'
+// CHECK:     <SIGNATURE abbrevid={{[0-9]+}}/> blob data = '[[SIGNATURE:.*]]'
+// CHECK:     <AST_BLOCK_HASH abbrevid={{[0-9]+}}/> blob data = '[[AST_BLOCK_HASH]]'
+// CHECK-NOT: <SIGNATURE abbrevid={{[0-9]+}}/> blob data = '[[SIGNATURE]]'
+// The modules built by this test are designed to yield the same AST but distinct AST files.
+// If this test fails, it means that either the AST block has become non-relocatable,
+// or the file signature stopped hashing some parts of the AST file.


        


More information about the cfe-commits mailing list