[clang] [C++20][Modules] Use `MultiOnDiskHashTable` to improve the performance of header search algorithm. (PR #155350)
Michael Park via cfe-commits
cfe-commits at lists.llvm.org
Mon Aug 25 20:37:50 PDT 2025
https://github.com/mpark created https://github.com/llvm/llvm-project/pull/155350
This is a second attempt to solve https://github.com/llvm/llvm-project/issues/140860, and supersedes the attempt in https://github.com/llvm/llvm-project/pull/140867.
In this approach, we introduce an instance of `MultiOnDiskHashTable` that lazily loads the on-disk hash table for each PCMs, and only condenses it into memory once it surpasses some threshold (64 is used here).
The results are very similar to https://github.com/llvm/llvm-project/pull/140867, where the following test case takes about **14s** before the change, and **0.2s** with the change.
```bash
#/usr/bin/env bash
CLANG=${CLANG:-clang++}
NUM_MODULES=${NUM_MODULES:-1000}
NUM_HEADERS=${NUM_HEADERS:-10000}
mkdir build && cd build
for i in $(seq 1 $NUM_MODULES); do > m${i}.h; done
seq 1 $NUM_MODULES | xargs -I {} -P $(nproc) bash -c "$CLANG -std=c++20 -fmodule-header m{}.h"
for i in $(seq 1 $NUM_HEADERS); do > h${i}.h; done
> a.cpp
for i in $(seq 1 $NUM_MODULES); do echo "import \"m${i}.h\";" >> a.cpp; done
for i in $(seq 1 $NUM_HEADERS); do echo "#include \"h${i}.h\"" >> a.cpp; done
echo "int main() {}" >> a.cpp
time $CLANG -std=c++20 -Wno-experimental-header-units -E a.cpp -o /dev/null \
$(for i in $(seq 1 $NUM_MODULES); do echo -n "-fmodule-file=m${i}.pcm "; done)
```
I'll be happy to add more comments and such, but I wanted to get some feedback as to whether this approach seems reasonable to folks.
>From 28fe4c9af6678b0c0972fd75b600330faa0de008 Mon Sep 17 00:00:00 2001
From: Michael Park <mcypark at gmail.com>
Date: Thu, 21 Aug 2025 21:14:21 -0700
Subject: [PATCH 1/2] Move `mergeHeaderFileInfo` into `HeaderFileInfo::merge`.
---
clang/include/clang/Lex/HeaderSearch.h | 3 +++
clang/lib/Lex/HeaderSearch.cpp | 29 ++++++++++++--------------
clang/lib/Serialization/ASTReader.cpp | 5 ++---
3 files changed, 18 insertions(+), 19 deletions(-)
diff --git a/clang/include/clang/Lex/HeaderSearch.h b/clang/include/clang/Lex/HeaderSearch.h
index 2e0c8bec8bd8c..ca32e5f0e3fb7 100644
--- a/clang/include/clang/Lex/HeaderSearch.h
+++ b/clang/include/clang/Lex/HeaderSearch.h
@@ -138,6 +138,9 @@ struct HeaderFileInfo {
/// isModuleHeader will potentially be set, but not cleared.
/// isTextualModuleHeader will be set or cleared based on the role update.
void mergeModuleMembership(ModuleMap::ModuleHeaderRole Role);
+
+ /// Merge the header file info \p Other into this header file info.
+ void merge(const HeaderFileInfo& Other);
};
static_assert(sizeof(HeaderFileInfo) <= 16);
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index ea2391f6812cf..da90706bb1654 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -1289,23 +1289,20 @@ void HeaderFileInfo::mergeModuleMembership(ModuleMap::ModuleHeaderRole Role) {
(Role & ModuleMap::TextualHeader));
}
-/// Merge the header file info provided by \p OtherHFI into the current
-/// header file info (\p HFI)
-static void mergeHeaderFileInfo(HeaderFileInfo &HFI,
- const HeaderFileInfo &OtherHFI) {
- assert(OtherHFI.External && "expected to merge external HFI");
+void HeaderFileInfo::merge(const HeaderFileInfo& Other) {
+ assert(Other.External && "expected to merge external HFI");
- HFI.isImport |= OtherHFI.isImport;
- HFI.isPragmaOnce |= OtherHFI.isPragmaOnce;
- mergeHeaderFileInfoModuleBits(HFI, OtherHFI.isModuleHeader,
- OtherHFI.isTextualModuleHeader);
+ isImport |= Other.isImport;
+ isPragmaOnce |= Other.isPragmaOnce;
+ mergeHeaderFileInfoModuleBits(*this, Other.isModuleHeader,
+ Other.isTextualModuleHeader);
- if (!HFI.LazyControllingMacro.isValid())
- HFI.LazyControllingMacro = OtherHFI.LazyControllingMacro;
+ if (!LazyControllingMacro.isValid())
+ LazyControllingMacro = Other.LazyControllingMacro;
- HFI.DirInfo = OtherHFI.DirInfo;
- HFI.External = (!HFI.IsValid || HFI.External);
- HFI.IsValid = true;
+ DirInfo = Other.DirInfo;
+ External = (!IsValid || External);
+ IsValid = true;
}
HeaderFileInfo &HeaderSearch::getFileInfo(FileEntryRef FE) {
@@ -1319,7 +1316,7 @@ HeaderFileInfo &HeaderSearch::getFileInfo(FileEntryRef FE) {
if (ExternalHFI.IsValid) {
HFI->Resolved = true;
if (ExternalHFI.External)
- mergeHeaderFileInfo(*HFI, ExternalHFI);
+ HFI->merge(ExternalHFI);
}
}
@@ -1343,7 +1340,7 @@ const HeaderFileInfo *HeaderSearch::getExistingFileInfo(FileEntryRef FE) const {
if (ExternalHFI.IsValid) {
HFI->Resolved = true;
if (ExternalHFI.External)
- mergeHeaderFileInfo(*HFI, ExternalHFI);
+ HFI->merge(ExternalHFI);
}
}
} else if (FE.getUID() < FileInfo.size()) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 32f7779278286..0c5900fbac0a4 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -2347,9 +2347,8 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d,
// deserialize this header file info again.
Reader.getPreprocessor().getIncludedFiles().insert(*FE);
- // FIXME: Refactor with mergeHeaderFileInfo in HeaderSearch.cpp.
- HFI.isImport |= (Flags >> 5) & 0x01;
- HFI.isPragmaOnce |= (Flags >> 4) & 0x01;
+ HFI.isImport = (Flags >> 5) & 0x01;
+ HFI.isPragmaOnce = (Flags >> 4) & 0x01;
HFI.DirInfo = (Flags >> 1) & 0x07;
HFI.LazyControllingMacro = Reader.getGlobalIdentifierID(
M, endian::readNext<IdentifierID, llvm::endianness::little>(d));
>From d5c944026e8d10fcc8264b2d76151907b3554f12 Mon Sep 17 00:00:00 2001
From: Michael Park <mcypark at gmail.com>
Date: Wed, 20 Aug 2025 00:13:33 -0700
Subject: [PATCH 2/2] Use `MultiOnDiskHashTable` to improve the performance of
header search.
---
clang/include/clang/Serialization/ASTReader.h | 5 ++
.../include/clang/Serialization/ModuleFile.h | 6 +-
clang/lib/Serialization/ASTReader.cpp | 78 ++++++++-----------
clang/lib/Serialization/ASTReaderInternals.h | 26 ++++++-
clang/lib/Serialization/ModuleFile.cpp | 1 -
.../lib/Serialization/MultiOnDiskHashTable.h | 69 ++++++++++++----
6 files changed, 115 insertions(+), 70 deletions(-)
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 3ede0925676e8..56a925c600f13 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -385,6 +385,9 @@ struct ModuleLocalLookupTable;
/// The on-disk hash table(s) used for specialization decls.
struct LazySpecializationInfoLookupTable;
+/// The on-disk hash table(s) used for header file infos.
+struct HeaderFileInfoLookupTable;
+
} // namespace reader
} // namespace serialization
@@ -684,6 +687,8 @@ class ASTReader
/// Map from the TU to its lexical contents from each module file.
std::vector<std::pair<ModuleFile*, LexicalContents>> TULexicalDecls;
+ std::unique_ptr<serialization::reader::HeaderFileInfoLookupTable> HeaderFileInfoLookup;
+
/// Map from a DeclContext to its lookup tables.
llvm::DenseMap<const DeclContext *,
serialization::reader::DeclContextLookupTable> Lookups;
diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h
index f20cb2f9f35ae..ce57009da26e3 100644
--- a/clang/include/clang/Serialization/ModuleFile.h
+++ b/clang/include/clang/Serialization/ModuleFile.h
@@ -394,11 +394,7 @@ class ModuleFile {
///
/// This pointer points into a memory buffer, where the on-disk hash
/// table for header file information actually lives.
- const char *HeaderFileInfoTableData = nullptr;
-
- /// The on-disk hash table that contains information about each of
- /// the header files.
- void *HeaderFileInfoTable = nullptr;
+ const unsigned char *HeaderFileInfoTableData = nullptr;
// === Submodule information ===
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 0c5900fbac0a4..7aebef19fa32e 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -2273,6 +2273,18 @@ ASTReader::getGlobalPreprocessedEntityID(ModuleFile &M,
return LocalID + I->second;
}
+void HeaderFileInfoTrait::data_type_builder::merge(data_type D) const {
+ Data.merge(D);
+}
+
+ModuleFile *HeaderFileInfoTrait::ReadFileRef(const unsigned char *&d) {
+ using namespace llvm::support;
+
+ uint32_t ModuleFileID =
+ endian::readNext<uint32_t, llvm::endianness::little, unaligned>(d);
+ return Reader.getLocalModuleFile(M, ModuleFileID);
+}
+
OptionalFileEntryRef
HeaderFileInfoTrait::getFile(const internal_key_type &Key) {
FileManager &FileMgr = Reader.getFileManager();
@@ -2299,6 +2311,10 @@ HeaderFileInfoTrait::GetInternalKey(external_key_type ekey) {
return ikey;
}
+OptionalFileEntryRef HeaderFileInfoTrait::TryGetExternalKey(internal_key_ref ikey) {
+ return getFile(ikey);
+}
+
bool HeaderFileInfoTrait::EqualKey(internal_key_ref a, internal_key_ref b) {
if (a.Size != b.Size || (a.ModTime && b.ModTime && a.ModTime != b.ModTime))
return false;
@@ -2330,9 +2346,9 @@ HeaderFileInfoTrait::ReadKey(const unsigned char *d, unsigned) {
return ikey;
}
-HeaderFileInfoTrait::data_type
-HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d,
- unsigned DataLen) {
+void HeaderFileInfoTrait::ReadDataInto(
+ internal_key_ref key, const unsigned char *d,
+ unsigned DataLen, data_type_builder &Val) {
using namespace llvm::support;
const unsigned char *End = d + DataLen;
@@ -2379,7 +2395,11 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d,
// This HeaderFileInfo was externally loaded.
HFI.External = true;
HFI.IsValid = true;
- return HFI;
+ Val.merge(HFI);
+}
+
+void HeaderFileInfoTrait::MergeDataInto(const data_type &From, data_type_builder &To) {
+ To.merge(From);
}
void ASTReader::addPendingMacro(IdentifierInfo *II, ModuleFile *M,
@@ -4232,12 +4252,17 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
break;
case HEADER_SEARCH_TABLE:
- F.HeaderFileInfoTableData = Blob.data();
+ F.HeaderFileInfoTableData = (const unsigned char *)Blob.data();
F.LocalNumHeaderFileInfos = Record[1];
if (Record[0]) {
- F.HeaderFileInfoTable = HeaderFileInfoLookupTable::Create(
- (const unsigned char *)F.HeaderFileInfoTableData + Record[0],
- (const unsigned char *)F.HeaderFileInfoTableData,
+ if (!HeaderFileInfoLookup) {
+ HeaderFileInfoLookup = std::make_unique<HeaderFileInfoLookupTable>();
+ }
+ HeaderFileInfoLookup->Table.add(
+ &F,
+ F.HeaderFileInfoTableData + Record[0],
+ F.HeaderFileInfoTableData + sizeof(uint32_t),
+ F.HeaderFileInfoTableData,
HeaderFileInfoTrait(*this, F));
PP.getHeaderSearchInfo().SetExternalSource(this);
@@ -6946,43 +6971,8 @@ std::optional<bool> ASTReader::isPreprocessedEntityInFileID(unsigned Index,
return false;
}
-namespace {
-
- /// Visitor used to search for information about a header file.
- class HeaderFileInfoVisitor {
- FileEntryRef FE;
- std::optional<HeaderFileInfo> HFI;
-
- public:
- explicit HeaderFileInfoVisitor(FileEntryRef FE) : FE(FE) {}
-
- bool operator()(ModuleFile &M) {
- HeaderFileInfoLookupTable *Table
- = static_cast<HeaderFileInfoLookupTable *>(M.HeaderFileInfoTable);
- if (!Table)
- return false;
-
- // Look in the on-disk hash table for an entry for this file name.
- HeaderFileInfoLookupTable::iterator Pos = Table->find(FE);
- if (Pos == Table->end())
- return false;
-
- HFI = *Pos;
- return true;
- }
-
- std::optional<HeaderFileInfo> getHeaderFileInfo() const { return HFI; }
- };
-
-} // namespace
-
HeaderFileInfo ASTReader::GetHeaderFileInfo(FileEntryRef FE) {
- HeaderFileInfoVisitor Visitor(FE);
- ModuleMgr.visit(Visitor);
- if (std::optional<HeaderFileInfo> HFI = Visitor.getHeaderFileInfo())
- return *HFI;
-
- return HeaderFileInfo();
+ return HeaderFileInfoLookup->Table.find(FE);
}
void ASTReader::ReadPragmaDiagnosticMappings(DiagnosticsEngine &Diag) {
diff --git a/clang/lib/Serialization/ASTReaderInternals.h b/clang/lib/Serialization/ASTReaderInternals.h
index 4a7794889b039..714bd7d5bb22d 100644
--- a/clang/lib/Serialization/ASTReaderInternals.h
+++ b/clang/lib/Serialization/ASTReaderInternals.h
@@ -373,6 +373,9 @@ class HeaderFileInfoTrait {
ModuleFile &M;
public:
+ // Maximum number of lookup tables we allow before condensing the tables.
+ static const int MaxTables = 64;
+
using external_key_type = FileEntryRef;
struct internal_key_type {
@@ -385,14 +388,24 @@ class HeaderFileInfoTrait {
using internal_key_ref = const internal_key_type &;
using data_type = HeaderFileInfo;
+
+ struct data_type_builder {
+ data_type &Data;
+ data_type_builder(data_type &D) : Data(D) {}
+
+ void merge(data_type D) const;
+ };
+
using hash_value_type = unsigned;
using offset_type = unsigned;
+ using file_type = ModuleFile *;
HeaderFileInfoTrait(ASTReader &Reader, ModuleFile &M)
: Reader(Reader), M(M) {}
static hash_value_type ComputeHash(internal_key_ref ikey);
internal_key_type GetInternalKey(external_key_type ekey);
+ OptionalFileEntryRef TryGetExternalKey(internal_key_ref ikey);
bool EqualKey(internal_key_ref a, internal_key_ref b);
static std::pair<unsigned, unsigned>
@@ -400,15 +413,22 @@ class HeaderFileInfoTrait {
static internal_key_type ReadKey(const unsigned char *d, unsigned);
- data_type ReadData(internal_key_ref,const unsigned char *d, unsigned DataLen);
+ void ReadDataInto(
+ internal_key_ref, const unsigned char *d,
+ unsigned DataLen, data_type_builder &Val);
+
+ static void MergeDataInto(const data_type &From, data_type_builder &To);
+
+ file_type ReadFileRef(const unsigned char *&d);
private:
OptionalFileEntryRef getFile(const internal_key_type &Key);
};
/// The on-disk hash table used for known header files.
-using HeaderFileInfoLookupTable =
- llvm::OnDiskChainedHashTable<HeaderFileInfoTrait>;
+struct HeaderFileInfoLookupTable {
+ MultiOnDiskHashTable<HeaderFileInfoTrait, /*UseExternalKey=*/true> Table;
+};
} // namespace reader
diff --git a/clang/lib/Serialization/ModuleFile.cpp b/clang/lib/Serialization/ModuleFile.cpp
index 4858cdbda5545..5ce808f473af4 100644
--- a/clang/lib/Serialization/ModuleFile.cpp
+++ b/clang/lib/Serialization/ModuleFile.cpp
@@ -24,7 +24,6 @@ using namespace reader;
ModuleFile::~ModuleFile() {
delete static_cast<ASTIdentifierLookupTable *>(IdentifierLookupTable);
- delete static_cast<HeaderFileInfoLookupTable *>(HeaderFileInfoTable);
delete static_cast<ASTSelectorLookupTable *>(SelectorLookupTable);
}
diff --git a/clang/lib/Serialization/MultiOnDiskHashTable.h b/clang/lib/Serialization/MultiOnDiskHashTable.h
index a179016d47419..7e5a6feb76629 100644
--- a/clang/lib/Serialization/MultiOnDiskHashTable.h
+++ b/clang/lib/Serialization/MultiOnDiskHashTable.h
@@ -28,6 +28,7 @@
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/OnDiskHashTable.h"
+#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
@@ -37,7 +38,8 @@ namespace clang {
namespace serialization {
/// A collection of on-disk hash tables, merged when relevant for performance.
-template<typename Info> class MultiOnDiskHashTable {
+template<typename Info, bool UseExternalKey = false>
+class MultiOnDiskHashTable {
public:
/// A handle to a file, used when overriding tables.
using file_type = typename Info::file_type;
@@ -47,6 +49,10 @@ template<typename Info> class MultiOnDiskHashTable {
using external_key_type = typename Info::external_key_type;
using internal_key_type = typename Info::internal_key_type;
+ using merged_key_type = std::conditional_t<
+ UseExternalKey,
+ typename Info::external_key_type,
+ typename Info::internal_key_type>;
using data_type = typename Info::data_type;
using data_type_builder = typename Info::data_type_builder;
using hash_value_type = unsigned;
@@ -72,7 +78,7 @@ template<typename Info> class MultiOnDiskHashTable {
struct MergedTable {
std::vector<file_type> Files;
- llvm::DenseMap<internal_key_type, data_type> Data;
+ llvm::DenseMap<merged_key_type, data_type> Data;
};
using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
@@ -85,6 +91,8 @@ template<typename Info> class MultiOnDiskHashTable {
/// it is the first table.
TableVector Tables;
+ bool CanCondense = true;
+
/// Files corresponding to overridden tables that we've not yet
/// discarded.
llvm::TinyPtrVector<file_type> PendingOverrides;
@@ -141,9 +149,8 @@ template<typename Info> class MultiOnDiskHashTable {
}
void condense() {
- MergedTable *Merged = getMergedTable();
- if (!Merged)
- Merged = new MergedTable;
+ llvm::SaveAndRestore GuardCondense(CanCondense, false);
+ MergedTable *NewMerged = new MergedTable;
// Read in all the tables and merge them together.
// FIXME: Be smarter about which tables we merge.
@@ -157,15 +164,35 @@ template<typename Info> class MultiOnDiskHashTable {
// FIXME: Don't rely on the OnDiskHashTable format here.
auto L = InfoObj.ReadKeyDataLength(LocalPtr);
const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
- data_type_builder ValueBuilder(Merged->Data[Key]);
- InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
- ValueBuilder);
+ if constexpr (UseExternalKey) {
+ if (auto EKey = InfoObj.TryGetExternalKey(Key)) {
+ data_type_builder ValueBuilder(NewMerged->Data[*EKey]);
+ InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
+ ValueBuilder);
+ }
+ } else {
+ data_type_builder ValueBuilder(NewMerged->Data[Key]);
+ InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
+ ValueBuilder);
+ }
}
- Merged->Files.push_back(ODT->File);
- delete ODT;
+ NewMerged->Files.push_back(ODT->File);
}
-
+ MergedTable *Merged = getMergedTable();
+ if (!Merged) {
+ Merged = NewMerged;
+ } else {
+ for (auto &[Key, Value] : NewMerged->Data) {
+ data_type_builder ValueBuilder(Merged->Data[Key]);
+ Info::MergeDataInto(Value, ValueBuilder);
+ }
+ Merged->Files.insert(Merged->Files.end(), NewMerged->Files.begin(),
+ NewMerged->Files.end());
+ delete NewMerged;
+ }
+ for (auto *T : tables())
+ delete T;
Tables.clear();
Tables.push_back(Table(Merged).getOpaqueValue());
}
@@ -213,13 +240,18 @@ template<typename Info> class MultiOnDiskHashTable {
// Read the OnDiskChainedHashTable header.
storage_type Buckets = Data + BucketOffset;
+ add(File, Buckets, Ptr, Data, std::move(InfoObj));
+ }
+
+ void add(file_type File, storage_type Buckets, storage_type Payload,
+ storage_type Base, Info InfoObj) {
auto NumBucketsAndEntries =
OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
// Register the table.
Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
NumBucketsAndEntries.second,
- Buckets, Ptr, Data, std::move(InfoObj));
+ Buckets, Payload, Base, std::move(InfoObj));
Tables.push_back(NewTable.getOpaqueValue());
}
@@ -230,14 +262,15 @@ template<typename Info> class MultiOnDiskHashTable {
if (!PendingOverrides.empty())
removeOverriddenTables();
- if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
+ if (Tables.size() > static_cast<unsigned>(Info::MaxTables) && CanCondense)
condense();
- internal_key_type Key = Info::GetInternalKey(EKey);
- auto KeyHash = Info::ComputeHash(Key);
-
if (MergedTable *M = getMergedTable()) {
- auto It = M->Data.find(Key);
+ merged_key_type MKey = [&] {
+ if constexpr (UseExternalKey) return EKey;
+ else return Info::GetInternalKey(EKey);
+ }();
+ auto It = M->Data.find(MKey);
if (It != M->Data.end())
Result = It->second;
}
@@ -246,6 +279,8 @@ template<typename Info> class MultiOnDiskHashTable {
for (auto *ODT : tables()) {
auto &HT = ODT->Table;
+ const internal_key_type &Key = HT.getInfoObj().GetInternalKey(EKey);
+ auto KeyHash = Info::ComputeHash(Key);
auto It = HT.find_hashed(Key, KeyHash);
if (It != HT.end())
HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
More information about the cfe-commits
mailing list