[clang] f02b1cc - [ASTWriter] Detect more non-affecting FileIDs to reduce source location duplication (#112015)

via cfe-commits cfe-commits at lists.llvm.org
Fri Nov 8 00:10:41 PST 2024


Author: Ilya Biryukov
Date: 2024-11-08T09:10:37+01:00
New Revision: f02b1cc99e12ac0147d5c334f130a305d85e477a

URL: https://github.com/llvm/llvm-project/commit/f02b1cc99e12ac0147d5c334f130a305d85e477a
DIFF: https://github.com/llvm/llvm-project/commit/f02b1cc99e12ac0147d5c334f130a305d85e477a.diff

LOG: [ASTWriter] Detect more non-affecting FileIDs to reduce source location duplication (#112015)

Currently, any FileID that references a module map file that was
required for a compilation is considered as affecting. This misses an
important opportunity to reduce the source location space taken by the
resulting PCM.

In particular, consider the situation where the same module map file is
passed multiple times in the dependency chain:

```shell
$ clang -fmodule-map-file=foo.modulemap ... -o mod1.pcm
$ clang -fmodule-map-file=foo.modulemap -fmodule-file=mod1.pcm ... -o mod2.pcm
...
$ clang -fmodule-map-file=foo.modulemap -fmodule-file=mod$((N-1)).pcm ... -o mod$N.pcm
```

Because `foo.modulemap` is read before reading any of the `.pcm` files,
we have to create a unique `FileID` for it when creating each module.
However, when reading the `.pcm` files, we will reuse the `FileID`
loaded from it for the same module map file and the `FileID` we created
can never be used again, but we will still mark it as affecting and it
will take the source location space in the output PCM.

For a chain of N dependencies, this results in the file taking `N *
(size of file)` source location space, which could be significant. For
examples, we observer internally that some targets that run out of 2GB
of source location space end up wasting up to 20% of that space in
module maps as described above.

I take extra care to still write the InputFile entries for those files that occupied
source location space before. It is required for correctness of clang-scan-deps.

Added: 
    clang/test/Modules/prune-non-affecting-module-map-repeated.cpp

Modified: 
    clang/include/clang/Serialization/ASTWriter.h
    clang/lib/Serialization/ASTReader.cpp
    clang/lib/Serialization/ASTWriter.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index dc9fcd3c33726e..88d15b84d5198e 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -496,6 +496,9 @@ class ASTWriter : public ASTDeserializationListener,
 
   /// Mapping from a source location entry to whether it is affecting or not.
   llvm::BitVector IsSLocAffecting;
+  /// Mapping from a source location entry to whether it must be included as
+  /// input file.
+  llvm::BitVector IsSLocFileEntryAffecting;
 
   /// Mapping from \c FileID to an index into the FileID adjustment table.
   std::vector<FileID> NonAffectingFileIDs;

diff  --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 79615dc3c018ea..fcb62fba85b7de 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5866,6 +5866,12 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
       }
 
       CurrentModule->Kind = Kind;
+      // Note that we may be rewriting an existing location and it is important
+      // to keep doing that. In particular, we would like to prefer a
+      // `DefinitionLoc` loaded from the module file instead of the location
+      // created in the current source manager, because it allows the new
+      // location to be marked as "unaffecting" when writing and avoid creating
+      // duplicate locations for the same module map file.
       CurrentModule->DefinitionLoc = DefinitionLoc;
       CurrentModule->Signature = F.Signature;
       CurrentModule->IsFromModuleFile = true;

diff  --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 016d1d4acad137..aa8be35e351dec 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -38,6 +38,7 @@
 #include "clang/AST/TypeLocVisitor.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileEntry.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/FileSystemOptions.h"
 #include "clang/Basic/IdentifierTable.h"
@@ -81,6 +82,7 @@
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/STLExtras.h"
@@ -166,7 +168,12 @@ static TypeCode getTypeCodeForTypeClass(Type::TypeClass id) {
 
 namespace {
 
-std::optional<std::set<const FileEntry *>>
+struct AffectingModuleMaps {
+  llvm::DenseSet<FileID> DefinitionFileIDs;
+  llvm::DenseSet<const FileEntry *> DefinitionFiles;
+};
+
+std::optional<AffectingModuleMaps>
 GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
   if (!PP.getHeaderSearchInfo()
            .getHeaderSearchOpts()
@@ -174,10 +181,12 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
     return std::nullopt;
 
   const HeaderSearch &HS = PP.getHeaderSearchInfo();
+  const SourceManager &SM = PP.getSourceManager();
   const ModuleMap &MM = HS.getModuleMap();
 
-  std::set<const FileEntry *> ModuleMaps;
-  std::set<const Module *> ProcessedModules;
+  llvm::DenseSet<FileID> ModuleMaps;
+
+  llvm::DenseSet<const Module *> ProcessedModules;
   auto CollectModuleMapsForHierarchy = [&](const Module *M) {
     M = M->getTopLevelModule();
 
@@ -192,13 +201,13 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
 
       // The containing module map is affecting, because it's being pointed
       // into by Module::DefinitionLoc.
-      if (auto FE = MM.getContainingModuleMapFile(Mod))
-        ModuleMaps.insert(*FE);
+      if (auto F = MM.getContainingModuleMapFileID(Mod); F.isValid())
+        ModuleMaps.insert(F);
       // For inferred modules, the module map that allowed inferring is not
       // related to the virtual containing module map file. It did affect the
       // compilation, though.
-      if (auto FE = MM.getModuleMapFileForUniquing(Mod))
-        ModuleMaps.insert(*FE);
+      if (auto UniqF = MM.getModuleMapFileIDForUniquing(Mod); UniqF.isValid())
+        ModuleMaps.insert(UniqF);
 
       for (auto *SubM : Mod->submodules())
         Q.push(SubM);
@@ -268,7 +277,16 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
   // just ban module map hierarchies where module map defining a (sub)module X
   // includes a module map defining a module that's not a submodule of X.
 
-  return ModuleMaps;
+  llvm::DenseSet<const FileEntry *> ModuleFileEntries;
+  for (FileID MM : ModuleMaps) {
+    if (auto *FE = SM.getFileEntryForID(MM))
+      ModuleFileEntries.insert(FE);
+  }
+
+  AffectingModuleMaps R;
+  R.DefinitionFileIDs = std::move(ModuleMaps);
+  R.DefinitionFiles = std::move(ModuleFileEntries);
+  return std::move(R);
 }
 
 class ASTTypeWriter {
@@ -1770,14 +1788,17 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
       continue;
 
     // Do not emit input files that do not affect current module.
-    if (!IsSLocAffecting[I])
+    if (!IsSLocFileEntryAffecting[I])
       continue;
 
     InputFileEntry Entry(*Cache->OrigEntry);
     Entry.IsSystemFile = isSystem(File.getFileCharacteristic());
     Entry.IsTransient = Cache->IsTransient;
     Entry.BufferOverridden = Cache->BufferOverridden;
-    Entry.IsTopLevel = getAffectingIncludeLoc(SourceMgr, File).isInvalid();
+
+    FileID IncludeFileID = SourceMgr.getFileID(File.getIncludeLoc());
+    Entry.IsTopLevel = IncludeFileID.isInvalid() || IncludeFileID.ID < 0 ||
+                       !IsSLocFileEntryAffecting[IncludeFileID.ID];
     Entry.IsModuleMap = isModuleMap(File.getFileCharacteristic());
 
     uint64_t ContentHash = 0;
@@ -4920,6 +4941,7 @@ void ASTWriter::computeNonAffectingInputFiles() {
   unsigned N = SrcMgr.local_sloc_entry_size();
 
   IsSLocAffecting.resize(N, true);
+  IsSLocFileEntryAffecting.resize(N, true);
 
   if (!WritingModule)
     return;
@@ -4956,10 +4978,12 @@ void ASTWriter::computeNonAffectingInputFiles() {
       continue;
 
     // Don't prune module maps that are affecting.
-    if (llvm::is_contained(*AffectingModuleMaps, *Cache->OrigEntry))
+    if (AffectingModuleMaps->DefinitionFileIDs.contains(FID))
       continue;
 
     IsSLocAffecting[I] = false;
+    IsSLocFileEntryAffecting[I] =
+        AffectingModuleMaps->DefinitionFiles.contains(*Cache->OrigEntry);
 
     FileIDAdjustment += 1;
     // Even empty files take up one element in the offset table.

diff  --git a/clang/test/Modules/prune-non-affecting-module-map-repeated.cpp b/clang/test/Modules/prune-non-affecting-module-map-repeated.cpp
new file mode 100644
index 00000000000000..2c09bc4b01b026
--- /dev/null
+++ b/clang/test/Modules/prune-non-affecting-module-map-repeated.cpp
@@ -0,0 +1,85 @@
+// Check that the same module map file passed to -fmodule-map-file *and*
+// available from one of the `-fmodule-file` does not allocate extra source
+// location space. This optimization is important for using module maps in
+// large codebases to avoid running out of source location space.
+
+// RUN: rm -rf %t && mkdir %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules -fmodule-map-file=%t/base.map -fmodule-name=base -emit-module %t/base.map -o %t/base.pcm
+// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
+// RUN:   -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod1.map \
+// RUN:   -fmodule-file=%t/base.pcm \
+// RUN:   -fmodule-name=mod1 -emit-module %t/mod1.map -o %t/mod1.pcm
+// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
+// RUN:   -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod1.map -fmodule-map-file=%t/mod2.map \
+// RUN:   -fmodule-file=%t/mod1.pcm \
+// RUN:   -fmodule-name=mod2 -emit-module %t/mod2.map -o %t/mod2.pcm
+// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
+// RUN:   -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod2.map -fmodule-map-file=%t/mod3.map \
+// RUN:   -fmodule-file=%t/mod2.pcm \
+// RUN:   -fmodule-name=mod3 -emit-module %t/mod3.map -o %t/mod3.pcm
+// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules \
+// RUN:   -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod3.map -fmodule-map-file=%t/mod4.map \
+// RUN:   -fmodule-file=%t/mod3.pcm \
+// RUN:   -fmodule-name=mod4 -emit-module %t/mod4.map -o %t/mod4.pcm
+// RUN: %clang_cc1 -xc++ -fmodules -fno-implicit-modules -fmodule-map-file=%t/base.map -fmodule-map-file=%t/mod4.map -fmodule-file=%t/mod4.pcm -fsyntax-only -verify %t/check_slocs.cc
+
+//--- base.map
+module base { header "vector.h" }
+//--- mod1.map
+module mod1 { header "mod1.h" }
+//--- mod2.map
+module mod2 { header "mod2.h" }
+//--- mod3.map
+module mod3 { header "mod3.h" }
+//--- mod4.map
+module mod4 { header "mod4.h" }
+//--- check_slocs.cc
+#include "mod4.h"
+#pragma clang __debug sloc_usage // expected-remark {{source manager location address space usage}}
+// expected-note@* {{% of available space}}
+
+// Module map files files that were specified on the command line are entered twice (once when parsing command-line, once loaded from the .pcm)
+// Those that  not specified on the command line must be entered once.
+
+// expected-note at base.map:1 {{file entered 2 times}}
+// expected-note at mod4.map:1 {{file entered 2 times}}
+// expected-note at mod1.map:1 {{file entered 1 time}}
+// expected-note at mod2.map:1 {{file entered 1 time}}
+// expected-note at mod3.map:1 {{file entered 1 time}}
+// expected-note@* + {{file entered}}
+
+
+//--- vector.h
+#ifndef VECTOR_H
+#define VECTOR_H
+#endif
+
+//--- mod1.h
+#ifndef MOD1
+#define MOD1
+#include "vector.h"
+int mod1();
+#endif
+//--- mod2.h
+#ifndef MOD2
+#define MOD2
+#include "vector.h"
+#include "mod1.h"
+int mod2();
+#endif
+//--- mod3.h
+#ifndef MOD3
+#define MOD3
+#include "vector.h"
+#include "mod2.h"
+int mod3();
+#endif
+//--- mod4.h
+#ifndef MOD4
+#define MOD4
+#include "vector.h"
+#include "mod3.h"
+int mod4();
+#endif


        


More information about the cfe-commits mailing list