[clang] [clang][modules] Move `SLocEntry` search into `ASTReader` (PR #66966)

Jan Svoboda via cfe-commits cfe-commits at lists.llvm.org
Fri Sep 22 11:17:56 PDT 2023


https://github.com/jansvoboda11 updated https://github.com/llvm/llvm-project/pull/66966

>From 4edf9d8559339a12108d9c4d1e2f3bb062a5a768 Mon Sep 17 00:00:00 2001
From: Jan Svoboda <jan_svoboda at apple.com>
Date: Wed, 20 Sep 2023 17:30:45 -0700
Subject: [PATCH 1/4] [clang][modules] Move `SLocEntry` search into `ASTReader`

In `getFileID()` the `SourceManager` ends up doing a binary search over its buffer of `SLocEntries`. For modules, this binary search fully deserializes the entire `SLocEntry` block for each visited entry. This shows up in profiles of the dependency scanner, since that operation includes decompressing buffers associated with some entries.

This patch moves the binary search over loaded entries into `ASTReader`, which now only performs partial deserialization during the binary search, speeding up the scanner by ~3.3%.
---
 clang/include/clang/Basic/SourceManager.h     |  3 +
 clang/include/clang/Serialization/ASTReader.h |  6 ++
 clang/lib/Basic/SourceManager.cpp             | 70 +------------------
 clang/lib/Serialization/ASTReader.cpp         | 63 +++++++++++++++++
 4 files changed, 75 insertions(+), 67 deletions(-)

diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index 2f846502d6f3327..a4c7facddd53d64 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -533,6 +533,9 @@ class ExternalSLocEntrySource {
   /// entry from being loaded.
   virtual bool ReadSLocEntry(int ID) = 0;
 
+  /// Get the index ID for the loaded SourceLocation offset.
+  virtual int getSLocEntryID(SourceLocation::UIntTy SLocOffset) = 0;
+
   /// Retrieve the module import location and name for the given ID, if
   /// in fact it was loaded from a module (rather than, say, a precompiled
   /// header).
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index dc1eb21c27801fe..e643fcf4c930f09 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -2153,6 +2153,12 @@ class ASTReader
 
   /// Read the source location entry with index ID.
   bool ReadSLocEntry(int ID) override;
+  /// Get the index ID for the loaded SourceLocation offset.
+  int getSLocEntryID(SourceLocation::UIntTy SLocOffset) override;
+  /// Read the offset of the SLocEntry at the given index in the given module
+  /// file.
+  std::optional<SourceLocation::UIntTy> readSLocOffset(ModuleFile *F,
+                                                       unsigned Index);
 
   /// Retrieve the module import location and module name for the
   /// given source manager entry ID.
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 0521ac7b30339ab..f881afc2e46c5c6 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -864,74 +864,10 @@ FileID SourceManager::getFileIDLocal(SourceLocation::UIntTy SLocOffset) const {
 /// This function knows that the SourceLocation is in a loaded buffer, not a
 /// local one.
 FileID SourceManager::getFileIDLoaded(SourceLocation::UIntTy SLocOffset) const {
-  if (SLocOffset < CurrentLoadedOffset) {
-    assert(0 && "Invalid SLocOffset or bad function choice");
-    return FileID();
-  }
-
-  // Essentially the same as the local case, but the loaded array is sorted
-  // in the other direction (decreasing order).
-  // GreaterIndex is the one where the offset is greater, which is actually a
-  // lower index!
-  unsigned GreaterIndex = 0;
-  unsigned LessIndex = LoadedSLocEntryTable.size();
-  if (LastFileIDLookup.ID < 0) {
-    // Prune the search space.
-    int LastID = LastFileIDLookup.ID;
-    if (getLoadedSLocEntryByID(LastID).getOffset() > SLocOffset)
-      GreaterIndex =
-          (-LastID - 2) + 1; // Exclude LastID, else we would have hit the cache
-    else
-      LessIndex = -LastID - 2;
-  }
-
-  // First do a linear scan from the last lookup position, if possible.
-  unsigned NumProbes;
+  int ID = ExternalSLocEntries->getSLocEntryID(SLocOffset);
   bool Invalid = false;
-  for (NumProbes = 0; NumProbes < 8; ++NumProbes, ++GreaterIndex) {
-    // Make sure the entry is loaded!
-    const SrcMgr::SLocEntry &E = getLoadedSLocEntry(GreaterIndex, &Invalid);
-    if (Invalid)
-      return FileID(); // invalid entry.
-    if (E.getOffset() <= SLocOffset) {
-      FileID Res = FileID::get(-int(GreaterIndex) - 2);
-      LastFileIDLookup = Res;
-      NumLinearScans += NumProbes + 1;
-      return Res;
-    }
-  }
-
-  // Linear scan failed. Do the binary search.
-  NumProbes = 0;
-  while (true) {
-    ++NumProbes;
-    unsigned MiddleIndex = (LessIndex - GreaterIndex) / 2 + GreaterIndex;
-    const SrcMgr::SLocEntry &E = getLoadedSLocEntry(MiddleIndex, &Invalid);
-    if (Invalid)
-      return FileID(); // invalid entry.
-
-    if (E.getOffset() > SLocOffset) {
-      if (GreaterIndex == MiddleIndex) {
-        assert(0 && "binary search missed the entry");
-        return FileID();
-      }
-      GreaterIndex = MiddleIndex;
-      continue;
-    }
-
-    if (isOffsetInFileID(FileID::get(-int(MiddleIndex) - 2), SLocOffset)) {
-      FileID Res = FileID::get(-int(MiddleIndex) - 2);
-      LastFileIDLookup = Res;
-      NumBinaryProbes += NumProbes;
-      return Res;
-    }
-
-    if (LessIndex == MiddleIndex) {
-      assert(0 && "binary search missed the entry");
-      return FileID();
-    }
-    LessIndex = MiddleIndex;
-  }
+  (void)getLoadedSLocEntryByID(ID, &Invalid);
+  return Invalid ? FileID() : FileID::get(ID);
 }
 
 SourceLocation SourceManager::
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 0952244d037a77c..fdf89dce41aab4d 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -1444,6 +1444,69 @@ llvm::Error ASTReader::ReadSourceManagerBlock(ModuleFile &F) {
   }
 }
 
+std::optional<SourceLocation::UIntTy>
+ASTReader::readSLocOffset(ModuleFile *F, unsigned Index) {
+  BitstreamCursor &Cursor = F->SLocEntryCursor;
+  SavedStreamPosition SavedPosition(Cursor);
+  if (llvm::Error Err = Cursor.JumpToBit(F->SLocEntryOffsetsBase +
+                                         F->SLocEntryOffsets[Index])) {
+    Error(std::move(Err));
+    return std::nullopt;
+  }
+
+  Expected<llvm::BitstreamEntry> MaybeEntry = Cursor.advance();
+  if (!MaybeEntry) {
+    Error(MaybeEntry.takeError());
+    return std::nullopt;
+  }
+  llvm::BitstreamEntry Entry = MaybeEntry.get();
+
+  if (Entry.Kind != llvm::BitstreamEntry::Record) {
+    Error("incorrectly-formatted source location entry in AST file");
+    return std::nullopt;
+  }
+
+  RecordData Record;
+  StringRef Blob;
+  Expected<unsigned> MaybeSLOC = Cursor.readRecord(Entry.ID, Record, &Blob);
+  if (!MaybeSLOC) {
+    Error(MaybeSLOC.takeError());
+    return std::nullopt;
+  }
+  switch (MaybeSLOC.get()) {
+  default:
+    Error("incorrectly-formatted source location entry in AST file");
+    return std::nullopt;
+  case SM_SLOC_FILE_ENTRY:
+  case SM_SLOC_BUFFER_ENTRY:
+  case SM_SLOC_EXPANSION_ENTRY:
+    return F->SLocEntryBaseOffset + Record[0];
+  }
+}
+
+int ASTReader::getSLocEntryID(SourceLocation::UIntTy SLocOffset) {
+  auto SLocMapI =
+      GlobalSLocOffsetMap.find(SourceManager::MaxLoadedOffset - SLocOffset - 1);
+  assert(SLocMapI != GlobalSLocOffsetMap.end() &&
+         "Corrupted global sloc offset map");
+  ModuleFile *F = SLocMapI->second;
+
+  std::vector<unsigned> Indices(F->LocalNumSLocEntries);
+  for (unsigned I = 0; I != F->LocalNumSLocEntries; ++I)
+    Indices[I] = I;
+
+  auto It = llvm::upper_bound(Indices, SLocOffset,
+                    [&](SourceLocation::UIntTy Offset, unsigned Index) {
+                      auto EntryOffset = readSLocOffset(F, Index);
+                      assert(EntryOffset && "Corrupted AST file");
+                      return Offset < *EntryOffset;
+                    });
+  // The iterator points to the first entry with start offset greater than the
+  // offset of interest. The previous entry must contain the offset of interest.
+  It = std::prev(It);
+  return F->SLocEntryBaseID + *It;
+}
+
 bool ASTReader::ReadSLocEntry(int ID) {
   if (ID == 0)
     return false;

>From d793bbdfa0e738545b584d31c53186d53452ce65 Mon Sep 17 00:00:00 2001
From: Jan Svoboda <jan_svoboda at apple.com>
Date: Fri, 22 Sep 2023 10:27:30 -0700
Subject: [PATCH 2/4] [clang][modules] Cache deserialized `SLocEntry` offsets

---
 clang/include/clang/Serialization/ModuleFile.h |  6 ++++--
 clang/lib/Serialization/ASTReader.cpp          | 10 +++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h
index 0af5cae6aebc375..8284d8410732dfb 100644
--- a/clang/include/clang/Serialization/ModuleFile.h
+++ b/clang/include/clang/Serialization/ModuleFile.h
@@ -288,10 +288,12 @@ class ModuleFile {
   /// for the entry is SLocEntryOffsetsBase + SLocEntryOffsets[i].
   uint64_t SLocEntryOffsetsBase = 0;
 
-  /// Offsets for all of the source location entries in the
-  /// AST file.
+  /// Stream bit offsets for all of the source location entries in the AST file.
   const uint32_t *SLocEntryOffsets = nullptr;
 
+  /// SLocEntry offsets that have been loaded from the AST file.
+  std::vector<SourceLocation::UIntTy> SLocEntryOffsetLoaded;
+
   /// SLocEntries that we're going to preload.
   SmallVector<uint64_t, 4> PreloadSLocEntries;
 
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index fdf89dce41aab4d..d9276582f5792df 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -1497,9 +1497,12 @@ int ASTReader::getSLocEntryID(SourceLocation::UIntTy SLocOffset) {
 
   auto It = llvm::upper_bound(Indices, SLocOffset,
                     [&](SourceLocation::UIntTy Offset, unsigned Index) {
-                      auto EntryOffset = readSLocOffset(F, Index);
-                      assert(EntryOffset && "Corrupted AST file");
-                      return Offset < *EntryOffset;
+                      if (F->SLocEntryOffsetLoaded[Index] == -1U) {
+                        auto MaybeEntryOffset = readSLocOffset(F, Index);
+                        assert(MaybeEntryOffset && "Corrupted AST file");
+                        F->SLocEntryOffsetLoaded[Index] = *MaybeEntryOffset;
+                      }
+                      return Offset < F->SLocEntryOffsetLoaded[Index];
                     });
   // The iterator points to the first entry with start offset greater than the
   // offset of interest. The previous entry must contain the offset of interest.
@@ -3606,6 +3609,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
         return llvm::createStringError(std::errc::invalid_argument,
                                        "ran out of source locations");
       }
+      F.SLocEntryOffsetLoaded.resize(F.LocalNumSLocEntries, -1U);
       // Make our entry in the range map. BaseID is negative and growing, so
       // we invert it. Because we invert it, though, we need the other end of
       // the range.

>From 9981f50a130019c3b50d0a6b5d09ba81f7a6e936 Mon Sep 17 00:00:00 2001
From: Jan Svoboda <jan_svoboda at apple.com>
Date: Fri, 22 Sep 2023 10:28:06 -0700
Subject: [PATCH 3/4] [clang][modules] Don't allocate when searching for FileID

---
 clang/lib/Serialization/ASTReader.cpp | 23 ++++----
 llvm/include/llvm/ADT/STLExtras.h     | 77 ++++++++++++++++++---------
 2 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index d9276582f5792df..b8c7611f1913c81 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -1491,19 +1491,16 @@ int ASTReader::getSLocEntryID(SourceLocation::UIntTy SLocOffset) {
          "Corrupted global sloc offset map");
   ModuleFile *F = SLocMapI->second;
 
-  std::vector<unsigned> Indices(F->LocalNumSLocEntries);
-  for (unsigned I = 0; I != F->LocalNumSLocEntries; ++I)
-    Indices[I] = I;
-
-  auto It = llvm::upper_bound(Indices, SLocOffset,
-                    [&](SourceLocation::UIntTy Offset, unsigned Index) {
-                      if (F->SLocEntryOffsetLoaded[Index] == -1U) {
-                        auto MaybeEntryOffset = readSLocOffset(F, Index);
-                        assert(MaybeEntryOffset && "Corrupted AST file");
-                        F->SLocEntryOffsetLoaded[Index] = *MaybeEntryOffset;
-                      }
-                      return Offset < F->SLocEntryOffsetLoaded[Index];
-                    });
+  auto It = llvm::upper_bound(
+      llvm::index_range(0, F->LocalNumSLocEntries), SLocOffset,
+      [&](SourceLocation::UIntTy Offset, std::size_t Index) {
+        if (F->SLocEntryOffsetLoaded[Index] == -1U) {
+          auto MaybeEntryOffset = readSLocOffset(F, Index);
+          assert(MaybeEntryOffset && "Corrupted AST file");
+          F->SLocEntryOffsetLoaded[Index] = *MaybeEntryOffset;
+        }
+        return Offset < F->SLocEntryOffsetLoaded[Index];
+      });
   // The iterator points to the first entry with start offset greater than the
   // offset of interest. The previous entry must contain the offset of interest.
   It = std::prev(It);
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 5b926864f0cc4a2..8384dedf6365a57 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -2261,43 +2261,68 @@ template <typename... Refs> struct enumerator_result<std::size_t, Refs...> {
   mutable range_reference_tuple Storage;
 };
 
-/// Infinite stream of increasing 0-based `size_t` indices.
-struct index_stream {
-  struct iterator : iterator_facade_base<iterator, std::forward_iterator_tag,
-                                         const iterator> {
-    iterator &operator++() {
-      assert(Index != std::numeric_limits<std::size_t>::max() &&
-             "Attempting to increment end iterator");
-      ++Index;
-      return *this;
-    }
+struct index_iterator
+    : llvm::iterator_facade_base<index_iterator,
+                                 std::random_access_iterator_tag, std::size_t,
+                                 std::ptrdiff_t> {
+  index_iterator(std::size_t Index) : Index(Index) {}
+
+  index_iterator &operator+=(std::ptrdiff_t N) {
+    Index += N;
+    return *this;
+  }
 
-    // Note: This dereference operator returns a value instead of a reference
-    // and does not strictly conform to the C++17's definition of forward
-    // iterator. However, it satisfies all the forward_iterator requirements
-    // that the `zip_common` depends on and fully conforms to the C++20
-    // definition of forward iterator.
-    std::size_t operator*() const { return Index; }
+  index_iterator &operator-=(std::ptrdiff_t N) {
+    Index -= N;
+    return *this;
+  }
 
-    friend bool operator==(const iterator &Lhs, const iterator &Rhs) {
-      return Lhs.Index == Rhs.Index;
-    }
+  std::ptrdiff_t operator-(const index_iterator &R) const {
+    return Index - R.Index;
+  }
 
-    std::size_t Index = 0;
-  };
+  // Note: This dereference operator returns a value instead of a reference
+  // and does not strictly conform to the C++17's definition of forward
+  // iterator. However, it satisfies all the forward_iterator requirements
+  // that the `zip_common` depends on and fully conforms to the C++20
+  // definition of forward iterator.
+  std::size_t operator*() const { return Index; }
 
-  iterator begin() const { return {}; }
-  iterator end() const {
+  friend bool operator==(const index_iterator &Lhs, const index_iterator &Rhs) {
+    return Lhs.Index == Rhs.Index;
+  }
+
+  friend bool operator<(const index_iterator &Lhs, const index_iterator &Rhs) {
+    return Lhs.Index < Rhs.Index;
+  }
+
+private:
+  std::size_t Index;
+};
+
+/// Infinite stream of increasing 0-based `size_t` indices.
+struct index_stream {
+  index_iterator begin() const { return {0}; }
+  index_iterator end() const {
     // We approximate 'infinity' with the max size_t value, which should be good
     // enough to index over any container.
-    iterator It;
-    It.Index = std::numeric_limits<std::size_t>::max();
-    return It;
+    return index_iterator{std::numeric_limits<std::size_t>::max()};
   }
 };
 
 } // end namespace detail
 
+/// Increasing range of `size_t` indices.
+class index_range {
+  std::size_t Begin;
+  std::size_t End;
+
+public:
+  index_range(std::size_t Begin, std::size_t End) : Begin(Begin), End(End) {}
+  detail::index_iterator begin() const { return {Begin}; }
+  detail::index_iterator end() const { return {End}; }
+};
+
 /// Given two or more input ranges, returns a new range whose values are are
 /// tuples (A, B, C, ...), such that A is the 0-based index of the item in the
 /// sequence, and B, C, ..., are the values from the original input ranges. All

>From efedf1270722a5a96f83be0bddbceb0fc693f9be Mon Sep 17 00:00:00 2001
From: Jan Svoboda <jan_svoboda at apple.com>
Date: Fri, 22 Sep 2023 11:16:18 -0700
Subject: [PATCH 4/4] [clang][modules] Report failures from lightweight
 deserialization, save potentially unnecessary heavyweight deserialization

---
 clang/include/clang/Basic/SourceManager.h     |  3 +++
 clang/include/clang/Serialization/ASTReader.h |  4 ++--
 clang/lib/Basic/SourceManager.cpp             |  5 +----
 clang/lib/Serialization/ASTReader.cpp         | 14 +++++++++++---
 4 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index a4c7facddd53d64..71e3256ffa55f36 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -534,6 +534,9 @@ class ExternalSLocEntrySource {
   virtual bool ReadSLocEntry(int ID) = 0;
 
   /// Get the index ID for the loaded SourceLocation offset.
+  ///
+  /// \returns Invalid index ID (0) if an error occurred that prevented the
+  /// SLocEntry from being loaded.
   virtual int getSLocEntryID(SourceLocation::UIntTy SLocOffset) = 0;
 
   /// Retrieve the module import location and name for the given ID, if
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index e643fcf4c930f09..315e1d6afa3a552 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -2155,8 +2155,8 @@ class ASTReader
   bool ReadSLocEntry(int ID) override;
   /// Get the index ID for the loaded SourceLocation offset.
   int getSLocEntryID(SourceLocation::UIntTy SLocOffset) override;
-  /// Read the offset of the SLocEntry at the given index in the given module
-  /// file.
+  /// Try to read the offset of the SLocEntry at the given index in the given
+  /// module file.
   std::optional<SourceLocation::UIntTy> readSLocOffset(ModuleFile *F,
                                                        unsigned Index);
 
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index f881afc2e46c5c6..298d4d605c18b6d 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -864,10 +864,7 @@ FileID SourceManager::getFileIDLocal(SourceLocation::UIntTy SLocOffset) const {
 /// This function knows that the SourceLocation is in a loaded buffer, not a
 /// local one.
 FileID SourceManager::getFileIDLoaded(SourceLocation::UIntTy SLocOffset) const {
-  int ID = ExternalSLocEntries->getSLocEntryID(SLocOffset);
-  bool Invalid = false;
-  (void)getLoadedSLocEntryByID(ID, &Invalid);
-  return Invalid ? FileID() : FileID::get(ID);
+  return FileID::get(ExternalSLocEntries->getSLocEntryID(SLocOffset));
 }
 
 SourceLocation SourceManager::
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index b8c7611f1913c81..f25a0665f18d2ec 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -1491,20 +1491,28 @@ int ASTReader::getSLocEntryID(SourceLocation::UIntTy SLocOffset) {
          "Corrupted global sloc offset map");
   ModuleFile *F = SLocMapI->second;
 
+  bool Invalid = false;
+
   auto It = llvm::upper_bound(
       llvm::index_range(0, F->LocalNumSLocEntries), SLocOffset,
       [&](SourceLocation::UIntTy Offset, std::size_t Index) {
         if (F->SLocEntryOffsetLoaded[Index] == -1U) {
           auto MaybeEntryOffset = readSLocOffset(F, Index);
-          assert(MaybeEntryOffset && "Corrupted AST file");
+          if (!MaybeEntryOffset) {
+            Invalid = true;
+            return true;
+          }
           F->SLocEntryOffsetLoaded[Index] = *MaybeEntryOffset;
         }
         return Offset < F->SLocEntryOffsetLoaded[Index];
       });
+
+  if (Invalid)
+    return 0;
+
   // The iterator points to the first entry with start offset greater than the
   // offset of interest. The previous entry must contain the offset of interest.
-  It = std::prev(It);
-  return F->SLocEntryBaseID + *It;
+  return F->SLocEntryBaseID + *std::prev(It);
 }
 
 bool ASTReader::ReadSLocEntry(int ID) {



More information about the cfe-commits mailing list