[clang-tools-extra] 735ab46 - [clangd] Don't create as much garbage while building Dex index.

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Thu May 14 04:51:37 PDT 2020


Author: Sam McCall
Date: 2020-05-14T13:35:44+02:00
New Revision: 735ab46cb4148c92d636c912832a32509252b579

URL: https://github.com/llvm/llvm-project/commit/735ab46cb4148c92d636c912832a32509252b579
DIFF: https://github.com/llvm/llvm-project/commit/735ab46cb4148c92d636c912832a32509252b579.diff

LOG: [clangd] Don't create as much garbage while building Dex index.

Summary:
The Token objects are relatively expensive and we were spending a lot of
CPU creating them for each trigram emitted. Instead, use a tiny trigram
structure until we're ready to finalize the index.

This improves the new BuildDex benchmark by 20%. This code is hot and on
the critical path in clangd: it runs after a new preamble is built.

Reviewers: kbobyrev

Subscribers: ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D79918

Added: 
    

Modified: 
    clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
    clang-tools-extra/clangd/index/dex/Dex.cpp
    clang-tools-extra/clangd/index/dex/Trigram.cpp
    clang-tools-extra/clangd/index/dex/Trigram.h
    clang-tools-extra/clangd/test/Inputs/requests.json
    clang-tools-extra/clangd/unittests/DexTests.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp b/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
index 439ac9c65b9c..26a70de8ee51 100644
--- a/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
+++ b/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
@@ -83,6 +83,12 @@ static void DexQueries(benchmark::State &State) {
 }
 BENCHMARK(DexQueries);
 
+static void DexBuild(benchmark::State &State) {
+  for (auto _ : State)
+    buildDex();
+}
+BENCHMARK(DexBuild);
+
 } // namespace
 } // namespace clangd
 } // namespace clang

diff  --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp
index a663e5387ece..8082ffd75ad9 100644
--- a/clang-tools-extra/clangd/index/dex/Dex.cpp
+++ b/clang-tools-extra/clangd/index/dex/Dex.cpp
@@ -39,28 +39,60 @@ namespace {
 const Token RestrictedForCodeCompletion =
     Token(Token::Kind::Sentinel, "Restricted For Code Completion");
 
-// Returns the tokens which are given symbol's characteristics. Currently, the
-// generated tokens only contain fuzzy matching trigrams and symbol's scope,
-// but in the future this will also return path proximity tokens and other
-// types of tokens such as symbol type (if applicable).
-// Returns the tokens which are given symbols's characteristics. For example,
-// trigrams and scopes.
-// FIXME(kbobyrev): Support more token types:
-// * Namespace proximity
-std::vector<Token> generateSearchTokens(const Symbol &Sym) {
-  std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
-  Result.emplace_back(Token::Kind::Scope, Sym.Scope);
-  // Skip token generation for symbols with unknown declaration location.
-  if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
-    for (const auto &ProximityURI :
-         generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
-      Result.emplace_back(Token::Kind::ProximityURI, ProximityURI);
-  if (Sym.Flags & Symbol::IndexedForCodeCompletion)
-    Result.emplace_back(RestrictedForCodeCompletion);
-  if (!Sym.Type.empty())
-    Result.emplace_back(Token::Kind::Type, Sym.Type);
-  return Result;
-}
+// Helper to efficiently assemble the inverse index (token -> matching docs).
+// The output is a nice uniform structure keyed on Token, but constructing
+// the Token object every time we want to insert into the map is wasteful.
+// Instead we have various maps keyed on things that are cheap to compute,
+// and produce the Token keys once at the end.
+class IndexBuilder {
+  llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
+  std::vector<DocID> RestrictedCCDocs;
+  llvm::StringMap<std::vector<DocID>> TypeDocs;
+  llvm::StringMap<std::vector<DocID>> ScopeDocs;
+  llvm::StringMap<std::vector<DocID>> ProximityDocs;
+  std::vector<Trigram> TrigramScratch;
+
+public:
+  // Add the tokens which are given symbol's characteristics.
+  // This includes fuzzy matching trigrams, symbol's scope, etc.
+  // FIXME(kbobyrev): Support more token types:
+  // * Namespace proximity
+  void add(const Symbol &Sym, DocID D) {
+    generateIdentifierTrigrams(Sym.Name, TrigramScratch);
+    for (Trigram T : TrigramScratch)
+      TrigramDocs[T].push_back(D);
+    ScopeDocs[Sym.Scope].push_back(D);
+    if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
+      for (const auto &ProximityURI :
+           generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
+        ProximityDocs[ProximityURI].push_back(D);
+    if (Sym.Flags & Symbol::IndexedForCodeCompletion)
+      RestrictedCCDocs.push_back(D);
+    if (!Sym.Type.empty())
+      TypeDocs[Sym.Type].push_back(D);
+  }
+
+  // Assemble the final compressed posting lists for the added symbols.
+  llvm::DenseMap<Token, PostingList> build() {
+    llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
+                                              TrigramDocs.size() +
+                                              RestrictedCCDocs.size() +
+                                              TypeDocs.size() +
+                                              ScopeDocs.size() +
+                                              ProximityDocs.size());
+    for (const auto &E : TrigramDocs)
+      Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
+    for (const auto &E : TypeDocs)
+      Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
+    for (const auto &E : ScopeDocs)
+      Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
+    for (const auto &E : ProximityDocs)
+      Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
+    if (!RestrictedCCDocs.empty())
+      Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
+    return Result;
+  }
+};
 
 } // namespace
 
@@ -86,18 +118,11 @@ void Dex::buildIndex() {
     Symbols[I] = ScoredSymbols[I].second;
   }
 
-  // Populate TempInvertedIndex with lists for index symbols.
-  llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
-  for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
-    const auto *Sym = Symbols[SymbolRank];
-    for (const auto &Token : generateSearchTokens(*Sym))
-      TempInvertedIndex[Token].push_back(SymbolRank);
-  }
-
-  // Convert lists of items to posting lists.
-  for (const auto &TokenToPostingList : TempInvertedIndex)
-    InvertedIndex.insert(
-        {TokenToPostingList.first, PostingList(TokenToPostingList.second)});
+  // Build posting lists for symbols.
+  IndexBuilder Builder;
+  for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
+    Builder.add(*Symbols[SymbolRank], SymbolRank);
+  InvertedIndex = Builder.build();
 }
 
 std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {

diff  --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp
index 725c9c409df1..fddca8a95320 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -11,6 +11,7 @@
 #include "Token.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include <cctype>
 #include <queue>
@@ -20,7 +21,9 @@ namespace clang {
 namespace clangd {
 namespace dex {
 
-std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
+// Produce trigrams (including duplicates) and pass them to Out().
+template <typename Func>
+static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Identifier.size());
   calculateRoles(Identifier,
@@ -46,12 +49,6 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
     }
   }
 
-  llvm::DenseSet<Token> UniqueTrigrams;
-
-  auto Add = [&](std::string Chars) {
-    UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
-  };
-
   // Iterate through valid sequences of three characters Fuzzy Matcher can
   // process.
   for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
@@ -64,23 +61,41 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
       for (const unsigned K : Next[J]) {
         if (K == 0)
           continue;
-        Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
-              LowercaseIdentifier[K]}});
+        Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
+                    LowercaseIdentifier[K]));
       }
     }
   }
   // Emit short-query trigrams: FooBar -> f, fo, fb.
   if (!LowercaseIdentifier.empty())
-    Add({LowercaseIdentifier[0]});
+    Out(Trigram(LowercaseIdentifier[0]));
   if (LowercaseIdentifier.size() >= 2)
-    Add({LowercaseIdentifier[0], LowercaseIdentifier[1]});
+    Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1]));
   for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
     if (Roles[I] == Head) {
-      Add({LowercaseIdentifier[0], LowercaseIdentifier[I]});
+      Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I]));
       break;
     }
+}
 
-  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
+void generateIdentifierTrigrams(llvm::StringRef Identifier,
+                                std::vector<Trigram> &Result) {
+  // Empirically, scanning for duplicates is faster with fewer trigrams, and
+  // a hashtable is faster with more. This is a hot path, so dispatch based on
+  // expected number of trigrams. Longer identifiers produce more trigrams.
+  // The magic number was tuned by running IndexBenchmark.DexBuild.
+  constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
+  Result.clear();
+  if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
+    identifierTrigrams(Identifier, [&](Trigram T) {
+      if (!llvm::is_contained(Result, T))
+        Result.push_back(T);
+    });
+  } else {
+    identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
+    llvm::sort(Result);
+    Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
+  }
 }
 
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {

diff  --git a/clang-tools-extra/clangd/index/dex/Trigram.h b/clang-tools-extra/clangd/index/dex/Trigram.h
index bf1e5e809412..28cd4718c350 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.h
+++ b/clang-tools-extra/clangd/index/dex/Trigram.h
@@ -24,6 +24,7 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
 
 #include "Token.h"
+#include "llvm/ADT/bit.h"
 
 #include <string>
 
@@ -31,7 +32,27 @@ namespace clang {
 namespace clangd {
 namespace dex {
 
-/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+// Compact representation of a trigram (string with up to 3 characters).
+// Trigram generation is the hot path of indexing, so Token is too wasteful.
+class Trigram {
+  std::array<char, 4> Data; // Last element is length.
+  // Steal some invalid bit patterns for DenseMap sentinels.
+  enum class Sentinel { Tombstone = 4, Empty = 5 };
+  Trigram(Sentinel S) : Data{0, 0, 0, static_cast<char>(S)} {}
+  uint32_t id() const { return llvm::bit_cast<uint32_t>(Data); }
+
+public:
+  Trigram() : Data{0, 0, 0, 0} {}
+  Trigram(char A) : Data{A, 0, 0, 1} {}
+  Trigram(char A, char B) : Data{A, B, 0, 2} {}
+  Trigram(char A, char B, char C) : Data{A, B, C, 3} {}
+  std::string str() const { return std::string(Data.data(), Data[3]); }
+  friend struct ::llvm::DenseMapInfo<Trigram>;
+  friend bool operator==(Trigram L, Trigram R) { return L.id() == R.id(); }
+  friend bool operator<(Trigram L, Trigram R) { return L.id() < R.id(); }
+};
+
+/// Produces list of unique fuzzy-search trigrams from unqualified symbol.
 /// The trigrams give the 3-character query substrings this symbol can match.
 ///
 /// The symbol's name is broken into segments, e.g. "FooBar" has two segments.
@@ -46,8 +67,9 @@ namespace dex {
 ///  {f, fo, fb, foo, fob, fba, oob, oba, bar}.
 ///
 /// Trigrams are lowercase, as trigram matching is case-insensitive.
-/// Trigrams in the returned list are deduplicated.
-std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
+/// Trigrams in the list are deduplicated.
+void generateIdentifierTrigrams(llvm::StringRef Identifier,
+                                std::vector<Trigram> &Out);
 
 /// Returns list of unique fuzzy-search trigrams given a query.
 ///
@@ -64,4 +86,29 @@ std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
 } // namespace clangd
 } // namespace clang
 
+namespace llvm {
+template <> struct llvm::DenseMapInfo<clang::clangd::dex::Trigram> {
+  using Trigram = clang::clangd::dex::Trigram;
+  static inline Trigram getEmptyKey() {
+    return Trigram(Trigram::Sentinel::Empty);
+  }
+  static inline Trigram getTombstoneKey() {
+    return Trigram(Trigram::Sentinel::Tombstone);
+  }
+  static unsigned getHashValue(Trigram V) {
+    // Finalize step from MurmurHash3.
+    uint32_t X = V.id();
+    X ^= X >> 16;
+    X *= uint32_t{0x85ebca6b};
+    X ^= X >> 13;
+    X *= uint32_t{0xc2b2ae35};
+    X ^= X >> 16;
+    return X;
+  }
+  static bool isEqual(const Trigram &LHS, const Trigram &RHS) {
+    return LHS == RHS;
+  }
+};
+} // namespace llvm
+
 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H

diff  --git a/clang-tools-extra/clangd/test/Inputs/requests.json b/clang-tools-extra/clangd/test/Inputs/requests.json
index ee91cf243749..e5a2f1459760 100644
--- a/clang-tools-extra/clangd/test/Inputs/requests.json
+++ b/clang-tools-extra/clangd/test/Inputs/requests.json
@@ -1,7 +1,7 @@
-[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false}]
+[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false, "PreferredTypes":[]}]

diff  --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp
index 53abb269191b..e83cd3052b65 100644
--- a/clang-tools-extra/clangd/unittests/DexTests.cpp
+++ b/clang-tools-extra/clangd/unittests/DexTests.cpp
@@ -366,38 +366,46 @@ trigramsAre(std::initializer_list<std::string> Trigrams) {
   return tokensAre(Trigrams, Token::Kind::Trigram);
 }
 
+std::vector<Token> identifierTrigramTokens(llvm::StringRef S) {
+  std::vector<Trigram> Trigrams;
+  generateIdentifierTrigrams(S, Trigrams);
+  std::vector<Token> Tokens;
+  for (Trigram T : Trigrams)
+    Tokens.emplace_back(Token::Kind::Trigram, T.str());
+  return Tokens;
+}
+
 TEST(DexTrigrams, IdentifierTrigrams) {
-  EXPECT_THAT(generateIdentifierTrigrams("X86"),
-              trigramsAre({"x86", "x", "x8"}));
+  EXPECT_THAT(identifierTrigramTokens("X86"), trigramsAre({"x86", "x", "x8"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl", "n"}));
+  EXPECT_THAT(identifierTrigramTokens("nl"), trigramsAre({"nl", "n"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n"}));
+  EXPECT_THAT(identifierTrigramTokens("n"), trigramsAre({"n"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("clangd"),
+  EXPECT_THAT(identifierTrigramTokens("clangd"),
               trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
+  EXPECT_THAT(identifierTrigramTokens("abc_def"),
               trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde",
                            "cde", "def"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
+  EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"),
               trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
+  EXPECT_THAT(identifierTrigramTokens("unique_ptr"),
               trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip",
                            "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
                            "uep", "ept", "ptr"}));
 
   EXPECT_THAT(
-      generateIdentifierTrigrams("TUDecl"),
+      identifierTrigramTokens("TUDecl"),
       trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
+  EXPECT_THAT(identifierTrigramTokens("IsOK"),
               trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));
 
   EXPECT_THAT(
-      generateIdentifierTrigrams("abc_defGhij__klm"),
+      identifierTrigramTokens("abc_defGhij__klm"),
       trigramsAre({"a",   "ab",  "ad",  "abc", "abd", "ade", "adg", "bcd",
                    "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk",
                    "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl",


        


More information about the cfe-commits mailing list