[clang-tools-extra] 735ab46 - [clangd] Don't create as much garbage while building Dex index.
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Thu May 14 04:51:37 PDT 2020
Author: Sam McCall
Date: 2020-05-14T13:35:44+02:00
New Revision: 735ab46cb4148c92d636c912832a32509252b579
URL: https://github.com/llvm/llvm-project/commit/735ab46cb4148c92d636c912832a32509252b579
DIFF: https://github.com/llvm/llvm-project/commit/735ab46cb4148c92d636c912832a32509252b579.diff
LOG: [clangd] Don't create as much garbage while building Dex index.
Summary:
The Token objects are relatively expensive and we were spending a lot of
CPU creating them for each trigram emitted. Instead, use a tiny trigram
structure until we're ready to finalize the index.
This improves the new BuildDex benchmark by 20%. This code is hot and on
the critical path in clangd: it runs after a new preamble is built.
Reviewers: kbobyrev
Subscribers: ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, usaxena95, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D79918
Added:
Modified:
clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
clang-tools-extra/clangd/index/dex/Dex.cpp
clang-tools-extra/clangd/index/dex/Trigram.cpp
clang-tools-extra/clangd/index/dex/Trigram.h
clang-tools-extra/clangd/test/Inputs/requests.json
clang-tools-extra/clangd/unittests/DexTests.cpp
Removed:
################################################################################
diff --git a/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp b/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
index 439ac9c65b9c..26a70de8ee51 100644
--- a/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
+++ b/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
@@ -83,6 +83,12 @@ static void DexQueries(benchmark::State &State) {
}
BENCHMARK(DexQueries);
+static void DexBuild(benchmark::State &State) {
+ for (auto _ : State)
+ buildDex();
+}
+BENCHMARK(DexBuild);
+
} // namespace
} // namespace clangd
} // namespace clang
diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp
index a663e5387ece..8082ffd75ad9 100644
--- a/clang-tools-extra/clangd/index/dex/Dex.cpp
+++ b/clang-tools-extra/clangd/index/dex/Dex.cpp
@@ -39,28 +39,60 @@ namespace {
const Token RestrictedForCodeCompletion =
Token(Token::Kind::Sentinel, "Restricted For Code Completion");
-// Returns the tokens which are given symbol's characteristics. Currently, the
-// generated tokens only contain fuzzy matching trigrams and symbol's scope,
-// but in the future this will also return path proximity tokens and other
-// types of tokens such as symbol type (if applicable).
-// Returns the tokens which are given symbols's characteristics. For example,
-// trigrams and scopes.
-// FIXME(kbobyrev): Support more token types:
-// * Namespace proximity
-std::vector<Token> generateSearchTokens(const Symbol &Sym) {
- std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
- Result.emplace_back(Token::Kind::Scope, Sym.Scope);
- // Skip token generation for symbols with unknown declaration location.
- if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
- for (const auto &ProximityURI :
- generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
- Result.emplace_back(Token::Kind::ProximityURI, ProximityURI);
- if (Sym.Flags & Symbol::IndexedForCodeCompletion)
- Result.emplace_back(RestrictedForCodeCompletion);
- if (!Sym.Type.empty())
- Result.emplace_back(Token::Kind::Type, Sym.Type);
- return Result;
-}
+// Helper to efficiently assemble the inverse index (token -> matching docs).
+// The output is a nice uniform structure keyed on Token, but constructing
+// the Token object every time we want to insert into the map is wasteful.
+// Instead we have various maps keyed on things that are cheap to compute,
+// and produce the Token keys once at the end.
+class IndexBuilder {
+ llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
+ std::vector<DocID> RestrictedCCDocs;
+ llvm::StringMap<std::vector<DocID>> TypeDocs;
+ llvm::StringMap<std::vector<DocID>> ScopeDocs;
+ llvm::StringMap<std::vector<DocID>> ProximityDocs;
+ std::vector<Trigram> TrigramScratch;
+
+public:
+ // Add the tokens which are given symbol's characteristics.
+ // This includes fuzzy matching trigrams, symbol's scope, etc.
+ // FIXME(kbobyrev): Support more token types:
+ // * Namespace proximity
+ void add(const Symbol &Sym, DocID D) {
+ generateIdentifierTrigrams(Sym.Name, TrigramScratch);
+ for (Trigram T : TrigramScratch)
+ TrigramDocs[T].push_back(D);
+ ScopeDocs[Sym.Scope].push_back(D);
+ if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
+ for (const auto &ProximityURI :
+ generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
+ ProximityDocs[ProximityURI].push_back(D);
+ if (Sym.Flags & Symbol::IndexedForCodeCompletion)
+ RestrictedCCDocs.push_back(D);
+ if (!Sym.Type.empty())
+ TypeDocs[Sym.Type].push_back(D);
+ }
+
+ // Assemble the final compressed posting lists for the added symbols.
+ llvm::DenseMap<Token, PostingList> build() {
+ llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
+ TrigramDocs.size() +
+ RestrictedCCDocs.size() +
+ TypeDocs.size() +
+ ScopeDocs.size() +
+ ProximityDocs.size());
+ for (const auto &E : TrigramDocs)
+ Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
+ for (const auto &E : TypeDocs)
+ Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
+ for (const auto &E : ScopeDocs)
+ Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
+ for (const auto &E : ProximityDocs)
+ Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
+ if (!RestrictedCCDocs.empty())
+ Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
+ return Result;
+ }
+};
} // namespace
@@ -86,18 +118,11 @@ void Dex::buildIndex() {
Symbols[I] = ScoredSymbols[I].second;
}
- // Populate TempInvertedIndex with lists for index symbols.
- llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
- for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
- const auto *Sym = Symbols[SymbolRank];
- for (const auto &Token : generateSearchTokens(*Sym))
- TempInvertedIndex[Token].push_back(SymbolRank);
- }
-
- // Convert lists of items to posting lists.
- for (const auto &TokenToPostingList : TempInvertedIndex)
- InvertedIndex.insert(
- {TokenToPostingList.first, PostingList(TokenToPostingList.second)});
+ // Build posting lists for symbols.
+ IndexBuilder Builder;
+ for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
+ Builder.add(*Symbols[SymbolRank], SymbolRank);
+ InvertedIndex = Builder.build();
}
std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp
index 725c9c409df1..fddca8a95320 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -11,6 +11,7 @@
#include "Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include <cctype>
#include <queue>
@@ -20,7 +21,9 @@ namespace clang {
namespace clangd {
namespace dex {
-std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
+// Produce trigrams (including duplicates) and pass them to Out().
+template <typename Func>
+static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Identifier.size());
calculateRoles(Identifier,
@@ -46,12 +49,6 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
}
}
- llvm::DenseSet<Token> UniqueTrigrams;
-
- auto Add = [&](std::string Chars) {
- UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
- };
-
// Iterate through valid sequences of three characters Fuzzy Matcher can
// process.
for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
@@ -64,23 +61,41 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
for (const unsigned K : Next[J]) {
if (K == 0)
continue;
- Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
- LowercaseIdentifier[K]}});
+ Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
+ LowercaseIdentifier[K]));
}
}
}
// Emit short-query trigrams: FooBar -> f, fo, fb.
if (!LowercaseIdentifier.empty())
- Add({LowercaseIdentifier[0]});
+ Out(Trigram(LowercaseIdentifier[0]));
if (LowercaseIdentifier.size() >= 2)
- Add({LowercaseIdentifier[0], LowercaseIdentifier[1]});
+ Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1]));
for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
if (Roles[I] == Head) {
- Add({LowercaseIdentifier[0], LowercaseIdentifier[I]});
+ Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I]));
break;
}
+}
- return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
+void generateIdentifierTrigrams(llvm::StringRef Identifier,
+ std::vector<Trigram> &Result) {
+ // Empirically, scanning for duplicates is faster with fewer trigrams, and
+ // a hashtable is faster with more. This is a hot path, so dispatch based on
+ // expected number of trigrams. Longer identifiers produce more trigrams.
+ // The magic number was tuned by running IndexBenchmark.DexBuild.
+ constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
+ Result.clear();
+ if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
+ identifierTrigrams(Identifier, [&](Trigram T) {
+ if (!llvm::is_contained(Result, T))
+ Result.push_back(T);
+ });
+ } else {
+ identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
+ llvm::sort(Result);
+ Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
+ }
}
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
diff --git a/clang-tools-extra/clangd/index/dex/Trigram.h b/clang-tools-extra/clangd/index/dex/Trigram.h
index bf1e5e809412..28cd4718c350 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.h
+++ b/clang-tools-extra/clangd/index/dex/Trigram.h
@@ -24,6 +24,7 @@
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
#include "Token.h"
+#include "llvm/ADT/bit.h"
#include <string>
@@ -31,7 +32,27 @@ namespace clang {
namespace clangd {
namespace dex {
-/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+// Compact representation of a trigram (string with up to 3 characters).
+// Trigram generation is the hot path of indexing, so Token is too wasteful.
+class Trigram {
+ std::array<char, 4> Data; // Last element is length.
+ // Steal some invalid bit patterns for DenseMap sentinels.
+ enum class Sentinel { Tombstone = 4, Empty = 5 };
+ Trigram(Sentinel S) : Data{0, 0, 0, static_cast<char>(S)} {}
+ uint32_t id() const { return llvm::bit_cast<uint32_t>(Data); }
+
+public:
+ Trigram() : Data{0, 0, 0, 0} {}
+ Trigram(char A) : Data{A, 0, 0, 1} {}
+ Trigram(char A, char B) : Data{A, B, 0, 2} {}
+ Trigram(char A, char B, char C) : Data{A, B, C, 3} {}
+ std::string str() const { return std::string(Data.data(), Data[3]); }
+ friend struct ::llvm::DenseMapInfo<Trigram>;
+ friend bool operator==(Trigram L, Trigram R) { return L.id() == R.id(); }
+ friend bool operator<(Trigram L, Trigram R) { return L.id() < R.id(); }
+};
+
+/// Produces list of unique fuzzy-search trigrams from unqualified symbol.
/// The trigrams give the 3-character query substrings this symbol can match.
///
/// The symbol's name is broken into segments, e.g. "FooBar" has two segments.
@@ -46,8 +67,9 @@ namespace dex {
/// {f, fo, fb, foo, fob, fba, oob, oba, bar}.
///
/// Trigrams are lowercase, as trigram matching is case-insensitive.
-/// Trigrams in the returned list are deduplicated.
-std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
+/// Trigrams in the list are deduplicated.
+void generateIdentifierTrigrams(llvm::StringRef Identifier,
+ std::vector<Trigram> &Out);
/// Returns list of unique fuzzy-search trigrams given a query.
///
@@ -64,4 +86,29 @@ std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
} // namespace clangd
} // namespace clang
+namespace llvm {
+template <> struct llvm::DenseMapInfo<clang::clangd::dex::Trigram> {
+ using Trigram = clang::clangd::dex::Trigram;
+ static inline Trigram getEmptyKey() {
+ return Trigram(Trigram::Sentinel::Empty);
+ }
+ static inline Trigram getTombstoneKey() {
+ return Trigram(Trigram::Sentinel::Tombstone);
+ }
+ static unsigned getHashValue(Trigram V) {
+ // Finalize step from MurmurHash3.
+ uint32_t X = V.id();
+ X ^= X >> 16;
+ X *= uint32_t{0x85ebca6b};
+ X ^= X >> 13;
+ X *= uint32_t{0xc2b2ae35};
+ X ^= X >> 16;
+ return X;
+ }
+ static bool isEqual(const Trigram &LHS, const Trigram &RHS) {
+ return LHS == RHS;
+ }
+};
+} // namespace llvm
+
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
diff --git a/clang-tools-extra/clangd/test/Inputs/requests.json b/clang-tools-extra/clangd/test/Inputs/requests.json
index ee91cf243749..e5a2f1459760 100644
--- a/clang-tools-extra/clangd/test/Inputs/requests.json
+++ b/clang-tools-extra/clangd/test/Inputs/requests.json
@@ -1,7 +1,7 @@
-[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false},
-{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false}]
+[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false, "PreferredTypes":[]},
+{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false, "PreferredTypes":[]}]
diff --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp
index 53abb269191b..e83cd3052b65 100644
--- a/clang-tools-extra/clangd/unittests/DexTests.cpp
+++ b/clang-tools-extra/clangd/unittests/DexTests.cpp
@@ -366,38 +366,46 @@ trigramsAre(std::initializer_list<std::string> Trigrams) {
return tokensAre(Trigrams, Token::Kind::Trigram);
}
+std::vector<Token> identifierTrigramTokens(llvm::StringRef S) {
+ std::vector<Trigram> Trigrams;
+ generateIdentifierTrigrams(S, Trigrams);
+ std::vector<Token> Tokens;
+ for (Trigram T : Trigrams)
+ Tokens.emplace_back(Token::Kind::Trigram, T.str());
+ return Tokens;
+}
+
TEST(DexTrigrams, IdentifierTrigrams) {
- EXPECT_THAT(generateIdentifierTrigrams("X86"),
- trigramsAre({"x86", "x", "x8"}));
+ EXPECT_THAT(identifierTrigramTokens("X86"), trigramsAre({"x86", "x", "x8"}));
- EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl", "n"}));
+ EXPECT_THAT(identifierTrigramTokens("nl"), trigramsAre({"nl", "n"}));
- EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n"}));
+ EXPECT_THAT(identifierTrigramTokens("n"), trigramsAre({"n"}));
- EXPECT_THAT(generateIdentifierTrigrams("clangd"),
+ EXPECT_THAT(identifierTrigramTokens("clangd"),
trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));
- EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
+ EXPECT_THAT(identifierTrigramTokens("abc_def"),
trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde",
"cde", "def"}));
- EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
+ EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"),
trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"}));
- EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
+ EXPECT_THAT(identifierTrigramTokens("unique_ptr"),
trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip",
"npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
"uep", "ept", "ptr"}));
EXPECT_THAT(
- generateIdentifierTrigrams("TUDecl"),
+ identifierTrigramTokens("TUDecl"),
trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"}));
- EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
+ EXPECT_THAT(identifierTrigramTokens("IsOK"),
trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));
EXPECT_THAT(
- generateIdentifierTrigrams("abc_defGhij__klm"),
+ identifierTrigramTokens("abc_defGhij__klm"),
trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd",
"bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk",
"efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl",
More information about the cfe-commits
mailing list