[PATCH] D50700: [clangd] Generate better incomplete bigrams for the Dex index
Kirill Bobyrev via Phabricator via cfe-commits
cfe-commits at lists.llvm.org
Tue Aug 14 04:51:19 PDT 2018
kbobyrev updated this revision to Diff 160555.
kbobyrev added a comment.
Treat leading underscores as additional signals and don't extract two heads in that case.
https://reviews.llvm.org/D50700
Files:
clang-tools-extra/clangd/index/dex/Trigram.cpp
clang-tools-extra/clangd/index/dex/Trigram.h
clang-tools-extra/unittests/clangd/DexIndexTests.cpp
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -321,6 +321,9 @@
EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"}));
EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"}));
+ EXPECT_THAT(generateQueryTrigrams("u_p"), trigramsAre({"up$"}));
+ EXPECT_THAT(generateQueryTrigrams("_u_p"), trigramsAre({"_u_"}));
+
EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
EXPECT_THAT(generateQueryTrigrams("clangd"),
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.h
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -62,7 +62,11 @@
///
/// For short queries (less than 3 characters with Head or Tail roles in Fuzzy
/// Matching segmentation) this returns a single trigram with the first
-/// characters (up to 3) to perfrom prefix match.
+/// characters (up to 3) to perform prefix match. However, if the query is short
+/// but contains two HEAD symbols, the returned trigram is an incomplete bigram
+/// with those two HEADs (unless the query starts with '_', which is treated as
+/// additional information). This helps to match "unique_ptr" and similar
+/// symbols with the "u_p" query.
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
} // namespace dex
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -116,21 +116,39 @@
// Additional pass is necessary to count valid identifier characters.
// Depending on that, this function might return incomplete trigram.
+ unsigned Heads = 0;
unsigned ValidSymbolsCount = 0;
- for (size_t I = 0; I < Roles.size(); ++I)
- if (Roles[I] == Head || Roles[I] == Tail)
+ for (size_t I = 0; I < Roles.size(); ++I) {
+ if (Roles[I] == Head) {
+ ++ValidSymbolsCount;
+ ++Heads;
+ } else if (Roles[I] == Tail) {
++ValidSymbolsCount;
+ }
+ }
std::string LowercaseQuery = Query.lower();
DenseSet<Token> UniqueTrigrams;
// If the number of symbols which can form fuzzy matching trigram is not
// sufficient, generate a single incomplete trigram for query.
if (ValidSymbolsCount < 3) {
- std::string Chars =
- LowercaseQuery.substr(0, std::min<size_t>(3UL, Query.size()));
- Chars.append(3 - Chars.size(), END_MARKER);
+ std::string Chars;
+ // If the query is not long enough to form a trigram but contains two heads,
+ // the returned trigram should be "xy$" where "x" and "y" are the heads.
+ // This might be particularly important for cases like "u_p" to match
+ // "unique_ptr" and similar symbols from the C++ Standard Library.
+ if (Heads == 2 && !Query.startswith("_")) {
+ for (size_t I = 0; I < LowercaseQuery.size(); ++I)
+ if (Roles[I] == Head)
+ Chars += LowercaseQuery[I];
+
+ Chars += END_MARKER;
+ } else {
+ Chars = LowercaseQuery.substr(0, std::min<size_t>(3UL, Query.size()));
+ Chars.append(3 - Chars.size(), END_MARKER);
+ }
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
} else {
std::deque<char> Chars;
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D50700.160555.patch
Type: text/x-patch
Size: 3493 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20180814/bd338870/attachment-0001.bin>
More information about the cfe-commits
mailing list