[PATCH] D113995: [clangd] Dex Trigrams: Improve query trigram generation

Kirill Bobyrev via Phabricator via cfe-commits cfe-commits at lists.llvm.org
Tue Nov 16 06:52:03 PST 2021


kbobyrev updated this revision to Diff 387615.
kbobyrev added a comment.

Add a small comment.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D113995/new/

https://reviews.llvm.org/D113995

Files:
  clang-tools-extra/clangd/index/dex/Trigram.cpp
  clang-tools-extra/clangd/unittests/DexTests.cpp


Index: clang-tools-extra/clangd/unittests/DexTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/DexTests.cpp
+++ clang-tools-extra/clangd/unittests/DexTests.cpp
@@ -404,6 +404,9 @@
   EXPECT_THAT(identifierTrigramTokens("IsOK"),
               trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));
 
+  EXPECT_THAT(identifierTrigramTokens("_pb"), trigramsAre({"_", "_p"}));
+  EXPECT_THAT(identifierTrigramTokens("__pb"), trigramsAre({"_", "__", "_p"}));
+
   EXPECT_THAT(
       identifierTrigramTokens("abc_defGhij__klm"),
       trigramsAre({"a",   "ab",  "ad",  "abc", "abd", "ade", "adg", "bcd",
@@ -422,6 +425,14 @@
   EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"}));
   EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));
 
+  EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m_"}));
+
+  EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"}));
+  EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"}));
+  EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"}));
+  EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"_p"}));
+  EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"_p"}));
+
   EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
   EXPECT_THAT(generateQueryTrigrams("clangd"),
@@ -545,6 +556,18 @@
   Req.Query = "ttf";
   EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
   EXPECT_FALSE(Incomplete) << "3-char string is not a short query";
+
+  I = Dex::build(generateSymbols({"tok::kw_builtin_va_arg", "bar::whatever"}),
+                 RefSlab(), RelationSlab());
+
+  Req.Query = "kw_";
+  EXPECT_THAT(match(*I, Req, &Incomplete),
+              ElementsAre("tok::kw_builtin_va_arg"));
+  EXPECT_FALSE(Incomplete) << "kw_ is enough to match the whole symbol";
+  Req.Scopes = {"tok::"};
+  EXPECT_THAT(match(*I, Req, &Incomplete),
+              ElementsAre("tok::kw_builtin_va_arg"));
+  EXPECT_FALSE(Incomplete) << "kw_ is enough to match the whole symbol";
 }
 
 TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) {
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -101,17 +101,43 @@
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
   if (Query.empty())
     return {};
-  std::string LowercaseQuery = Query.lower();
-  if (Query.size() < 3) // short-query trigrams only
-    return {Token(Token::Kind::Trigram, LowercaseQuery)};
 
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Query.size());
   calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
 
+  std::string LowercaseQuery = Query.lower();
+
+  if (LowercaseQuery.size() < 3) // short-query trigrams only.
+    return {Token(Token::Kind::Trigram, LowercaseQuery)};
+
+  unsigned ValidSymbols =
+      llvm::count_if(Roles, [](CharRole R) { return R == Head || R == Tail; });
+  // If the query does not contain any alphanumeric characters, return no
+  // trigrams so that the results are not restricted by name.
+  if (ValidSymbols == 0)
+    return {};
+  // For queries with very few letters, emulate what generateIdentifierTrigrams
+  // outputs for the beginning of the Identifier.
+  if (ValidSymbols < 3) {
+    std::string Letters =
+        Roles.front() == Separator ? std::string(1, Query.front()) : "";
+    for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
+      if (Roles[I] == Head || Roles[I] == Tail) {
+        Letters += LowercaseQuery[I];
+        // Similar to the identifier trigram generation, stop here for
+        // queries starting with a separator: e.g. "_va" only outputs "_v"
+        // here, while the identifier trigram generator outputs "_" and "_v".
+        if (Roles.front() == Separator)
+          break;
+      }
+    }
+    return {Token(Token::Kind::Trigram, Letters)};
+  }
+
   llvm::DenseSet<Token> UniqueTrigrams;
   std::string Chars;
-  for (unsigned I = 0; I < Query.size(); ++I) {
+  for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
     if (Roles[I] != Head && Roles[I] != Tail)
       continue; // Skip delimiters.
     Chars.push_back(LowercaseQuery[I]);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D113995.387615.patch
Type: text/x-patch
Size: 4306 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20211116/c37802ca/attachment-0001.bin>


More information about the cfe-commits mailing list