[PATCH] D52885: [clangd] Remove one-segment-skipping from Dex trigrams.

Sam McCall via Phabricator via cfe-commits cfe-commits at lists.llvm.org
Thu Oct 4 06:42:20 PDT 2018


sammccall created this revision.
sammccall added reviewers: ilya-biryukov, ioeric.
Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, MaskRay.

Currently queries like "ab" can match identifiers like a_yellow_bee.
The value of allowing this for exactly one segment but no more seems dubious.
It costs ~3% of overall RAM (~9% of posting-list RAM) and some quality.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D52885

Files:
  clangd/index/dex/Trigram.cpp
  clangd/index/dex/Trigram.h
  unittests/clangd/DexTests.cpp


Index: unittests/clangd/DexTests.cpp
===================================================================
--- unittests/clangd/DexTests.cpp
+++ unittests/clangd/DexTests.cpp
@@ -381,8 +381,7 @@
                            "def", "ab$", "ad$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
-              trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace",
-                           "bcd", "bce", "bde", "cde", "ab$"}));
+              trigramsAre({"a$$", "a_$", "a_b", "abc", "bcd", "cde", "ab$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
               trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt",
@@ -398,11 +397,9 @@
 
   EXPECT_THAT(
       generateIdentifierTrigrams("abc_defGhij__klm"),
-      trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh",
-                   "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk",
-                   "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek",
-                   "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl",
-                   "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik",
+      trigramsAre({"a$$", "abc", "abd", "ade", "adg", "bcd", "bde", "bdg",
+                   "cde", "cdg", "def", "deg", "dgh", "dgk", "efg", "egh",
+                   "egk", "fgh", "fgk", "ghi", "ghk", "gkl", "hij", "hik",
                    "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"}));
 }
 
Index: clangd/index/dex/Trigram.h
===================================================================
--- clangd/index/dex/Trigram.h
+++ clangd/index/dex/Trigram.h
@@ -42,8 +42,7 @@
 /// characters is inserted into the result.
 ///
 /// Trigrams can start at any character in the input. Then we can choose to move
-/// to the next character, move to the start of the next segment, or skip over a
-/// segment.
+/// to the next character or move to the start of the next segment.
 ///
 /// This also generates incomplete trigrams for short query scenarios:
 ///  * Empty trigram: "$$$".
Index: clangd/index/dex/Trigram.cpp
===================================================================
--- clangd/index/dex/Trigram.cpp
+++ clangd/index/dex/Trigram.cpp
@@ -46,14 +46,13 @@
   // Next stores tuples of three indices in the presented order, if a variant is
   // not available then 0 is stored.
   std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
-  unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
+  unsigned NextTail = 0, NextHead = 0;
   // Store two first HEAD characters in the identifier (if present).
   std::deque<char> TwoHeads;
   for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
-    Next[I] = {{NextTail, NextHead, NextNextHead}};
+    Next[I] = {{NextTail, NextHead}};
     NextTail = Roles[I] == Tail ? I : 0;
     if (Roles[I] == Head) {
-      NextNextHead = NextHead;
       NextHead = I;
       TwoHeads.push_front(LowercaseIdentifier[I]);
       if (TwoHeads.size() > 2)


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D52885.168282.patch
Type: text/x-patch
Size: 2971 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20181004/a7a935fe/attachment-0001.bin>


More information about the cfe-commits mailing list