[clang-tools-extra] r319557 - [clangd] Fuzzy match scorer

Fri Dec 1 09:08:02 PST 2017

Author: sammccall
Date: Fri Dec  1 09:08:02 2017
New Revision: 319557

URL: http://llvm.org/viewvc/llvm-project?rev=319557&view=rev
Log:
[clangd] Fuzzy match scorer

Summary:
This will be used for rescoring code completion results based on partial
identifiers.
Short-term use:
  - we want to limit the number of code completion results returned to
  improve performance of global completion. The scorer will be used to
  rerank the results to return when the user has applied a filter.
Long-term use case:
  - ranking of completion results from in-memory index
  - merging of completion results from multiple sources (merging usually
  works best when done at the component-score level, rescoring the
  fuzzy-match quality avoids different backends needing to have
  comparable scores)

Reviewers: ilya-biryukov

Subscribers: cfe-commits, mgorny

Differential Revision: https://reviews.llvm.org/D40060

Added:
    clang-tools-extra/trunk/clangd/FuzzyMatch.cpp
    clang-tools-extra/trunk/clangd/FuzzyMatch.h
    clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp
Modified:
    clang-tools-extra/trunk/clangd/CMakeLists.txt
    clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt

Modified: clang-tools-extra/trunk/clangd/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/CMakeLists.txt?rev=319557&r1=319556&r2=319557&view=diff
==============================================================================

--- clang-tools-extra/trunk/clangd/CMakeLists.txt (original)
+++ clang-tools-extra/trunk/clangd/CMakeLists.txt Fri Dec  1 09:08:02 2017
@@ -8,6 +8,7 @@ add_clang_library(clangDaemon
   ClangdUnit.cpp
   ClangdUnitStore.cpp
   DraftStore.cpp
+  FuzzyMatch.cpp
   GlobalCompilationDatabase.cpp
   JSONExpr.cpp
   JSONRPCDispatcher.cpp

Added: clang-tools-extra/trunk/clangd/FuzzyMatch.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/FuzzyMatch.cpp?rev=319557&view=auto
==============================================================================
--- clang-tools-extra/trunk/clangd/FuzzyMatch.cpp (added)
+++ clang-tools-extra/trunk/clangd/FuzzyMatch.cpp Fri Dec  1 09:08:02 2017
@@ -0,0 +1,373 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
+// we consider the possible partial match states:
+//
+//     u n i q u e _ p t r
+//   +---------------------
+//   |A . . . . . . . . . .
+//  u|
+//   |. . . . . . . . . . .
+//  _|
+//   |. . . . . . . O . . .
+//  p|
+//   |. . . . . . . . . . B
+//
+// Each dot represents some prefix of the pattern being matched against some
+// prefix of the word.
+//   - A is the initial state: '' matched against ''
+//   - O is an intermediate state: 'u_' matched against 'unique_'
+//   - B is the target state: 'u_p' matched against 'unique_ptr'
+//
+// We aim to find the best path from A->B.
+//  - Moving right (consuming a word character)
+//    Always legal: not all word characters must match.
+//  - Moving diagonally (consuming both a word and pattern character)
+//    Legal if the characters match.
+//  - Moving down (consuming a pattern character) is never legal.
+//    Never legal: all pattern characters must match something.
+//
+// The scoring is based on heuristics:
+//  - when matching a character, apply a bonus or penalty depending on the
+//    match quality (does case match, do word segments align, etc)
+//  - when skipping a character, apply a penalty if it hurts the match
+//    (it starts a word segment, or splits the matched region, etc)
+//
+// These heuristics require the ability to "look backward" one character, to
+// see whether it was matched or not. Therefore the dynamic-programming matrix
+// has an extra dimension (last character matched).
+// Each entry also has an additional flag indicating whether the last-but-one
+// character matched, which is needed to trace back through the scoring table
+// and reconstruct the match.
+//
+// We treat strings as byte-sequences, so only ASCII has first-class support.
+//
+// This algorithm was inspired by VS code's client-side filtering, and aims
+// to be mostly-compatible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+using namespace clang::clangd;
+
+const int FuzzyMatcher::MaxPat;
+const int FuzzyMatcher::MaxWord;
+
+static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
+// A "negative infinity" score that won't overflow.
+// We use this to mark unreachable states and forbidden solutions.
+// Score field is 15 bits wide, min value is -2^14, we use half of that.
+static constexpr int AwfulScore = -(1 << 13);
+static bool isAwful(int S) { return S < AwfulScore / 2; }
+static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
+
+FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
+    : PatN(std::min<int>(MaxPat, Pattern.size())), CaseSensitive(false),
+      ScoreScale(float{1} / (PerfectBonus * PatN)), WordN(0) {
+  memcpy(Pat, Pattern.data(), PatN);
+  for (int I = 0; I < PatN; ++I) {
+    LowPat[I] = lower(Pat[I]);
+    CaseSensitive |= LowPat[I] != Pat[I];
+  }
+  Scores[0][0][Miss] = {0, Miss};
+  Scores[0][0][Match] = {AwfulScore, Miss};
+  for (int P = 0; P <= PatN; ++P)
+    for (int W = 0; W < P; ++W)
+      for (Action A : {Miss, Match})
+        Scores[P][W][A] = {AwfulScore, Miss};
+  calculateRoles(Pat, PatRole, PatN);
+}
+
+Optional<float> FuzzyMatcher::match(StringRef Word) {
+  if (!PatN)
+    return 1;
+  if (!(WordContainsPattern = init(Word)))
+    return None;
+  buildGraph();
+  auto Best = std::max(Scores[PatN][WordN][Miss].Score,
+                       Scores[PatN][WordN][Match].Score);
+  if (isAwful(Best))
+    return None;
+  return ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
+}
+
+// Segmentation of words and patterns.
+// A name like "fooBar_baz" consists of several parts foo, bar, baz.
+// Aligning segmentation of word and pattern improves the fuzzy-match.
+// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
+//
+// First we classify each character into types (uppercase, lowercase, etc).
+// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
+
+// We only distinguish the types of characters that affect segmentation.
+// It's not obvious how to segment digits, we treat them as lowercase letters.
+// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
+// This means we require exact (case-sensitive) match.
+enum FuzzyMatcher::CharType : char {
+  Empty = 0,       // Before-the-start and after-the-end (and control chars).
+  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
+  Upper = 2,       // Uppercase letters.
+  Punctuation = 3, // ASCII punctuation (including Space)
+};
+
+// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
+// The top 6 bits of the char select the byte, the bottom 2 select the offset.
+// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
+constexpr static uint8_t CharTypes[] = {
+    0x00, 0x00, 0x00, 0x00, // Control characters
+    0x00, 0x00, 0x00, 0x00, // Control characters
+    0xff, 0xff, 0xff, 0xff, // Punctuation
+    0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
+    0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
+    0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
+    0x57, 0x55, 0x55, 0x55, // ` and a-o
+    0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+};
+
+// Each character's Role is the Head or Tail of a segment, or a Separator.
+// e.g. XMLHttpRequest_Async
+//      +--+---+------ +----
+//      ^Head   ^Tail ^Separator
+enum FuzzyMatcher::CharRole : char {
+  Unknown = 0,   // Stray control characters or impossible states.
+  Tail = 1,      // Part of a word segment, but not the first character.
+  Head = 2,      // The first character of a word segment.
+  Separator = 3, // Punctuation characters that separate word segments.
+};
+
+// The Role can be determined from the Type of a character and its neighbors:
+//
+//   Example  | Chars | Type | Role
+//   ---------+--------------+-----
+//   F(o)oBar | Foo   | Ull  | Tail
+//   Foo(B)ar | oBa   | lUl  | Head
+//   (f)oo    | ^fo   | Ell  | Head
+//   H(T)TP   | HTT   | UUU  | Tail
+//
+// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
+// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
+// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
+constexpr static uint8_t CharRoles[] = {
+    // clang-format off
+    //         Curr= Empty Lower Upper Separ
+    /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
+    /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
+    /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
+    /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
+    // clang-format on
+};
+
+template <typename T> static T packedLookup(const uint8_t *Data, int I) {
+  return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
+}
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+  // Types holds a sliding window of (Prev, Curr, Next) types.
+  // Initial value is (Empty, Empty, type of Text[0]).
+  int Types = packedLookup<CharType>(CharTypes, Text[0]);
+  // Rotate slides in the type of the next character.
+  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
+  for (int I = 0; I < N - 1; ++I) {
+    // For each character, rotate in the next, and look up the role.
+    Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
+    *Out++ = packedLookup<CharRole>(CharRoles, Types);
+  }
+  // For the last character, the "next character" is Empty.
+  Rotate(Empty);
+  *Out++ = packedLookup<CharRole>(CharRoles, Types);
+}
+
+// Sets up the data structures matching Word.
+// Returns false if we can cheaply determine that no match is possible.
+bool FuzzyMatcher::init(StringRef NewWord) {
+  WordN = std::min<int>(MaxWord, NewWord.size());
+  if (PatN > WordN)
+    return false;
+  memcpy(Word, NewWord.data(), WordN);
+  for (int I = 0; I < WordN; ++I)
+    LowWord[I] = lower(Word[I]);
+
+  // Cheap subsequence check.
+  for (int W = 0, P = 0; P != PatN; ++W) {
+    if (W == WordN)
+      return false;
+    if (LowWord[W] == LowPat[P])
+      ++P;
+  }
+
+  calculateRoles(Word, WordRole, WordN);
+  return true;
+}
+
+// The forwards pass finds the mappings of Pattern onto Word.
+// Score = best score achieved matching Word[..W] against Pat[..P].
+// Unlike other tables, indices range from 0 to N *inclusive*
+// Matched = whether we chose to match Word[W] with Pat[P] or not.
+//
+// Points are mostly assigned to matched characters, with 1 being a good score
+// and 3 being a great one. So we treat the score range as [0, 3 * PatN].
+// This range is not strict: we can apply larger bonuses/penalties, or penalize
+// non-matched characters.
+void FuzzyMatcher::buildGraph() {
+  for (int W = 0; W < WordN; ++W) {
+    Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
+                              Miss};
+    Scores[0][W + 1][Match] = {AwfulScore, Miss};
+  }
+  for (int P = 0; P < PatN; ++P) {
+    for (int W = P; W < WordN; ++W) {
+      auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
+
+      auto MatchMissScore = PreMiss[Match].Score;
+      auto MissMissScore = PreMiss[Miss].Score;
+      if (P < PatN - 1) { // Skipping trailing characters is always free.
+        MatchMissScore -= skipPenalty(W, Match);
+        MissMissScore -= skipPenalty(W, Miss);
+      }
+      Score[Miss] = (MatchMissScore > MissMissScore)
+                        ? ScoreInfo{MatchMissScore, Match}
+                        : ScoreInfo{MissMissScore, Miss};
+
+      if (LowPat[P] != LowWord[W]) { // No match possible.
+        Score[Match] = {AwfulScore, Miss};
+      } else {
+        auto &PreMatch = Scores[P][W];
+        auto MatchMatchScore = PreMatch[Match].Score + matchBonus(P, W, Match);
+        auto MissMatchScore = PreMatch[Miss].Score + matchBonus(P, W, Miss);
+        Score[Match] = (MatchMatchScore > MissMatchScore)
+                           ? ScoreInfo{MatchMatchScore, Match}
+                           : ScoreInfo{MissMatchScore, Miss};
+      }
+    }
+  }
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) {
+  int S = 0;
+  if (WordRole[W] == Head) // Skipping a segment.
+    S += 1;
+  if (Last == Match) // Non-consecutive match.
+    S += 2;          // We'd rather skip a segment than split our match.
+  return S;
+}
+
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+  assert(LowPat[P] == LowWord[W]);
+  int S = 1;
+  // Bonus: pattern so far is a (case-insensitive) prefix of the word.
+  if (P == W) // We can't skip pattern characters, so we must have matched all.
+    ++S;
+  // Bonus: case matches, or a Head in the pattern aligns with one in the word.
+  if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) ||
+      (PatRole[P] == Head && WordRole[W] == Head))
+    ++S;
+  // Penalty: matching inside a segment (and previous char wasn't matched).
+  if (WordRole[W] == Tail && P && Last == Miss)
+    S -= 3;
+  // Penalty: a Head in the pattern matches in the middle of a word segment.
+  if (PatRole[P] == Head && WordRole[W] == Tail)
+    --S;
+  // Penalty: matching the first pattern character in the middle of a segment.
+  if (P == 0 && WordRole[W] == Tail)
+    S -= 4;
+  assert(S <= PerfectBonus);
+  return S;
+}
+
+llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
+  llvm::SmallString<256> Result;
+  OS << "=== Match \"" << StringRef(Word, WordN) << "\" against ["
+     << StringRef(Pat, PatN) << "] ===\n";
+  if (!WordContainsPattern) {
+    OS << "Substring check failed.\n";
+    return Result;
+  } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
+                              Scores[PatN][WordN][Miss].Score))) {
+    OS << "Substring check passed, but all matches are forbidden\n";
+  }
+  if (!CaseSensitive)
+    OS << "Lowercase query, so scoring ignores case\n";
+
+  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
+  // The Score table has cumulative scores, subtracting along this path gives
+  // us the per-letter scores.
+  Action Last =
+      (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
+          ? Match
+          : Miss;
+  int S[MaxWord];
+  Action A[MaxWord];
+  for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
+    A[W] = Last;
+    const auto &Cell = Scores[P + 1][W + 1][Last];
+    if (Last == Match)
+      --P;
+    const auto &Prev = Scores[P + 1][W][Cell.Prev];
+    S[W] = Cell.Score - Prev.Score;
+    Last = Cell.Prev;
+  }
+  for (int I = 0; I < WordN; ++I) {
+    if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
+      Result.push_back('[');
+    if (A[I] == Miss && I > 0 && A[I - 1] == Match)
+      Result.push_back(']');
+    Result.push_back(Word[I]);
+  }
+  if (A[WordN - 1] == Match)
+    Result.push_back(']');
+
+  for (char C : StringRef(Word, WordN))
+    OS << " " << C << " ";
+  OS << "\n";
+  for (int I = 0, J = 0; I < WordN; I++)
+    OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
+  OS << "\n";
+  for (int I = 0; I < WordN; I++)
+    OS << format("%2d ", S[I]);
+  OS << "\n";
+
+  OS << "\nSegmentation:";
+  OS << "\n'" << StringRef(Word, WordN) << "'\n ";
+  for (int I = 0; I < WordN; ++I)
+    OS << "?-+ "[static_cast<int>(WordRole[I])];
+  OS << "\n[" << StringRef(Pat, PatN) << "]\n ";
+  for (int I = 0; I < PatN; ++I)
+    OS << "?-+ "[static_cast<int>(PatRole[I])];
+  OS << "\n";
+
+  OS << "\nScoring table (last-Miss, last-Match):\n";
+  OS << " |    ";
+  for (char C : StringRef(Word, WordN))
+    OS << "  " << C << " ";
+  OS << "\n";
+  OS << "-+----" << std::string(WordN * 4, '-') << "\n";
+  for (int I = 0; I <= PatN; ++I) {
+    for (Action A : {Miss, Match}) {
+      OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
+      for (int J = 0; J <= WordN; ++J) {
+        if (!isAwful(Scores[I][J][A].Score))
+          OS << format("%3d%c", Scores[I][J][A].Score,
+                       Scores[I][J][A].Prev == Match ? '*' : ' ');
+        else
+          OS << "    ";
+      }
+      OS << "\n";
+    }
+  }
+
+  return Result;
+}

Added: clang-tools-extra/trunk/clangd/FuzzyMatch.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/FuzzyMatch.h?rev=319557&view=auto
==============================================================================
--- clang-tools-extra/trunk/clangd/FuzzyMatch.h (added)
+++ clang-tools-extra/trunk/clangd/FuzzyMatch.h Fri Dec  1 09:08:02 2017
@@ -0,0 +1,84 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements fuzzy-matching of strings against identifiers.
+// It indicates both the existence and quality of a match:
+// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace clangd {
+
+// A matcher capable of matching and scoring strings against a single pattern.
+// It's optimized for matching against many strings - match() does not allocate.
+class FuzzyMatcher {
+public:
+  // Characters beyond MaxPat are ignored.
+  FuzzyMatcher(llvm::StringRef Pattern);
+
+  // If Word matches the pattern, return a score in [0,1] (higher is better).
+  // Characters beyond MaxWord are ignored.
+  llvm::Optional<float> match(llvm::StringRef Word);
+
+  // Dump internal state from the last match() to the stream, for debugging.
+  // Returns the pattern with [] around matched characters, e.g.
+  //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
+  llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;
+
+private:
+  // We truncate the pattern and the word to bound the cost of matching.
+  constexpr static int MaxPat = 63, MaxWord = 127;
+  enum CharRole : char; // For segmentation.
+  enum CharType : char; // For segmentation.
+  enum Action { Miss = 0, Match = 1 };
+
+  bool init(llvm::StringRef Word);
+  void buildGraph();
+  void calculateRoles(const char *Text, CharRole *Out, int N);
+  int skipPenalty(int W, Action Last);
+  int matchBonus(int P, int W, Action Last);
+
+  // Pattern data is initialized by the constructor, then constant.
+  char Pat[MaxPat];         // Pattern data
+  int PatN;                 // Length
+  char LowPat[MaxPat];      // Pattern in lowercase
+  CharRole PatRole[MaxPat]; // Pattern segmentation info
+  bool CaseSensitive;       // Case-sensitive match if pattern has uppercase
+  float ScoreScale;         // Normalizes scores for the pattern length.
+
+  // Word data is initialized on each call to match(), mostly by init().
+  char Word[MaxWord];         // Word data
+  int WordN;                  // Length
+  char LowWord[MaxWord];      // Word in lowercase
+  CharRole WordRole[MaxWord]; // Word segmentation info
+  bool WordContainsPattern;   // Simple substring check
+
+  // Cumulative best-match score table.
+  // Boundary conditions are filled in by the constructor.
+  // The rest is repopulated for each match(), by buildGraph().
+  struct ScoreInfo {
+    signed int Score : 15;
+    Action Prev : 1;
+  };
+  ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];
+};
+
+} // namespace clangd
+} // namespace clang
+
+#endif

Modified: clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt?rev=319557&r1=319556&r2=319557&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt (original)
+++ clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt Fri Dec  1 09:08:02 2017
@@ -10,6 +10,7 @@ include_directories(
 
 add_extra_unittest(ClangdTests
   ClangdTests.cpp
+  FuzzyMatchTests.cpp
   JSONExprTests.cpp
   TraceTests.cpp
   )

Added: clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp?rev=319557&view=auto
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp (added)
+++ clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp Fri Dec  1 09:08:02 2017
@@ -0,0 +1,252 @@
+//===-- FuzzyMatchTests.cpp - String fuzzy matcher tests --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+using namespace llvm;
+using testing::Not;
+
+struct ExpectedMatch {
+  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+    for (char C : "[]")
+      Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+  }
+  std::string Word;
+  StringRef Annotated;
+};
+raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
+  return OS << "'" << M.Word << "' as " << M.Annotated;
+}
+
+struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
+  ExpectedMatch Candidate;
+  MatchesMatcher(ExpectedMatch Candidate) : Candidate(std::move(Candidate)) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    raw_os_ostream(*OS) << "Matches " << Candidate;
+  }
+
+  bool MatchAndExplain(StringRef Pattern,
+                       testing::MatchResultListener *L) const override {
+    std::unique_ptr<raw_ostream> OS(
+        L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream()))
+                    : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    auto Result = Matcher.match(Candidate.Word);
+    auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
+    return Result && AnnotatedMatch == Candidate.Annotated;
+  }
+};
+
+// Accepts patterns that match a given word.
+// Dumps the debug tables on match failure.
+testing::Matcher<StringRef> matches(StringRef M) {
+  return testing::MakeMatcher<StringRef>(new MatchesMatcher(M));
+}
+
+TEST(FuzzyMatch, Matches) {
+  EXPECT_THAT("u_p", matches("[u]nique[_p]tr"));
+  EXPECT_THAT("up", matches("[u]nique_[p]tr"));
+  EXPECT_THAT("uq", matches("[u]ni[q]ue_ptr"));
+  EXPECT_THAT("qp", Not(matches("unique_ptr")));
+  EXPECT_THAT("log", Not(matches("SVGFEMorphologyElement")));
+
+  EXPECT_THAT("tit", matches("win.[tit]"));
+  EXPECT_THAT("title", matches("win.[title]"));
+  EXPECT_THAT("WordCla", matches("[Word]Character[Cla]ssifier"));
+  EXPECT_THAT("WordCCla", matches("[WordC]haracter[Cla]ssifier"));
+
+  EXPECT_THAT("dete", Not(matches("editor.quickSuggestionsDelay")));
+
+  EXPECT_THAT("highlight", matches("editorHover[Highlight]"));
+  EXPECT_THAT("hhighlight", matches("editor[H]over[Highlight]"));
+  EXPECT_THAT("dhhighlight", Not(matches("editorHoverHighlight")));
+
+  EXPECT_THAT("-moz", matches("[-moz]-foo"));
+  EXPECT_THAT("moz", matches("-[moz]-foo"));
+  EXPECT_THAT("moza", matches("-[moz]-[a]nimation"));
+
+  EXPECT_THAT("ab", matches("[ab]A"));
+  EXPECT_THAT("ccm", matches("[c]a[cm]elCase"));
+  EXPECT_THAT("bti", Not(matches("the_black_knight")));
+  EXPECT_THAT("ccm", Not(matches("camelCase")));
+  EXPECT_THAT("cmcm", Not(matches("camelCase")));
+  EXPECT_THAT("BK", matches("the_[b]lack_[k]night"));
+  EXPECT_THAT("KeyboardLayout=", Not(matches("KeyboardLayout")));
+  EXPECT_THAT("LLL", matches("SVisual[L]ogger[L]ogs[L]ist"));
+  EXPECT_THAT("LLLL", Not(matches("SVilLoLosLi")));
+  EXPECT_THAT("LLLL", Not(matches("SVisualLoggerLogsList")));
+  EXPECT_THAT("TEdit", matches("[T]ext[Edit]"));
+  EXPECT_THAT("TEdit", matches("[T]ext[Edit]or"));
+  EXPECT_THAT("TEdit", matches("[Te]xte[dit]"));
+  EXPECT_THAT("TEdit", matches("[t]ext_[edit]"));
+  EXPECT_THAT("TEditDit", matches("[T]ext[Edit]or[D]ecorat[i]on[T]ype"));
+  EXPECT_THAT("TEdit", matches("[T]ext[Edit]orDecorationType"));
+  EXPECT_THAT("Tedit", matches("[T]ext[Edit]"));
+  EXPECT_THAT("ba", Not(matches("?AB?")));
+  EXPECT_THAT("bkn", matches("the_[b]lack_[kn]ight"));
+  EXPECT_THAT("bt", matches("the_[b]lack_knigh[t]"));
+  EXPECT_THAT("ccm", matches("[c]amelCase[cm]"));
+  EXPECT_THAT("fdm", matches("[f]in[dM]odel"));
+  EXPECT_THAT("fob", matches("[fo]o[b]ar"));
+  EXPECT_THAT("fobz", Not(matches("foobar")));
+  EXPECT_THAT("foobar", matches("[foobar]"));
+  EXPECT_THAT("form", matches("editor.[form]atOnSave"));
+  EXPECT_THAT("g p", matches("[G]it:[ P]ull"));
+  EXPECT_THAT("g p", matches("[G]it:[ P]ull"));
+  EXPECT_THAT("gip", matches("[Gi]t: [P]ull"));
+  EXPECT_THAT("gip", matches("[Gi]t: [P]ull"));
+  EXPECT_THAT("gp", matches("[G]it: [P]ull"));
+  EXPECT_THAT("gp", matches("[G]it_Git_[P]ull"));
+  EXPECT_THAT("is", matches("[I]mport[S]tatement"));
+  EXPECT_THAT("is", matches("[is]Valid"));
+  EXPECT_THAT("lowrd", matches("[low]Wo[rd]"));
+  EXPECT_THAT("myvable", matches("[myva]ria[ble]"));
+  EXPECT_THAT("no", Not(matches("")));
+  EXPECT_THAT("no", Not(matches("match")));
+  EXPECT_THAT("ob", Not(matches("foobar")));
+  EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
+  EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
+  EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+  EXPECT_THAT("Three", matches("[Three]"));
+  EXPECT_THAT("fo", Not(matches("barfoo")));
+  EXPECT_THAT("fo", matches("bar_[fo]o"));
+  EXPECT_THAT("fo", matches("bar_[Fo]o"));
+  EXPECT_THAT("fo", matches("bar [fo]o"));
+  EXPECT_THAT("fo", matches("bar.[fo]o"));
+  EXPECT_THAT("fo", matches("bar/[fo]o"));
+  EXPECT_THAT("fo", matches("bar\\[fo]o"));
+
+  EXPECT_THAT(
+      "aaaaaa",
+      matches("[aaaaaa]aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+              "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
+  EXPECT_THAT("baba", Not(matches("ababababab")));
+  EXPECT_THAT("fsfsfs", Not(matches("dsafdsafdsafdsafdsafdsafdsafasdfdsa")));
+  EXPECT_THAT("fsfsfsfsfsfsfsf",
+              Not(matches("dsafdsafdsafdsafdsafdsafdsafasdfdsafdsafdsafdsafdsfd"
+                          "safdsfdfdfasdnfdsajfndsjnafjndsajlknfdsa")));
+
+  EXPECT_THAT("  g", matches("[  g]roup"));
+  EXPECT_THAT("g", matches("  [g]roup"));
+  EXPECT_THAT("g g", Not(matches("  groupGroup")));
+  EXPECT_THAT("g g", matches("  [g]roup[ G]roup"));
+  EXPECT_THAT(" g g", matches("[ ] [g]roup[ G]roup"));
+  EXPECT_THAT("zz", matches("[zz]Group"));
+  EXPECT_THAT("zzg", matches("[zzG]roup"));
+  EXPECT_THAT("g", matches("zz[G]roup"));
+
+  EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive.
+  EXPECT_THAT("printf", matches("s[printf]"));
+  EXPECT_THAT("str", matches("o[str]eam"));
+}
+
+struct RankMatcher : public testing::MatcherInterface<StringRef> {
+  std::vector<ExpectedMatch> RankedStrings;
+  RankMatcher(std::initializer_list<ExpectedMatch> RankedStrings)
+      : RankedStrings(RankedStrings) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    raw_os_ostream O(*OS);
+    O << "Ranks strings in order: [";
+    for (const auto &Str : RankedStrings)
+      O << "\n\t" << Str;
+    O << "\n]";
+  }
+
+  bool MatchAndExplain(StringRef Pattern,
+                       testing::MatchResultListener *L) const override {
+    std::unique_ptr<raw_ostream> OS(
+        L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream()))
+                    : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    const ExpectedMatch *LastMatch;
+    Optional<float> LastScore;
+    bool Ok = true;
+    for (const auto &Str : RankedStrings) {
+      auto Score = Matcher.match(Str.Word);
+      if (!Score) {
+        *OS << "\nDoesn't match '" << Str.Word << "'";
+        Matcher.dumpLast(*OS << "\n");
+        Ok = false;
+      } else {
+        std::string Buf;
+        llvm::raw_string_ostream Info(Buf);
+        auto AnnotatedMatch = Matcher.dumpLast(Info);
+
+        if (AnnotatedMatch != Str.Annotated) {
+          *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch
+              << " instead of " << Str.Annotated << "\n"
+              << Info.str();
+          Ok = false;
+        } else if (LastScore && *LastScore < *Score) {
+          *OS << "\nRanks '" << Str.Word << "'=" << *Score << " above '"
+              << LastMatch->Word << "'=" << *LastScore << "\n"
+              << Info.str();
+          Matcher.match(LastMatch->Word);
+          Matcher.dumpLast(*OS << "\n");
+          Ok = false;
+        }
+      }
+      LastMatch = &Str;
+      LastScore = Score;
+    }
+    return Ok;
+  }
+};
+
+// Accepts patterns that match all the strings and rank them in the given order.
+// Dumps the debug tables on match failure.
+template <typename... T> testing::Matcher<StringRef> ranks(T... RankedStrings) {
+  return testing::MakeMatcher<StringRef>(
+      new RankMatcher{ExpectedMatch(RankedStrings)...});
+}
+
+TEST(FuzzyMatch, Ranking) {
+  EXPECT_THAT("eb", ranks("[e]mplace_[b]ack", "[e]m[b]ed"));
+  EXPECT_THAT("cons",
+              ranks("[cons]ole", "[Cons]ole", "ArrayBuffer[Cons]tructor"));
+  EXPECT_THAT("foo", ranks("[foo]", "[Foo]"));
+  EXPECT_THAT("onMess",
+              ranks("[onMess]age", "[onmess]age", "[on]This[M]ega[Es]cape[s]"));
+  EXPECT_THAT("CC", ranks("[C]amel[C]ase", "[c]amel[C]ase"));
+  EXPECT_THAT("cC", ranks("[c]amel[C]ase", "[C]amel[C]ase"));
+  EXPECT_THAT("p", ranks("[p]arse", "[p]osix", "[p]afdsa", "[p]ath", "[p]"));
+  EXPECT_THAT("pa", ranks("[pa]rse", "[pa]th", "[pa]fdsa"));
+  EXPECT_THAT("log", ranks("[log]", "Scroll[Log]icalPosition"));
+  EXPECT_THAT("e", ranks("[e]lse", "Abstract[E]lement"));
+  EXPECT_THAT("workbench.sideb",
+              ranks("[workbench.sideB]ar.location",
+                    "[workbench.]editor.default[SideB]ySideLayout"));
+  EXPECT_THAT("editor.r", ranks("[editor.r]enderControlCharacter",
+                                "[editor.]overview[R]ulerlanes",
+                                "diff[Editor.r]enderSideBySide"));
+  EXPECT_THAT("-mo", ranks("[-mo]z-columns", "[-]ms-ime-[mo]de"));
+  EXPECT_THAT("convertModelPosition",
+              ranks("[convertModelPosition]ToViewPosition",
+                    "[convert]ViewTo[ModelPosition]"));
+  EXPECT_THAT("is", ranks("[is]ValidViewletId", "[i]mport [s]tatement"));
+  EXPECT_THAT("title", ranks("window.[title]",
+                             "files.[t]r[i]m[T]rai[l]ingWhit[e]space"));
+  EXPECT_THAT("strcpy", ranks("[strcpy]", "[strcpy]_s", "[str]n[cpy]"));
+  EXPECT_THAT("close", ranks("workbench.quickOpen.[close]OnFocusOut",
+                             "[c]ss.[l]int.imp[o]rt[S]tat[e]ment",
+                             "[c]ss.co[lo]rDecorator[s].[e]nable"));
+}
+
+} // namespace
+} // namespace clangd
+} // namespace clang