[clang-tools-extra] r337901 - [clangd] Introduce Dex symbol index search tokens
Kirill Bobyrev via cfe-commits
cfe-commits at lists.llvm.org
Wed Jul 25 03:34:57 PDT 2018
Author: omtcyfz
Date: Wed Jul 25 03:34:57 2018
New Revision: 337901
URL: http://llvm.org/viewvc/llvm-project?rev=337901&view=rev
Log:
[clangd] Introduce Dex symbol index search tokens
This patch introduces the core building block of the next-generation
Clangd symbol index - Dex. Search tokens are the keys in the inverted
index and represent a characteristic of a specific symbol: examples of
search token types (Token Namespaces) are
* Trigrams - these are essential for unqualified symbol name fuzzy
search * Scopes for filtering the symbols by the namespace * Paths, e.g.
these can be used to uprank symbols defined close to the edited file
This patch outlines the generic for such token namespaces, but only
implements trigram generation.
The intuition behind trigram generation algorithm is that each extracted
trigram is a valid sequence for Fuzzy Matcher jumps, proposed
implementation utilize existing FuzzyMatcher API for segmentation and
trigram extraction.
However, trigrams generation algorithm for the query string is different
from the previous one: it simply yields sequences of 3 consecutive
lowercased valid characters (letters, digits).
Dex RFC in the mailing list:
http://lists.llvm.org/pipermail/clangd-dev/2018-July/000022.html
The trigram generation techniques are described in detail in the
proposal:
https://docs.google.com/document/d/1C-A6PGT6TynyaX4PXyExNMiGmJ2jL1UwV91Kyx11gOI/edit#heading=h.903u1zon9nkj
Reviewers: sammccall, ioeric, ilya-biryukovA
Subscribers: cfe-commits, klimek, mgorny, MaskRay, jkorous, arphaman
Differential Revision: https://reviews.llvm.org/D49591
Added:
clang-tools-extra/trunk/clangd/index/dex/
clang-tools-extra/trunk/clangd/index/dex/Token.h
clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp
clang-tools-extra/trunk/clangd/index/dex/Trigram.h
clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp
Modified:
clang-tools-extra/trunk/clangd/CMakeLists.txt
clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt
Modified: clang-tools-extra/trunk/clangd/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/CMakeLists.txt?rev=337901&r1=337900&r2=337901&view=diff
==============================================================================
--- clang-tools-extra/trunk/clangd/CMakeLists.txt (original)
+++ clang-tools-extra/trunk/clangd/CMakeLists.txt Wed Jul 25 03:34:57 2018
@@ -34,6 +34,7 @@ add_clang_library(clangDaemon
TUScheduler.cpp
URI.cpp
XRefs.cpp
+
index/CanonicalIncludes.cpp
index/FileIndex.cpp
index/Index.cpp
@@ -42,6 +43,8 @@ add_clang_library(clangDaemon
index/SymbolCollector.cpp
index/SymbolYAML.cpp
+ index/dex/Trigram.cpp
+
LINK_LIBS
clangAST
clangASTMatchers
Added: clang-tools-extra/trunk/clangd/index/dex/Token.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Token.h?rev=337901&view=auto
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Token.h (added)
+++ clang-tools-extra/trunk/clangd/index/dex/Token.h Wed Jul 25 03:34:57 2018
@@ -0,0 +1,112 @@
+//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Token objects represent a characteristic of a symbol, which can be used to
+// perform efficient search. Tokens are keys for inverted index which are mapped
+// to the corresponding posting lists.
+//
+// The symbol std::cout might have the tokens:
+// * Scope "std::"
+// * Trigram "cou"
+// * Trigram "out"
+// * Type "std::ostream"
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// A Token represents an attribute of a symbol, such as a particular trigram
+/// present in the name (used for fuzzy search).
+///
+/// Tokens can be used to perform more sophisticated search queries by
+/// constructing complex iterator trees.
+struct Token {
+ /// Kind specifies Token type which defines semantics for the internal
+ /// representation. Each Kind has different representation stored in Data
+ /// field.
+ enum class Kind {
+ /// Represents trigram used for fuzzy search of unqualified symbol names.
+ ///
+ /// Data contains 3 bytes with trigram contents.
+ Trigram,
+ /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
+ ///
+ /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+ /// scope).
+ Scope,
+ /// Internal Token type for invalid/special tokens, e.g. empty tokens for
+ /// llvm::DenseMap.
+ Sentinel,
+ /// FIXME(kbobyrev): Add other Token Kinds
+ /// * Path with full or relative path to the directory in which symbol is
+ /// defined
+ /// * Type with qualified type name or its USR
+ };
+
+ Token(Kind TokenKind, llvm::StringRef Data)
+ : Data(Data), TokenKind(TokenKind) {}
+
+ bool operator==(const Token &Other) const {
+ return TokenKind == Other.TokenKind && Data == Other.Data;
+ }
+
+ /// Representation which is unique among Token with the same Kind.
+ std::string Data;
+ Kind TokenKind;
+
+ friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+ return OS << T.Data;
+ }
+
+private:
+ friend llvm::hash_code hash_value(const Token &Token) {
+ return llvm::hash_combine(static_cast<int>(Token.TokenKind), Token.Data);
+ }
+};
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+namespace llvm {
+
+// Support Tokens as DenseMap keys.
+template <> struct DenseMapInfo<clang::clangd::dex::Token> {
+ static inline clang::clangd::dex::Token getEmptyKey() {
+ return {clang::clangd::dex::Token::Kind::Sentinel, "EmptyKey"};
+ }
+
+ static inline clang::clangd::dex::Token getTombstoneKey() {
+ return {clang::clangd::dex::Token::Kind::Sentinel, "TombstoneKey"};
+ }
+
+ static unsigned getHashValue(const clang::clangd::dex::Token &Tag) {
+ return hash_value(Tag);
+ }
+
+ static bool isEqual(const clang::clangd::dex::Token &LHS,
+ const clang::clangd::dex::Token &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // namespace llvm
+
+#endif
Added: clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp?rev=337901&view=auto
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp (added)
+++ clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp Wed Jul 25 03:34:57 2018
@@ -0,0 +1,132 @@
+//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Trigram.h"
+#include "../../FuzzyMatch.h"
+#include "Token.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+
+#include <cctype>
+#include <queue>
+#include <string>
+
+using namespace llvm;
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
+// be generating unigrams and bigrams here, too. This would prevent symbol index
+// from applying fuzzy matching on a tremendous number of symbols and allow
+// supplementary retrieval for short queries.
+//
+// Short names (total segment length <3 characters) are currently ignored.
+std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
+ // Apply fuzzy matching text segmentation.
+ std::vector<CharRole> Roles(Identifier.size());
+ calculateRoles(Identifier,
+ llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
+
+ std::string LowercaseIdentifier = Identifier.lower();
+
+ // For each character, store indices of the characters to which fuzzy matching
+ // algorithm can jump. There are 3 possible variants:
+ //
+ // * Next Tail - next character from the same segment
+ // * Next Head - front character of the next segment
+ // * Skip-1-Next Head - front character of the skip-1-next segment
+ //
+ // Next stores tuples of three indices in the presented order, if a variant is
+ // not available then 0 is stored.
+ std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
+ unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
+ for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
+ Next[I] = {{NextTail, NextHead, NextNextHead}};
+ NextTail = Roles[I] == Tail ? I : 0;
+ if (Roles[I] == Head) {
+ NextNextHead = NextHead;
+ NextHead = I;
+ }
+ }
+
+ DenseSet<Token> UniqueTrigrams;
+ std::array<char, 4> Chars;
+ for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
+ // Skip delimiters.
+ if (Roles[I] != Head && Roles[I] != Tail)
+ continue;
+ for (const unsigned J : Next[I]) {
+ if (!J)
+ continue;
+ for (const unsigned K : Next[J]) {
+ if (!K)
+ continue;
+ Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J],
+ LowercaseIdentifier[K], 0}};
+ auto Trigram = Token(Token::Kind::Trigram, Chars.data());
+ // Push unique trigrams to the result.
+ if (!UniqueTrigrams.count(Trigram)) {
+ UniqueTrigrams.insert(Trigram);
+ }
+ }
+ }
+ }
+
+ std::vector<Token> Result;
+ for (const auto &Trigram : UniqueTrigrams)
+ Result.push_back(Trigram);
+
+ return Result;
+}
+
+// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
+// inputs (total segment length <3 characters).
+std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
+ // Apply fuzzy matching text segmentation.
+ std::vector<CharRole> Roles(Query.size());
+ calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
+
+ std::string LowercaseQuery = Query.lower();
+
+ DenseSet<Token> UniqueTrigrams;
+ std::deque<char> Chars;
+
+ for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
+ // If current symbol is delimiter, just skip it.
+ if (Roles[I] != Head && Roles[I] != Tail)
+ continue;
+
+ Chars.push_back(LowercaseQuery[I]);
+
+ if (Chars.size() > 3)
+ Chars.pop_front();
+ if (Chars.size() == 3) {
+ auto Trigram =
+ Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+ // Push unique trigrams to the result.
+ if (!UniqueTrigrams.count(Trigram)) {
+ UniqueTrigrams.insert(Trigram);
+ }
+ }
+ }
+
+ std::vector<Token> Result;
+ for (const auto &Trigram : UniqueTrigrams)
+ Result.push_back(Trigram);
+
+ return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Added: clang-tools-extra/trunk/clangd/index/dex/Trigram.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Trigram.h?rev=337901&view=auto
==============================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Trigram.h (added)
+++ clang-tools-extra/trunk/clangd/index/dex/Trigram.h Wed Jul 25 03:34:57 2018
@@ -0,0 +1,62 @@
+//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Trigrams are attributes of the symbol unqualified name used to effectively
+// extract symbols which can be fuzzy-matched given user query from the inverted
+// index. To match query with the extracted set of trigrams Q, the set of
+// generated trigrams T for identifier (unqualified symbol name) should contain
+// all items of Q, i.e. Q â T.
+//
+// Trigram sets extracted from unqualified name and from query are different:
+// the set of query trigrams only contains consecutive sequences of three
+// characters (which is only a subset of all trigrams generated for an
+// identifier).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+
+#include "Token.h"
+
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+///
+/// First, given Identifier (unqualified symbol name) is segmented using
+/// FuzzyMatch API and lowercased. After segmentation, the following technique
+/// is applied for generating trigrams: for each letter or digit in the input
+/// string the algorithms looks for the possible next and skip-1-next symbols
+/// which can be jumped to during fuzzy matching. Each combination of such three
+/// symbols is inserted into the result.
+///
+/// Trigrams can start at any character in the input. Then we can choose to move
+/// to the next character, move to the start of the next segment, or skip over a
+/// segment.
+///
+/// Note: the returned list of trigrams does not have duplicates, if any trigram
+/// belongs to more than one class it is only inserted once.
+std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
+
+/// Returns list of unique fuzzy-search trigrams given a query.
+///
+/// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then,
+/// the simplest trigrams - sequences of three consecutive letters and digits
+/// are extracted and returned after deduplication.
+std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+#endif
Modified: clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt?rev=337901&r1=337900&r2=337901&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt (original)
+++ clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt Wed Jul 25 03:34:57 2018
@@ -15,9 +15,10 @@ add_extra_unittest(ClangdTests
CodeCompleteTests.cpp
CodeCompletionStringsTests.cpp
ContextTests.cpp
+ DexIndexTests.cpp
DraftStoreTests.cpp
- FileIndexTests.cpp
FileDistanceTests.cpp
+ FileIndexTests.cpp
FindSymbolsTests.cpp
FuzzyMatchTests.cpp
GlobalCompilationDatabaseTests.cpp
@@ -27,11 +28,11 @@ add_extra_unittest(ClangdTests
SourceCodeTests.cpp
SymbolCollectorTests.cpp
SyncAPI.cpp
+ TUSchedulerTests.cpp
TestFS.cpp
TestTU.cpp
ThreadingTests.cpp
TraceTests.cpp
- TUSchedulerTests.cpp
URITests.cpp
XRefsTests.cpp
)
Added: clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp?rev=337901&view=auto
==============================================================================
--- clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp (added)
+++ clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp Wed Jul 25 03:34:57 2018
@@ -0,0 +1,96 @@
+//===-- DexIndexTests.cpp ----------------------------*- C++ -*-----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "index/dex/Token.h"
+#include "index/dex/Trigram.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+ std::vector<Token> Tokens;
+ for (const auto &Symbols : Trigrams) {
+ Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+ }
+ return testing::UnorderedElementsAreArray(Tokens);
+}
+
+TEST(DexIndexTrigrams, IdentifierTrigrams) {
+ EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("clangd"),
+ trigramsAre({"cla", "lan", "ang", "ngd"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
+ trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
+
+ EXPECT_THAT(
+ generateIdentifierTrigrams("a_b_c_d_e_"),
+ trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+
+ EXPECT_THAT(
+ generateIdentifierTrigrams("unique_ptr"),
+ trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
+ "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
+ trigramsAre({"tud", "tde", "ude", "dec", "ecl"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
+ trigramsAre({"iso", "iok", "sok"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"),
+ trigramsAre({
+ "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
+ "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
+ "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
+ "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
+ "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
+ }));
+}
+
+TEST(DexIndexTrigrams, QueryTrigrams) {
+ EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
+
+ EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({}));
+
+ EXPECT_THAT(generateQueryTrigrams("clangd"),
+ trigramsAre({"cla", "lan", "ang", "ngd"}));
+
+ EXPECT_THAT(generateQueryTrigrams("abc_def"),
+ trigramsAre({"abc", "bcd", "cde", "def"}));
+
+ EXPECT_THAT(generateQueryTrigrams("a_b_c_d_e_"),
+ trigramsAre({"abc", "bcd", "cde"}));
+
+ EXPECT_THAT(generateQueryTrigrams("unique_ptr"),
+ trigramsAre({"uni", "niq", "iqu", "que", "uep", "ept", "ptr"}));
+
+ EXPECT_THAT(generateQueryTrigrams("TUDecl"),
+ trigramsAre({"tud", "ude", "dec", "ecl"}));
+
+ EXPECT_THAT(generateQueryTrigrams("IsOK"), trigramsAre({"iso", "sok"}));
+
+ EXPECT_THAT(generateQueryTrigrams("abc_defGhij__klm"),
+ trigramsAre({"abc", "bcd", "cde", "def", "efg", "fgh", "ghi",
+ "hij", "ijk", "jkl", "klm"}));
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
More information about the cfe-commits
mailing list