[clang-tools-extra] r297630 - [include-fixer] Add fuzzy SymbolIndex, where identifier needn't match exactly.
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 13 08:56:00 PDT 2017
Author: sammccall
Date: Mon Mar 13 10:55:59 2017
New Revision: 297630
URL: http://llvm.org/viewvc/llvm-project?rev=297630&view=rev
Log:
[include-fixer] Add fuzzy SymbolIndex, where identifier needn't match exactly.
Summary:
Add fuzzy SymbolIndex, where identifier needn't match exactly.
The purpose for this is global autocomplete in clangd. The query will be a
partial identifier up to the cursor, and the results will be suggestions.
It's in include-fixer because:
- it handles SymbolInfos, actually SymbolIndex is exactly the right interface
- it's a good harness for lit testing the fuzzy YAML index
- (Laziness: we can't unit test clangd until reorganizing with a tool/ dir)
Other questionable choices:
- FuzzySymbolIndex, which just refines the contract of SymbolIndex. This is
an interface to allow extension to large monorepos (*cough*)
- an always-true safety check that Identifier == Name is removed from
SymbolIndexManager, as it's not true for fuzzy matching
- exposing -db=fuzzyYaml from include-fixer is not a very useful feature, and
a non-orthogonal ui (fuzziness vs data source). -db=fixed is similar though.
Reviewers: bkramer
Subscribers: cfe-commits, mgorny
Differential Revision: https://reviews.llvm.org/D30720
Added:
clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.cpp
clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.h
clang-tools-extra/trunk/test/include-fixer/yaml_fuzzy.cpp
clang-tools-extra/trunk/unittests/include-fixer/FuzzySymbolIndexTests.cpp
Modified:
clang-tools-extra/trunk/include-fixer/CMakeLists.txt
clang-tools-extra/trunk/include-fixer/SymbolIndexManager.cpp
clang-tools-extra/trunk/include-fixer/tool/ClangIncludeFixer.cpp
clang-tools-extra/trunk/test/include-fixer/Inputs/fake_yaml_db.yaml
clang-tools-extra/trunk/unittests/include-fixer/CMakeLists.txt
Modified: clang-tools-extra/trunk/include-fixer/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/include-fixer/CMakeLists.txt?rev=297630&r1=297629&r2=297630&view=diff
==============================================================================
--- clang-tools-extra/trunk/include-fixer/CMakeLists.txt (original)
+++ clang-tools-extra/trunk/include-fixer/CMakeLists.txt Mon Mar 13 10:55:59 2017
@@ -6,6 +6,7 @@ add_clang_library(clangIncludeFixer
IncludeFixer.cpp
IncludeFixerContext.cpp
InMemorySymbolIndex.cpp
+ FuzzySymbolIndex.cpp
SymbolIndexManager.cpp
YamlSymbolIndex.cpp
Added: clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.cpp?rev=297630&view=auto
==============================================================================
--- clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.cpp (added)
+++ clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.cpp Mon Mar 13 10:55:59 2017
@@ -0,0 +1,143 @@
+//===--- FuzzySymbolIndex.cpp - Lookup symbols for autocomplete -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "FuzzySymbolIndex.h"
+#include "llvm/Support/Regex.h"
+
+using clang::find_all_symbols::SymbolAndSignals;
+using llvm::StringRef;
+
+namespace clang {
+namespace include_fixer {
+namespace {
+
+// In-memory fuzzy index. Every symbol is stored next to the space-joined
+// token string of its name; a query is answered by a linear regex scan over
+// those token strings.
+class MemSymbolIndex : public FuzzySymbolIndex {
+  using Entry = std::pair<llvm::SmallString<32>, SymbolAndSignals>;
+
+public:
+  // Tokenizes each symbol name once, up front, so search() only has to match.
+  MemSymbolIndex(std::vector<SymbolAndSignals> Symbols) {
+    for (auto &Sym : Symbols) {
+      std::vector<std::string> Parts = tokenize(Sym.Symbol.getName());
+      std::string Joined = llvm::join(Parts.begin(), Parts.end(), " ");
+      this->Symbols.emplace_back(StringRef(Joined), std::move(Sym));
+    }
+  }
+
+  // Returns a copy of every stored symbol whose token string matches Query.
+  std::vector<SymbolAndSignals> search(StringRef Query) override {
+    // queryRegexp() produces an unanchored pattern; pin it to the start of
+    // the token string here.
+    llvm::Regex Matcher("^" + queryRegexp(tokenize(Query)));
+    std::vector<SymbolAndSignals> Hits;
+    for (const Entry &Candidate : Symbols) {
+      if (!Matcher.match(Candidate.first))
+        continue;
+      Hits.push_back(Candidate.second);
+    }
+    return Hits;
+  }
+
+private:
+  std::vector<Entry> Symbols;
+};
+
+// Helpers for tokenize state machine.
+// TokenizeState records what kind of token, if any, is currently pending
+// (read but not yet flushed into the result).
+enum TokenizeState {
+ EMPTY, // No pending characters.
+ ONE_BIG, // Read one uppercase letter, could be WORD or Word.
+ BIG_WORD, // Reading an uppercase WORD.
+ SMALL_WORD, // Reading a lowercase word (may have a leading capital: Word).
+ NUMBER // Reading a number.
+};
+
+// Coarse character classes that drive the tokenizer's state transitions.
+// MISC covers everything that is neither a letter nor a digit; the tokenizer
+// treats such characters as separators.
+enum CharType { UPPER, LOWER, DIGIT, MISC };
+
+// Classifies one character of a symbol name or query.
+CharType classify(char c) {
+  // The <cctype> predicates have undefined behavior for negative arguments
+  // (other than EOF). Plain char may be signed, so bytes >= 0x80 must be
+  // widened through unsigned char before being passed in.
+  const unsigned char UC = static_cast<unsigned char>(c);
+  if (isupper(UC))
+    return UPPER;
+  if (islower(UC))
+    return LOWER;
+  if (isdigit(UC))
+    return DIGIT;
+  return MISC;
+}
+
+} // namespace
+
+// Splits Text into lowercased tokens.
+// Token boundaries fall at lower->UPPER transitions, before the last capital
+// of an uppercase run that is followed by lowercase (FOOBar -> foo, bar), at
+// the start of a digit run, and at any MISC character. MISC characters are
+// dropped and never appear in a token.
+std::vector<std::string> FuzzySymbolIndex::tokenize(StringRef Text) {
+ std::vector<std::string> Result;
+ // State describes the treatment of text from Start to I.
+ // Once text is Flush()ed into Result, we're done with it and advance Start.
+ TokenizeState State = EMPTY;
+ size_t Start = 0;
+ // Emits the pending token [Start, End), if there is one, and restarts at End.
+ auto Flush = [&](size_t End) {
+ if (State != EMPTY) {
+ Result.push_back(Text.substr(Start, End - Start).lower());
+ State = EMPTY;
+ }
+ Start = End;
+ };
+ for (size_t I = 0; I < Text.size(); ++I) {
+ CharType Type = classify(Text[I]);
+ if (Type == MISC)
+ Flush(I); // Separator: end the pending token; the next Flush skips this char.
+ else if (Type == LOWER)
+ switch (State) {
+ case BIG_WORD:
+ Flush(I - 1); // FOOBar: first token is FOO, not FOOB.
+ LLVM_FALLTHROUGH;
+ case ONE_BIG: // The pending capital starts a Word.
+ State = SMALL_WORD;
+ LLVM_FALLTHROUGH;
+ case SMALL_WORD: // Keep extending the current word.
+ break;
+ default: // EMPTY or NUMBER: a new lowercase word starts here.
+ Flush(I);
+ State = SMALL_WORD;
+ }
+ else if (Type == UPPER)
+ switch (State) {
+ case ONE_BIG: // Second capital in a row: this is an uppercase WORD.
+ State = BIG_WORD;
+ LLVM_FALLTHROUGH;
+ case BIG_WORD: // Keep extending the uppercase run.
+ break;
+ default: // EMPTY, SMALL_WORD or NUMBER: a capital starts a new token.
+ Flush(I);
+ State = ONE_BIG;
+ }
+ else if (Type == DIGIT && State != NUMBER) { // A digit inside a NUMBER just extends it.
+ Flush(I);
+ State = NUMBER;
+ }
+ }
+ Flush(Text.size()); // Emit whatever is still pending at end of input.
+ return Result;
+}
+
+// Builds the (unanchored) regexp for a tokenized query.
+// Consecutive query tokens must begin consecutive symbol tokens, so they are
+// joined by "[[:alnum:]]* " (rest of the current symbol token, then a space).
+// Between two characters of one query token, the match may optionally jump
+// to the start of the next symbol token.
+std::string
+FuzzySymbolIndex::queryRegexp(const std::vector<std::string> &Tokens) {
+  std::string Pattern;
+  bool FirstToken = true;
+  for (const std::string &Token : Tokens) {
+    if (!FirstToken)
+      Pattern += "[[:alnum:]]* ";
+    FirstToken = false;
+
+    bool FirstChar = true;
+    for (char C : Token) {
+      if (!FirstChar)
+        Pattern += "([[:alnum:]]* )?";
+      FirstChar = false;
+      Pattern += C;
+    }
+  }
+  return Pattern;
+}
+
+// Reads the YAML symbol database at FilePath and wraps it in an in-memory
+// fuzzy index. A failure to read the file is returned as an llvm::Error.
+llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
+FuzzySymbolIndex::createFromYAML(StringRef FilePath) {
+  auto BufferOrErr = llvm::MemoryBuffer::getFile(FilePath);
+  if (!BufferOrErr)
+    return llvm::errorCodeToError(BufferOrErr.getError());
+  return llvm::make_unique<MemSymbolIndex>(
+      find_all_symbols::ReadSymbolInfosFromYAML(
+          (*BufferOrErr)->getBuffer()));
+}
+
+} // namespace include_fixer
+} // namespace clang
Added: clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.h
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.h?rev=297630&view=auto
==============================================================================
--- clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.h (added)
+++ clang-tools-extra/trunk/include-fixer/FuzzySymbolIndex.h Mon Mar 13 10:55:59 2017
@@ -0,0 +1,55 @@
+//===--- FuzzySymbolIndex.h - Lookup symbols for autocomplete ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
+#define LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
+
+#include "SymbolIndex.h"
+#include "find-all-symbols/SymbolInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace include_fixer {
+
+// A FuzzySymbolIndex retrieves top-level symbols matching a query string.
+//
+// It refines the contract of SymbolIndex::search to do fuzzy matching:
+// - symbol names are tokenized: "unique ptr", "string ref".
+// - query must match prefixes of symbol tokens: [upt]
+// - if the query has multiple tokens, splits must match: [StR], not [STr].
+// Helpers for tokenization and regex matching are provided.
+//
+// Implementations may choose to truncate results, refuse short queries, etc.
+class FuzzySymbolIndex : public SymbolIndex {
+public:
+ // Loads the specified include-fixer database and returns an index serving it.
+ // Returns an error if the file cannot be read.
+ static llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
+ createFromYAML(llvm::StringRef File);
+
+ // Helpers for implementing indexes:
+
+ // Transforms a symbol name or query into a sequence of lowercased tokens.
+ // Characters that are neither letters nor digits only separate tokens.
+ // - URLHandlerCallback --> [url, handler, callback]
+ // - snake_case11 --> [snake, case, 11]
+ // - _WTF$ --> [wtf]
+ static std::vector<std::string> tokenize(llvm::StringRef Text);
+
+ // Transforms query tokens into an unanchored regexp to match symbol tokens.
+ // The regexp is meant to be run against symbol tokens joined by single
+ // spaces; callers that want a match from the first token prepend "^".
+ // - [fe f] --> /f(\w* )?e\w* f/, matches [fee fie foe].
+ // (\w is shorthand in this example; the generated pattern uses [[:alnum:]].)
+ static std::string queryRegexp(const std::vector<std::string> &Tokens);
+};
+
+} // namespace include_fixer
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
Modified: clang-tools-extra/trunk/include-fixer/SymbolIndexManager.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/include-fixer/SymbolIndexManager.cpp?rev=297630&r1=297629&r2=297630&view=diff
==============================================================================
--- clang-tools-extra/trunk/include-fixer/SymbolIndexManager.cpp (original)
+++ clang-tools-extra/trunk/include-fixer/SymbolIndexManager.cpp Mon Mar 13 10:55:59 2017
@@ -103,46 +103,44 @@ SymbolIndexManager::search(llvm::StringR
for (auto &SymAndSig : Symbols) {
const SymbolInfo &Symbol = SymAndSig.Symbol;
// Match the identifier name without qualifier.
- if (Symbol.getName() == Names.back()) {
- bool IsMatched = true;
- auto SymbolContext = Symbol.getContexts().begin();
- auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name.
- // Match the remaining context names.
- while (IdentiferContext != Names.rend() &&
- SymbolContext != Symbol.getContexts().end()) {
- if (SymbolContext->second == *IdentiferContext) {
- ++IdentiferContext;
- ++SymbolContext;
- } else if (SymbolContext->first ==
- find_all_symbols::SymbolInfo::ContextType::EnumDecl) {
- // Skip non-scoped enum context.
- ++SymbolContext;
- } else {
- IsMatched = false;
- break;
- }
+ bool IsMatched = true;
+ auto SymbolContext = Symbol.getContexts().begin();
+ auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name.
+ // Match the remaining context names.
+ while (IdentiferContext != Names.rend() &&
+ SymbolContext != Symbol.getContexts().end()) {
+ if (SymbolContext->second == *IdentiferContext) {
+ ++IdentiferContext;
+ ++SymbolContext;
+ } else if (SymbolContext->first ==
+ find_all_symbols::SymbolInfo::ContextType::EnumDecl) {
+ // Skip non-scoped enum context.
+ ++SymbolContext;
+ } else {
+ IsMatched = false;
+ break;
}
+ }
- // If the name was qualified we only want to add results if we evaluated
- // all contexts.
- if (IsFullyQualified)
- IsMatched &= (SymbolContext == Symbol.getContexts().end());
+ // If the name was qualified we only want to add results if we evaluated
+ // all contexts.
+ if (IsFullyQualified)
+ IsMatched &= (SymbolContext == Symbol.getContexts().end());
- // FIXME: Support full match. At this point, we only find symbols in
- // database which end with the same contexts with the identifier.
- if (IsMatched && IdentiferContext == Names.rend()) {
- // If we're in a situation where we took a prefix but the thing we
- // found couldn't possibly have a nested member ignore it.
- if (TookPrefix &&
- (Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function ||
- Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable ||
- Symbol.getSymbolKind() ==
- SymbolInfo::SymbolKind::EnumConstantDecl ||
- Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro))
- continue;
+ // FIXME: Support full match. At this point, we only find symbols in
+ // database which end with the same contexts with the identifier.
+ if (IsMatched && IdentiferContext == Names.rend()) {
+ // If we're in a situation where we took a prefix but the thing we
+ // found couldn't possibly have a nested member ignore it.
+ if (TookPrefix &&
+ (Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function ||
+ Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable ||
+ Symbol.getSymbolKind() ==
+ SymbolInfo::SymbolKind::EnumConstantDecl ||
+ Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro))
+ continue;
- MatchedSymbols.push_back(std::move(SymAndSig));
- }
+ MatchedSymbols.push_back(std::move(SymAndSig));
}
}
Names.pop_back();
@@ -152,7 +150,7 @@ SymbolIndexManager::search(llvm::StringR
rank(MatchedSymbols, FileName);
// Strip signals, they are no longer needed.
std::vector<SymbolInfo> Res;
- for (const auto &SymAndSig : MatchedSymbols)
+ for (auto &SymAndSig : MatchedSymbols)
Res.push_back(std::move(SymAndSig.Symbol));
return Res;
}
Modified: clang-tools-extra/trunk/include-fixer/tool/ClangIncludeFixer.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/include-fixer/tool/ClangIncludeFixer.cpp?rev=297630&r1=297629&r2=297630&view=diff
==============================================================================
--- clang-tools-extra/trunk/include-fixer/tool/ClangIncludeFixer.cpp (original)
+++ clang-tools-extra/trunk/include-fixer/tool/ClangIncludeFixer.cpp Mon Mar 13 10:55:59 2017
@@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
+#include "FuzzySymbolIndex.h"
#include "InMemorySymbolIndex.h"
#include "IncludeFixer.h"
#include "IncludeFixerContext.h"
@@ -83,14 +84,16 @@ namespace {
cl::OptionCategory IncludeFixerCategory("Tool options");
enum DatabaseFormatTy {
- fixed, ///< Hard-coded mapping.
- yaml, ///< Yaml database created by find-all-symbols.
+ fixed, ///< Hard-coded mapping.
+ yaml, ///< Yaml database created by find-all-symbols.
+ fuzzyYaml, ///< Yaml database with fuzzy-matched identifiers.
};
cl::opt<DatabaseFormatTy> DatabaseFormat(
"db", cl::desc("Specify input format"),
cl::values(clEnumVal(fixed, "Hard-coded mapping"),
- clEnumVal(yaml, "Yaml database created by find-all-symbols")),
+ clEnumVal(yaml, "Yaml database created by find-all-symbols"),
+ clEnumVal(fuzzyYaml, "Yaml database, with fuzzy-matched names")),
cl::init(yaml), cl::cat(IncludeFixerCategory));
cl::opt<std::string> Input("input",
@@ -215,6 +218,21 @@ createSymbolIndexManager(StringRef FileP
SymbolIndexMgr->addSymbolIndex(std::move(CreateYamlIdx));
break;
}
+ case fuzzyYaml: {
+ // This mode is not very useful, because we don't correct the identifier.
+ // Its main purpose is to expose FuzzySymbolIndex to tests.
+ SymbolIndexMgr->addSymbolIndex(
+ []() -> std::unique_ptr<include_fixer::SymbolIndex> {
+ auto DB = include_fixer::FuzzySymbolIndex::createFromYAML(Input);
+ if (!DB) {
+ llvm::errs() << "Couldn't load fuzzy YAML db: "
+ << llvm::toString(DB.takeError()) << '\n';
+ return nullptr;
+ }
+ return std::move(*DB);
+ });
+ break;
+ }
}
return SymbolIndexMgr;
}
Modified: clang-tools-extra/trunk/test/include-fixer/Inputs/fake_yaml_db.yaml
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/test/include-fixer/Inputs/fake_yaml_db.yaml?rev=297630&r1=297629&r2=297630&view=diff
==============================================================================
--- clang-tools-extra/trunk/test/include-fixer/Inputs/fake_yaml_db.yaml (original)
+++ clang-tools-extra/trunk/test/include-fixer/Inputs/fake_yaml_db.yaml Mon Mar 13 10:55:59 2017
@@ -10,6 +10,17 @@ Type: Class
Seen: 1
Used: 0
---
+Name: foo_bar
+Contexts:
+ - ContextType: Namespace
+ ContextName: a
+ - ContextType: Namespace
+ ContextName: b
+FilePath: foobar.h
+Type: Class
+Seen: 0
+Used: 0
+---
Name: bar
Contexts:
- ContextType: Namespace
Added: clang-tools-extra/trunk/test/include-fixer/yaml_fuzzy.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/test/include-fixer/yaml_fuzzy.cpp?rev=297630&view=auto
==============================================================================
--- clang-tools-extra/trunk/test/include-fixer/yaml_fuzzy.cpp (added)
+++ clang-tools-extra/trunk/test/include-fixer/yaml_fuzzy.cpp Mon Mar 13 10:55:59 2017
@@ -0,0 +1,9 @@
+// RUN: sed -e 's#//.*$##' %s > %t.cpp
+// RUN: clang-include-fixer -db=fuzzyYaml -input=%p/Inputs/fake_yaml_db.yaml %t.cpp --
+// RUN: FileCheck %s -input-file=%t.cpp
+
+// include-fixer will add the include, but doesn't complete the symbol.
+// CHECK: #include "foobar.h"
+// CHECK: fba f;
+
+b::a::fba f;
Modified: clang-tools-extra/trunk/unittests/include-fixer/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/include-fixer/CMakeLists.txt?rev=297630&r1=297629&r2=297630&view=diff
==============================================================================
--- clang-tools-extra/trunk/unittests/include-fixer/CMakeLists.txt (original)
+++ clang-tools-extra/trunk/unittests/include-fixer/CMakeLists.txt Mon Mar 13 10:55:59 2017
@@ -13,6 +13,7 @@ include_directories(${CLANG_SOURCE_DIR})
add_extra_unittest(IncludeFixerTests
IncludeFixerTest.cpp
+ FuzzySymbolIndexTests.cpp
)
target_link_libraries(IncludeFixerTests
Added: clang-tools-extra/trunk/unittests/include-fixer/FuzzySymbolIndexTests.cpp
URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/include-fixer/FuzzySymbolIndexTests.cpp?rev=297630&view=auto
==============================================================================
--- clang-tools-extra/trunk/unittests/include-fixer/FuzzySymbolIndexTests.cpp (added)
+++ clang-tools-extra/trunk/unittests/include-fixer/FuzzySymbolIndexTests.cpp Mon Mar 13 10:55:59 2017
@@ -0,0 +1,61 @@
+//===-- FuzzySymbolIndexTests.cpp - Fuzzy symbol index unit tests ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzySymbolIndex.h"
+#include "gmock/gmock.h"
+#include "llvm/Support/Regex.h"
+#include "gtest/gtest.h"
+
+using testing::ElementsAre;
+using testing::Not;
+
+namespace clang {
+namespace include_fixer {
+namespace {
+
+// Checks how tokenize() splits and lowercases identifiers.
+TEST(FuzzySymbolIndexTest, Tokenize) {
+ // An uppercase run followed by a capitalized word splits before the last
+ // capital.
+ EXPECT_THAT(FuzzySymbolIndex::tokenize("URLHandlerCallback"),
+ ElementsAre("url", "handler", "callback"));
+ // Underscores separate tokens; a digit run is a token of its own.
+ EXPECT_THAT(FuzzySymbolIndex::tokenize("snake_case11"),
+ ElementsAre("snake", "case", "11"));
+ // Punctuation and whitespace are dropped and never produce empty tokens.
+ EXPECT_THAT(FuzzySymbolIndex::tokenize("__$42!!BOB\nbob"),
+ ElementsAre("42", "bob", "bob"));
+}
+
+// Matcher over a query regexp string (the value under test, `arg`).
+// Succeeds when the regexp, anchored with "^", matches the space-joined
+// tokens of Identifier. An invalid regexp is reported as a mismatch together
+// with the regex error text.
+MATCHER_P(MatchesSymbol, Identifier, "") {
+  llvm::Regex Pattern("^" + arg);
+  std::string Err;
+  if (!Pattern.isValid(Err)) {
+    *result_listener << "invalid regex: " << Err;
+    return false;
+  }
+  auto Tokens = FuzzySymbolIndex::tokenize(Identifier);
+  std::string Target = llvm::join(Tokens.begin(), Tokens.end(), " ");
+  *result_listener << "matching against '" << Target << "'";
+  // Match with the pattern that was just validated instead of compiling a
+  // second, unchecked copy of the same regex.
+  return Pattern.match(Target);
+}
+
+// Checks which queries the generated regexp accepts for a given symbol.
+TEST(FuzzySymbolIndexTest, QueryRegexp) {
+ // Tokenizes the query and builds its regexp, mirroring what an index does.
+ auto QueryRegexp = [](const std::string &query) {
+ return FuzzySymbolIndex::queryRegexp(FuzzySymbolIndex::tokenize(query));
+ };
+ // A single query token may spread its characters over successive symbol
+ // tokens, as long as each piece is a prefix of the token it lands in.
+ EXPECT_THAT(QueryRegexp("uhc"), MatchesSymbol("URLHandlerCallback"));
+ EXPECT_THAT(QueryRegexp("urhaca"), MatchesSymbol("URLHandlerCallback"));
+ EXPECT_THAT(QueryRegexp("uhcb"), Not(MatchesSymbol("URLHandlerCallback")))
+ << "Non-prefix";
+ EXPECT_THAT(QueryRegexp("uc"), Not(MatchesSymbol("URLHandlerCallback")))
+ << "Skip token";
+
+ // "UniP" tokenizes to [uni, p], so its split must line up with unique|ptr.
+ EXPECT_THAT(QueryRegexp("uptr"), MatchesSymbol("unique_ptr"));
+ EXPECT_THAT(QueryRegexp("UniP"), MatchesSymbol("unique_ptr"));
+}
+
+} // namespace
+} // namespace include_fixer
+} // namespace clang
More information about the cfe-commits
mailing list