[clang-tools-extra] 3f1c2bf - [clangd] go-to-def on names in comments etc that are used nearby.

Wed Apr 22 10:46:52 PDT 2020

Author: Sam McCall
Date: 2020-04-22T19:46:41+02:00
New Revision: 3f1c2bf1712c7496a80a0f89036ab1625ff347a5

URL: https://github.com/llvm/llvm-project/commit/3f1c2bf1712c7496a80a0f89036ab1625ff347a5
DIFF: https://github.com/llvm/llvm-project/commit/3f1c2bf1712c7496a80a0f89036ab1625ff347a5.diff

LOG: [clangd] go-to-def on names in comments etc that are used nearby.

Summary:
This is intended as a companion to (and is inspired by) D72874 which attempts to
resolve these cases using the index.
The intent is we'd try this strategy after the AST-based approach but before the
index-based (I think local usages would be more reliable than index matches).

Reviewers: nridge

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D75479

Added: 
    

Modified: 
    clang-tools-extra/clangd/SourceCode.cpp
    clang-tools-extra/clangd/SourceCode.h
    clang-tools-extra/clangd/XRefs.cpp
    clang-tools-extra/clangd/XRefs.h
    clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
    clang-tools-extra/clangd/unittests/XRefsTests.cpp

Removed: 
    


################################################################################
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index 1943784bfd18..dd4c863cb96a 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -855,6 +855,96 @@ llvm::StringSet<> collectWords(llvm::StringRef Content) {
   return Result;
 }
 
+static bool isLikelyIdentifier(llvm::StringRef Word, llvm::StringRef Before,
+                               llvm::StringRef After) {
+  // `foo` is an identifier.
+  if (Before.endswith("`") && After.startswith("`"))
+    return true;
+  // In foo::bar, both foo and bar are identifiers.
+  if (Before.endswith("::") || After.startswith("::"))
+    return true;
+  // Doxygen tags like \c foo indicate identifiers.
+  // Only the last 100 characters before the word are inspected for a tag.
+  // This duplicates clang's doxygen parser, revisit if it gets complicated.
+  Before = Before.take_back(100); // Bound the backwards search.
+  auto Pos = Before.find_last_of("\\@");
+  if (Pos != llvm::StringRef::npos) {
+    llvm::StringRef Tag = Before.substr(Pos + 1).rtrim(' ');
+    if (Tag == "p" || Tag == "c" || Tag == "class" || Tag == "tparam" ||
+        Tag == "param" || Tag == "param[in]" || Tag == "param[out]" ||
+        Tag == "param[in,out]" || Tag == "retval" || Tag == "throw" ||
+        Tag == "throws" || Tag == "link")
+      return true;
+  }
+
+  // Word contains underscore.
+  // This handles things like snake_case and MACRO_CASE.
+  if (Word.contains('_')) {
+    return true;
+  }
+  // Word contains capital letter other than at beginning.
+  // This handles things like lowerCamel and UpperCamel.
+  // The check for also containing a lowercase letter is to rule out
+  // initialisms like "HTTP".
+  bool HasLower = Word.find_if(clang::isLowercase) != StringRef::npos;
+  bool HasUpper = Word.substr(1).find_if(clang::isUppercase) != StringRef::npos;
+  if (HasLower && HasUpper) {
+    return true;
+  }
+  // FIXME: consider mid-sentence Capitalization?
+  return false;
+}
+
+llvm::Optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc,
+                                                  const syntax::TokenBuffer &TB,
+                                                  const LangOptions &LangOpts) {
+  const auto &SM = TB.sourceManager();
+  auto Touching = syntax::spelledTokensTouching(SpelledLoc, TB);
+  for (const auto &T : Touching) {
+    // If the token is an identifier or a keyword, don't use any heuristics.
+    if (tok::isAnyIdentifier(T.kind()) || tok::getKeywordSpelling(T.kind())) {
+      SpelledWord Result;
+      Result.Location = T.location();
+      Result.Text = T.text(SM);
+      Result.LikelyIdentifier = tok::isAnyIdentifier(T.kind());
+      Result.PartOfSpelledToken = &T;
+      Result.SpelledToken = &T;
+      auto Expanded =
+          TB.expandedTokens(SM.getMacroArgExpandedLocation(T.location()));
+      if (Expanded.size() == 1 && Expanded.front().text(SM) == Result.Text)
+        Result.ExpandedToken = &Expanded.front();
+      return Result;
+    }
+  }
+  FileID File;
+  unsigned Offset;
+  std::tie(File, Offset) = SM.getDecomposedLoc(SpelledLoc);
+  bool Invalid = false;
+  llvm::StringRef Code = SM.getBufferData(File, &Invalid);
+  if (Invalid)
+    return llvm::None;
+  unsigned B = Offset, E = Offset;
+  while (B > 0 && isIdentifierBody(Code[B - 1]))
+    --B;
+  while (E < Code.size() && isIdentifierBody(Code[E]))
+    ++E;
+  if (B == E)
+    return llvm::None;
+
+  SpelledWord Result;
+  Result.Location = SM.getComposedLoc(File, B);
+  Result.Text = Code.slice(B, E);
+  Result.LikelyIdentifier =
+      isLikelyIdentifier(Result.Text, Code.substr(0, B), Code.substr(E)) &&
+      // should not be a keyword
+      tok::isAnyIdentifier(
+          IdentifierTable(LangOpts).get(Result.Text).getTokenID());
+  for (const auto &T : Touching)
+    if (T.location() <= Result.Location)
+      Result.PartOfSpelledToken = &T;
+  return Result;
+}
+
 llvm::Optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok,
                                            Preprocessor &PP) {
   SourceLocation Loc = SpelledTok.location();

diff --git a/clang-tools-extra/clangd/SourceCode.h b/clang-tools-extra/clangd/SourceCode.h
index dfa685fdd795..d1e5aa5f7c2b 100644
--- a/clang-tools-extra/clangd/SourceCode.h
+++ b/clang-tools-extra/clangd/SourceCode.h
@@ -216,6 +216,35 @@ std::vector<Range> collectIdentifierRanges(llvm::StringRef Identifier,
 /// - drops stopwords like "get" and "for"
 llvm::StringSet<> collectWords(llvm::StringRef Content);
 
+// Something that looks like a word in the source code.
+// Could be a "real" token that's "live" in the AST, a spelled token consumed by
+// the preprocessor, or part of a spelled token (e.g. word in a comment).
+struct SpelledWord {
+  // (Spelling) location of the start of the word.
+  SourceLocation Location;
+  // The range of the word itself, excluding any quotes.
+  // This is a subrange of the file buffer.
+  llvm::StringRef Text;
+  // Whether this word is likely to refer to an identifier. True if:
+  // - the word is a spelled identifier token
+  // - Text is identifier-like (e.g. "foo_bar")
+  // - Text is surrounded by backticks (e.g. Foo in "// returns `Foo`")
+  bool LikelyIdentifier = false;
+  // Set if the word is contained in a token spelled in the file.
+  // (This should always be true, but comments aren't retained by TokenBuffer).
+  const syntax::Token *PartOfSpelledToken = nullptr;
+  // Set if the word is exactly a token spelled in the file.
+  const syntax::Token *SpelledToken = nullptr;
+  // Set if the word is a token spelled in the file, and that token survives
+  // preprocessing to emit an expanded token spelled the same way.
+  const syntax::Token *ExpandedToken = nullptr;
+
+  // Find the unique word that contains SpelledLoc or starts/ends there.
+  static llvm::Optional<SpelledWord> touching(SourceLocation SpelledLoc,
+                                              const syntax::TokenBuffer &TB,
+                                              const LangOptions &LangOpts);
+};
+
 /// Heuristically determine namespaces visible at a point, without parsing Code.
 /// This considers using-directives and enclosing namespace-declarations that
 /// are visible (and not obfuscated) in the file itself (not headers).

diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp
index 2e2e6602c8d3..d17fa52bd82c 100644
--- a/clang-tools-extra/clangd/XRefs.cpp
+++ b/clang-tools-extra/clangd/XRefs.cpp
@@ -34,6 +34,7 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
 #include "clang/Index/IndexDataConsumer.h"
 #include "clang/Index/IndexSymbol.h"
 #include "clang/Index/IndexingAction.h"
@@ -48,6 +49,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -315,93 +317,44 @@ locateASTReferent(SourceLocation CurLoc, const syntax::Token *TouchedIdentifier,
   return Result;
 }
 
-llvm::StringRef wordTouching(llvm::StringRef Code, unsigned Offset) {
-  unsigned B = Offset, E = Offset;
-  while (B > 0 && isIdentifierBody(Code[B - 1]))
-    --B;
-  while (E < Code.size() && isIdentifierBody(Code[E]))
-    ++E;
-  return Code.slice(B, E);
+bool tokenSpelledAt(SourceLocation SpellingLoc, const syntax::TokenBuffer &TB) {
+  auto ExpandedTokens = TB.expandedTokens(
+      TB.sourceManager().getMacroArgExpandedLocation(SpellingLoc));
+  return !ExpandedTokens.empty();
 }
 
-bool isLikelyToBeIdentifier(StringRef Word) {
-  // Word contains underscore.
-  // This handles things like snake_case and MACRO_CASE.
-  if (Word.contains('_')) {
-    return true;
-  }
-  // Word contains capital letter other than at beginning.
-  // This handles things like lowerCamel and UpperCamel.
-  // The check for also containing a lowercase letter is to rule out
-  // initialisms like "HTTP".
-  bool HasLower = Word.find_if(clang::isLowercase) != StringRef::npos;
-  bool HasUpper = Word.substr(1).find_if(clang::isUppercase) != StringRef::npos;
-  if (HasLower && HasUpper) {
-    return true;
-  }
-  // FIXME: There are other signals we could listen for.
-  // Some of these require inspecting the surroundings of the word as well.
-  //   - mid-sentence Capitalization
-  //   - markup like quotes / backticks / brackets / "\p"
-  //   - word has a qualifier (foo::bar)
-  return false;
-}
-
-bool tokenSurvivedPreprocessing(SourceLocation Loc,
-                                const syntax::TokenBuffer &TB) {
-  auto WordExpandedTokens =
-      TB.expandedTokens(TB.sourceManager().getMacroArgExpandedLocation(Loc));
-  return !WordExpandedTokens.empty();
+llvm::StringRef sourcePrefix(SourceLocation Loc, const SourceManager &SM) {
+  auto D = SM.getDecomposedLoc(Loc);
+  bool Invalid = false;
+  llvm::StringRef Buf = SM.getBufferData(D.first, &Invalid);
+  if (Invalid || D.second > Buf.size())
+    return "";
+  return Buf.substr(0, D.second);
 }
 
 } // namespace
 
 std::vector<LocatedSymbol>
-locateSymbolNamedTextuallyAt(ParsedAST &AST, const SymbolIndex *Index,
-                             SourceLocation Loc,
-                             const std::string &MainFilePath) {
-  const auto &SM = AST.getSourceManager();
-
-  // Get the raw word at the specified location.
-  unsigned Pos;
-  FileID File;
-  std::tie(File, Pos) = SM.getDecomposedLoc(Loc);
-  llvm::StringRef Code = SM.getBufferData(File);
-  llvm::StringRef Word = wordTouching(Code, Pos);
-  if (Word.empty())
-    return {};
-  unsigned WordOffset = Word.data() - Code.data();
-  SourceLocation WordStart = SM.getComposedLoc(File, WordOffset);
-
-  // Attempt to determine the kind of token that contains the word,
-  // and bail if it's a string literal. Note that we cannot always
-  // determine the token kind (e.g. comments, for which we do want
-  // to activate, are not retained by TokenBuffer).
-  for (syntax::Token T :
-       syntax::spelledTokensTouching(WordStart, AST.getTokens())) {
-    if (T.range(AST.getSourceManager()).touches(WordOffset + Word.size())) {
-      if (isStringLiteral(T.kind()))
-        return {};
-    }
-  }
-
-  // Do not consider tokens that survived preprocessing.
-  // We are erring on the safe side here, as a user may expect to get
-  // accurate (as opposed to textual-heuristic) results for such tokens.
-  // FIXME: Relax this for dependent code.
-  if (tokenSurvivedPreprocessing(WordStart, AST.getTokens()))
+locateSymbolTextually(const SpelledWord &Word, ParsedAST &AST,
+                      const SymbolIndex *Index,
+                      const std::string &MainFilePath) {
+  // Don't use heuristics if this is a real identifier, or not an identifier.
+  if (Word.ExpandedToken || !Word.LikelyIdentifier || !Index)
     return {};
-
-  // Additionally filter for signals that the word is likely to be an
-  // identifier. This avoids triggering on e.g. random words in a comment.
-  if (!isLikelyToBeIdentifier(Word))
+  // We don't want to handle words in string literals. It'd be nice to whitelist
+  // comments instead, but they're not retained in TokenBuffer.
+  if (Word.PartOfSpelledToken &&
+      isStringLiteral(Word.PartOfSpelledToken->kind()))
     return {};
 
+  const auto &SM = AST.getSourceManager();
   // Look up the selected word in the index.
   FuzzyFindRequest Req;
-  Req.Query = Word.str();
+  Req.Query = Word.Text.str();
   Req.ProximityPaths = {MainFilePath};
-  Req.Scopes = visibleNamespaces(Code.take_front(Pos), AST.getLangOpts());
+  // Find the namespaces to query by lexing the file.
+  Req.Scopes =
+      visibleNamespaces(sourcePrefix(Word.Location, SM), AST.getLangOpts());
   // FIXME: For extra strictness, consider AnyScope=false.
   Req.AnyScope = true;
   // We limit the results to 3 further below. This limit is to avoid fetching
@@ -416,7 +369,7 @@ locateSymbolNamedTextuallyAt(ParsedAST &AST, const SymbolIndex *Index,
     // This is to avoid too many false positives.
     // We could relax this in the future (e.g. to allow for typos) if we make
     // the query more accurate by other means.
-    if (Sym.Name != Word)
+    if (Sym.Name != Word.Text)
       return;
 
     // Exclude constructor results. They have the same name as the class,
@@ -481,6 +434,82 @@ locateSymbolNamedTextuallyAt(ParsedAST &AST, const SymbolIndex *Index,
   return Results;
 }
 
+const syntax::Token *findNearbyIdentifier(const SpelledWord &Word,
+                                          const syntax::TokenBuffer &TB) {
+  // Don't use heuristics if this is a real identifier.
+  // Unlikely identifiers are OK if they were used as identifiers nearby.
+  if (Word.ExpandedToken)
+    return nullptr;
+  // We don't want to handle words in string literals. It'd be nice to whitelist
+  // comments instead, but they're not retained in TokenBuffer.
+  if (Word.PartOfSpelledToken &&
+      isStringLiteral(Word.PartOfSpelledToken->kind()))
+    return nullptr;
+
+  const SourceManager &SM = TB.sourceManager();
+  // We prefer the closest possible token, line-wise. Backwards is penalized.
+  // Ties are implicitly broken by traversal order (first-one-wins).
+  auto File = SM.getFileID(Word.Location);
+  unsigned WordLine = SM.getSpellingLineNumber(Word.Location);
+  auto Cost = [&](SourceLocation Loc) -> unsigned {
+    assert(SM.getFileID(Loc) == File && "spelled token in wrong file?");
+    unsigned Line = SM.getSpellingLineNumber(Loc);
+    if (Line > WordLine)
+      return 1 + llvm::Log2_64(Line - WordLine);
+    if (Line < WordLine)
+      return 2 + llvm::Log2_64(WordLine - Line);
+    return 0;
+  };
+  const syntax::Token *BestTok = nullptr;
+  // Search bounds are based on word length: 2^N lines forward.
+  unsigned BestCost = Word.Text.size() + 1;
+
+  // Updates BestTok and BestCost if Tok is a good candidate.
+  // Returns true if this token costs too much, telling the caller to stop.
+  auto Consider = [&](const syntax::Token &Tok) {
+    if (!(Tok.kind() == tok::identifier && Tok.text(SM) == Word.Text))
+      return false;
+    // No point guessing the same location we started with.
+    if (Tok.location() == Word.Location)
+      return false;
+    // We've done cheap checks, compute cost so we can break the caller's loop.
+    unsigned TokCost = Cost(Tok.location());
+    if (TokCost >= BestCost)
+      return true; // causes the outer loop to break.
+    // Allow locations that might be part of the AST, and macros (even if empty)
+    // but not things like disabled preprocessor sections.
+    if (!(tokenSpelledAt(Tok.location(), TB) || TB.expansionStartingAt(&Tok)))
+      return false;
+    // We already verified this token is an improvement.
+    BestCost = TokCost;
+    BestTok = &Tok;
+    return false;
+  };
+  auto SpelledTokens = TB.spelledTokens(File);
+  // Split the token stream at the word (predicate: token precedes the word).
+  auto *I = llvm::partition_point(SpelledTokens, [&](const syntax::Token &T) {
+    assert(SM.getFileID(T.location()) == SM.getFileID(Word.Location));
+    return T.location() < Word.Location; // Comparison OK: same file.
+  });
+  // Search for matches after the cursor.
+  for (const syntax::Token &Tok : llvm::makeArrayRef(I, SpelledTokens.end()))
+    if (Consider(Tok))
+      break; // costs of later tokens are greater...
+  // Search for matches before the cursor.
+  for (const syntax::Token &Tok :
+       llvm::reverse(llvm::makeArrayRef(SpelledTokens.begin(), I)))
+    if (Consider(Tok))
+      break;
+
+  if (BestTok)
+    vlog(
+        "Word {0} under cursor {1} isn't a token (after PP), trying nearby {2}",
+        Word.Text, Word.Location.printToString(SM),
+        BestTok->location().printToString(SM));
+
+  return BestTok;
+}
+
 std::vector<LocatedSymbol> locateSymbolAt(ParsedAST &AST, Position Pos,
                                           const SymbolIndex *Index) {
   const auto &SM = AST.getSourceManager();
@@ -516,7 +545,28 @@ std::vector<LocatedSymbol> locateSymbolAt(ParsedAST &AST, Position Pos,
   if (!ASTResults.empty())
     return ASTResults;
 
-  return locateSymbolNamedTextuallyAt(AST, Index, *CurLoc, *MainFilePath);
+  // If the cursor can't be resolved directly, try fallback strategies.
+  auto Word =
+      SpelledWord::touching(*CurLoc, AST.getTokens(), AST.getLangOpts());
+  if (Word) {
+    // Is the same word nearby a real identifier that might refer to something?
+    if (const syntax::Token *NearbyIdent =
+            findNearbyIdentifier(*Word, AST.getTokens())) {
+      if (auto Macro = locateMacroReferent(*NearbyIdent, AST, *MainFilePath))
+        return {*std::move(Macro)};
+      ASTResults = locateASTReferent(NearbyIdent->location(), NearbyIdent, AST,
+                                     *MainFilePath, Index);
+      if (!ASTResults.empty())
+        return ASTResults;
+    }
+    // No nearby word, or it didn't refer to anything either. Try the index.
+    auto TextualResults =
+        locateSymbolTextually(*Word, AST, Index, *MainFilePath);
+    if (!TextualResults.empty())
+      return TextualResults;
+  }
+
+  return {};
 }
 
 std::vector<DocumentLink> getDocumentLinks(ParsedAST &AST) {

diff --git a/clang-tools-extra/clangd/XRefs.h b/clang-tools-extra/clangd/XRefs.h
index 8f42ca8d3265..af78ec780c5a 100644
--- a/clang-tools-extra/clangd/XRefs.h
+++ b/clang-tools-extra/clangd/XRefs.h
@@ -16,6 +16,7 @@
 #include "FormattedString.h"
 #include "Path.h"
 #include "Protocol.h"
+#include "SourceCode.h"
 #include "index/Index.h"
 #include "index/SymbolLocation.h"
 #include "clang/AST/Type.h"
@@ -26,6 +27,10 @@
 #include <vector>
 
 namespace clang {
+namespace syntax {
+class Token;
+class TokenBuffer;
+} // namespace syntax
 namespace clangd {
 class ParsedAST;
 
@@ -49,20 +54,22 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LocatedSymbol &);
 std::vector<LocatedSymbol> locateSymbolAt(ParsedAST &AST, Position Pos,
                                           const SymbolIndex *Index = nullptr);
 
-// Tries to provide a textual fallback for locating a symbol referenced at
-// a location, by looking up the word under the cursor as a symbol name in the
-// index. The aim is to pick up references to symbols in contexts where
+// Tries to provide a textual fallback for locating a symbol by looking up the
+// word under the cursor as a symbol name in the index.
+// The aim is to pick up references to symbols in contexts where
 // AST-based resolution does not work, such as comments, strings, and PP
-// disabled regions. The implementation takes a number of measures to avoid
-// false positives, such as looking for some signal that the word at the
-// given location is likely to be an identifier. The function does not
-// currently return results for locations that end up as real expanded
-// tokens, although this may be relaxed for e.g. dependent code in the future.
+// disabled regions.
 // (This is for internal use by locateSymbolAt, and is exposed for testing).
 std::vector<LocatedSymbol>
-locateSymbolNamedTextuallyAt(ParsedAST &AST, const SymbolIndex *Index,
-                             SourceLocation Loc,
-                             const std::string &MainFilePath);
+locateSymbolTextually(const SpelledWord &Word, ParsedAST &AST,
+                      const SymbolIndex *Index,
+                      const std::string &MainFilePath);
+
+// Try to find a proximate occurrence of `Word` as an identifier, which can be
+// used to resolve it.
+// (This is for internal use by locateSymbolAt, and is exposed for testing).
+const syntax::Token *findNearbyIdentifier(const SpelledWord &Word,
+                                          const syntax::TokenBuffer &TB);
 
 /// Get all document links
 std::vector<DocumentLink> getDocumentLinks(ParsedAST &AST);

diff --git a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
index 76a3a3cac267..71721fe81cd9 100644
--- a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@@ -12,6 +12,7 @@
 #include "TestTU.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/TokenKinds.h"
 #include "clang/Format/Format.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_os_ostream.h"
@@ -328,6 +329,101 @@ TEST(SourceCodeTests, CollectWords) {
   EXPECT_EQ(ActualWords, ExpectedWords);
 }
 
+class SpelledWordsTest : public ::testing::Test {
+  llvm::Optional<ParsedAST> AST;
+
+  llvm::Optional<SpelledWord> tryWord(const char *Text) {
+    llvm::Annotations A(Text);
+    auto TU = TestTU::withCode(A.code());
+    AST = TU.build();
+    auto SW = SpelledWord::touching(
+        AST->getSourceManager().getComposedLoc(
+            AST->getSourceManager().getMainFileID(), A.point()),
+        AST->getTokens(), AST->getLangOpts());
+    if (A.ranges().size()) {
+      llvm::StringRef Want = A.code().slice(A.range().Begin, A.range().End);
+      EXPECT_EQ(Want, SW->Text) << Text;
+    }
+    return SW;
+  }
+
+protected:
+  SpelledWord word(const char *Text) {
+    auto Result = tryWord(Text);
+    EXPECT_TRUE(Result) << Text;
+    return Result.getValueOr(SpelledWord());
+  }
+
+  void noWord(const char *Text) { EXPECT_FALSE(tryWord(Text)) << Text; }
+};
+
+TEST_F(SpelledWordsTest, HeuristicBoundaries) {
+  word("// [[^foo]] ");
+  word("// [[f^oo]] ");
+  word("// [[foo^]] ");
+  word("// [[foo^]]+bar ");
+  noWord("//^ foo ");
+  noWord("// foo ^");
+}
+
+TEST_F(SpelledWordsTest, LikelyIdentifier) {
+  EXPECT_FALSE(word("// ^foo ").LikelyIdentifier);
+  EXPECT_TRUE(word("// [[^foo_bar]] ").LikelyIdentifier);
+  EXPECT_TRUE(word("// [[^fooBar]] ").LikelyIdentifier);
+  EXPECT_FALSE(word("// H^TTP ").LikelyIdentifier);
+  EXPECT_TRUE(word("// \\p [[^foo]] ").LikelyIdentifier);
+  EXPECT_TRUE(word("// @param[in] [[^foo]] ").LikelyIdentifier);
+  EXPECT_TRUE(word("// `[[f^oo]]` ").LikelyIdentifier);
+  EXPECT_TRUE(word("// bar::[[f^oo]] ").LikelyIdentifier);
+  EXPECT_TRUE(word("// [[f^oo]]::bar ").LikelyIdentifier);
+}
+
+TEST_F(SpelledWordsTest, Comment) {
+  auto W = word("// [[^foo]]");
+  EXPECT_FALSE(W.PartOfSpelledToken);
+  EXPECT_FALSE(W.SpelledToken);
+  EXPECT_FALSE(W.ExpandedToken);
+}
+
+TEST_F(SpelledWordsTest, PartOfString) {
+  auto W = word(R"( auto str = "foo [[^bar]] baz"; )");
+  ASSERT_TRUE(W.PartOfSpelledToken);
+  EXPECT_EQ(W.PartOfSpelledToken->kind(), tok::string_literal);
+  EXPECT_FALSE(W.SpelledToken);
+  EXPECT_FALSE(W.ExpandedToken);
+}
+
+TEST_F(SpelledWordsTest, DisabledSection) {
+  auto W = word(R"cpp(
+    #if 0
+    foo [[^bar]] baz
+    #endif
+    )cpp");
+  ASSERT_TRUE(W.SpelledToken);
+  EXPECT_EQ(W.SpelledToken->kind(), tok::identifier);
+  EXPECT_EQ(W.SpelledToken, W.PartOfSpelledToken);
+  EXPECT_FALSE(W.ExpandedToken);
+}
+
+TEST_F(SpelledWordsTest, Macros) {
+  auto W = word(R"cpp(
+    #define ID(X) X
+    ID(int [[^i]]);
+    )cpp");
+  ASSERT_TRUE(W.SpelledToken);
+  EXPECT_EQ(W.SpelledToken->kind(), tok::identifier);
+  EXPECT_EQ(W.SpelledToken, W.PartOfSpelledToken);
+  ASSERT_TRUE(W.ExpandedToken);
+  EXPECT_EQ(W.ExpandedToken->kind(), tok::identifier);
+
+  W = word(R"cpp(
+    #define OBJECT Expansion;
+    int [[^OBJECT]];
+    )cpp");
+  EXPECT_TRUE(W.SpelledToken);
+  EXPECT_FALSE(W.ExpandedToken) << "Expanded token is spelled differently";
+}
+
 TEST(SourceCodeTests, VisibleNamespaces) {
   std::vector<std::pair<const char *, std::vector<std::string>>> Cases = {
       {

diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
index ce7f76ccf4f4..027939e15f77 100644
--- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
@@ -685,10 +685,15 @@ TEST(LocateSymbol, Textual) {
 
     auto AST = TU.build();
     auto Index = TU.index();
-    auto Results = locateSymbolNamedTextuallyAt(
-        AST, Index.get(),
+    auto Word = SpelledWord::touching(
         cantFail(sourceLocationInMainFile(AST.getSourceManager(), T.point())),
-        testPath(TU.Filename));
+        AST.getTokens(), AST.getLangOpts());
+    if (!Word) {
+      ADD_FAILURE() << "No word touching point!" << Test;
+      continue;
+    }
+    auto Results =
+        locateSymbolTextually(*Word, AST, Index.get(), testPath(TU.Filename));
 
     if (!WantDecl) {
       EXPECT_THAT(Results, IsEmpty()) << Test;
@@ -788,10 +793,12 @@ TEST(LocateSymbol, TextualAmbiguous) {
   auto TU = TestTU::withCode(T.code());
   auto AST = TU.build();
   auto Index = TU.index();
-  auto Results = locateSymbolNamedTextuallyAt(
-      AST, Index.get(),
+  auto Word = SpelledWord::touching(
       cantFail(sourceLocationInMainFile(AST.getSourceManager(), T.point())),
-      testPath(TU.Filename));
+      AST.getTokens(), AST.getLangOpts());
+  ASSERT_TRUE(Word);
+  auto Results =
+      locateSymbolTextually(*Word, AST, Index.get(), testPath(TU.Filename));
   EXPECT_THAT(Results,
               UnorderedElementsAre(Sym("uniqueMethodName", T.range("FooLoc")),
                                    Sym("uniqueMethodName", T.range("BarLoc"))));
@@ -985,6 +992,101 @@ TEST(LocateSymbol, WithPreamble) {
       ElementsAre(Sym("foo", FooWithoutHeader.range())));
 }
 
+TEST(LocateSymbol, NearbyTokenSmoke) {
+  auto T = Annotations(R"cpp(
+    // prints e^rr and crashes
+    void die(const char* [[err]]);
+  )cpp");
+  auto AST = TestTU::withCode(T.code()).build();
+  // We don't pass an index, so can't hit index-based fallback.
+  EXPECT_THAT(locateSymbolAt(AST, T.point()),
+              ElementsAre(Sym("err", T.range())));
+}
+
+TEST(LocateSymbol, NearbyIdentifier) {
+  const char *Tests[] = {
+      R"cpp(
+      // regular identifiers (won't trigger)
+      int hello;
+      int y = he^llo;
+    )cpp",
+      R"cpp(
+      // disabled preprocessor sections
+      int [[hello]];
+      #if 0
+      int y = ^hello;
+      #endif
+    )cpp",
+      R"cpp(
+      // comments
+      // he^llo, world
+      int [[hello]];
+    )cpp",
+      R"cpp(
+      // not triggered by string literals
+      int hello;
+      const char* greeting = "h^ello, world";
+    )cpp",
+
+      R"cpp(
+      // can refer to macro invocations
+      #define INT int
+      [[INT]] x;
+      // I^NT
+    )cpp",
+
+      R"cpp(
+      // can refer to macro invocations (even if they expand to nothing)
+      #define EMPTY
+      [[EMPTY]] int x;
+      // E^MPTY
+    )cpp",
+
+      R"cpp(
+      // prefer nearest occurrence, backwards is worse than forwards
+      int hello;
+      int x = hello;
+      // h^ello
+      int y = [[hello]];
+      int z = hello;
+    )cpp",
+
+      R"cpp(
+      // short identifiers find near results
+      int [[hi]];
+      // h^i
+    )cpp",
+      R"cpp(
+      // short identifiers don't find far results
+      int hi;
+
+
+
+      // h^i
+    )cpp",
+  };
+  for (const char *Test : Tests) {
+    Annotations T(Test);
+    auto AST = TestTU::withCode(T.code()).build();
+    const auto &SM = AST.getSourceManager();
+    llvm::Optional<Range> Nearby;
+    auto Word =
+        SpelledWord::touching(cantFail(sourceLocationInMainFile(SM, T.point())),
+                              AST.getTokens(), AST.getLangOpts());
+    if (!Word) {
+      ADD_FAILURE() << "No word at point! " << Test;
+      continue;
+    }
+    if (const auto *Tok = findNearbyIdentifier(*Word, AST.getTokens()))
+      Nearby = halfOpenToRange(SM, CharSourceRange::getCharRange(
+                                       Tok->location(), Tok->endLocation()));
+    if (T.ranges().empty())
+      EXPECT_THAT(Nearby, Eq(llvm::None)) << Test;
+    else
+      EXPECT_EQ(Nearby, T.range()) << Test;
+  }
+}
+
 TEST(FindReferences, WithinAST) {
   const char *Tests[] = {
       R"cpp(// Local variable