[clang-tools-extra] f2c8f6e - [clangd] Log rather than assert on bad UTF-8.

Wed Jun 10 02:40:32 PDT 2020

Author: Sam McCall
Date: 2020-06-10T11:40:23+02:00
New Revision: f2c8f6e16d25ca356f58995109292735b222b1b7

URL: https://github.com/llvm/llvm-project/commit/f2c8f6e16d25ca356f58995109292735b222b1b7
DIFF: https://github.com/llvm/llvm-project/commit/f2c8f6e16d25ca356f58995109292735b222b1b7.diff

LOG: [clangd] Log rather than assert on bad UTF-8.

Summary:
I don't love this behavior, but it prevents crashing when indexing boost
headers, and I can't think of a better practical alternative.

Fixes https://reviews.llvm.org/D81530

Based on a patch by AnakinZheng!

Reviewers: kadircet

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits, AnakinZheng

Tags: #clang

Differential Revision: https://reviews.llvm.org/D81530

Added: 
    

Modified: 
    clang-tools-extra/clangd/SourceCode.cpp
    clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
    clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index f6f9371d436f..c8502c580996 100644

--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -55,8 +55,14 @@ namespace clangd {
 // Iterates over unicode codepoints in the (UTF-8) string. For each,
 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true.
 // Returns true if CB returned true, false if we hit the end of string.
+//
+// If the string is not valid UTF-8, we log this error and "decode" the
+// text in some arbitrary way. This is pretty sad, but this tends to happen deep
+// within indexing of headers where clang misdetected the encoding, and
+// propagating the error all the way back up is (probably?) not be worth it.
 template <typename Callback>
 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
+  bool LoggedInvalid = false;
   // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
   // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   for (size_t I = 0; I < U8.size();) {
@@ -70,9 +76,20 @@ static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
     // This convenient property of UTF-8 holds for all non-ASCII characters.
     size_t UTF8Length = llvm::countLeadingOnes(C);
     // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
-    // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
-    assert((UTF8Length >= 2 && UTF8Length <= 4) &&
-           "Invalid UTF-8, or transcoding bug?");
+    // 11111xxx is not valid UTF-8 at all, maybe some ISO-8859-*.
+    if (LLVM_UNLIKELY(UTF8Length < 2 || UTF8Length > 4)) {
+      if (!LoggedInvalid) {
+        elog("File has invalid UTF-8 near offset {0}: {1}", I, llvm::toHex(U8));
+        LoggedInvalid = true;
+      }
+      // We can't give a correct result, but avoid returning something wild.
+      // Pretend this is a valid ASCII byte, for lack of better options.
+      // (Too late to get ISO-8859-* right, we've skipped some bytes already).
+      if (CB(1, 1))
+        return true;
+      ++I;
+      continue;
+    }
     I += UTF8Length; // Skip over all trailing bytes.
     // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
     // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)

diff  --git a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
index 0521327b623b..9c3ae4df51ff 100644
--- a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@@ -71,6 +71,21 @@ TEST(SourceCodeTests, lspLength) {
   EXPECT_EQ(lspLength("😂"), 1UL);
 }
 
+TEST(SourceCodeTests, lspLengthBadUTF8) {
+  // Results are not well-defined if source file isn't valid UTF-8.
+  // However we shouldn't crash or return something totally wild.
+  const char *BadUTF8[] = {"\xa0", "\xff\xff\xff\xff\xff"};
+
+  for (OffsetEncoding Encoding :
+       {OffsetEncoding::UTF8, OffsetEncoding::UTF16, OffsetEncoding::UTF32}) {
+    WithContextValue UTF32(kCurrentOffsetEncoding, Encoding);
+    for (const char *Bad : BadUTF8) {
+      EXPECT_GE(lspLength(Bad), 0u);
+      EXPECT_LE(lspLength(Bad), strlen(Bad));
+    }
+  }
+}
+
 // The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
 const char File[] = R"(0:0 = 0
 1:0 → 8

diff  --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
index d3cd9f542de3..d90f99ff9f8c 100644
--- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
@@ -1490,6 +1490,20 @@ TEST_F(SymbolCollectorTest, InvalidSourceLoc) {
   EXPECT_THAT(Symbols, Contains(QName("operator delete")));
 }
 
+TEST_F(SymbolCollectorTest, BadUTF8) {
+  // Extracted from boost/spirit/home/support/char_encoding/iso8859_1.hpp
+  // This looks like UTF-8 and fools clang, but has high-ISO-8859-1 comments.
+  const char *Header = "int PUNCT = 0;\n"
+                       "int types[] = { /* \xa1 */PUNCT };";
+  CollectorOpts.RefFilter = RefKind::All;
+  CollectorOpts.RefsInHeaders = true;
+  runSymbolCollector(Header, "");
+  EXPECT_THAT(Symbols, Contains(QName("types")));
+  EXPECT_THAT(Symbols, Contains(QName("PUNCT")));
+  // Reference is stored, although offset within line is not reliable.
+  EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "PUNCT").ID, _)));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang