[PATCH] D81530: [clangd] Log rather than assert on bad UTF-8.
Sam McCall via Phabricator via cfe-commits
cfe-commits at lists.llvm.org
Tue Jun 9 22:01:45 PDT 2020
sammccall created this revision.
sammccall added a reviewer: kadircet.
Herald added subscribers: cfe-commits, usaxena95, arphaman, jkorous, MaskRay, ilya-biryukov.
Herald added a project: clang.
sammccall added a subscriber: jaafar.
sammccall added a comment.
@jaafar would you be able to verify this is equivalent to D74731 <https://reviews.llvm.org/D74731> for your purposes?
If so I'd like to cherrypick this into tho 10.x branch after landing.
I don't love this change, but it prevents crashing when indexing boost headers,
and I can't think of a better practical alternative.
Based on a patch by AnakinZheng!
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D81530
Files:
clang-tools-extra/clangd/SourceCode.cpp
clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
Index: clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
+++ clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
@@ -1490,6 +1490,20 @@
EXPECT_THAT(Symbols, Contains(QName("operator delete")));
}
+TEST_F(SymbolCollectorTest, BadUTF8) {
+ // Extracted from boost/spirit/home/support/char_encoding/iso8859_1.hpp
+ // This looks like UTF-8 and fools clang, but has high-ISO-8859-1 comments.
+ const char *Header = "int PUNCT = 0;\n"
+ "int types[] = { /* \xa1 */PUNCT };";
+ CollectorOpts.RefFilter = RefKind::All;
+ CollectorOpts.RefsInHeaders = true;
+ runSymbolCollector(Header, "");
+ EXPECT_THAT(Symbols, Contains(QName("types")));
+ EXPECT_THAT(Symbols, Contains(QName("PUNCT")));
+ // Reference is stored, although offset within line is not reliable.
+ EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "PUNCT").ID, _)));
+}
+
} // namespace
} // namespace clangd
} // namespace clang
Index: clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@@ -71,6 +71,21 @@
EXPECT_EQ(lspLength("😂"), 1UL);
}
+TEST(SourceCodeTests, lspLengthBadUTF8) {
+ // Results are not well-defined if source file isn't valid UTF-8.
+ // However we shouldn't crash or return something totally wild.
+ const char *BadUTF8[] = {"\xa0", "\xff\xff\xff\xff\xff"};
+
+ for (OffsetEncoding Encoding :
+ {OffsetEncoding::UTF8, OffsetEncoding::UTF16, OffsetEncoding::UTF32}) {
+ WithContextValue UTF32(kCurrentOffsetEncoding, Encoding);
+ for (const char *Bad : BadUTF8) {
+ EXPECT_GE(lspLength(Bad), 0u);
+ EXPECT_LE(lspLength(Bad), strlen(Bad));
+ }
+ }
+}
+
// The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
const char File[] = R"(0:0 = 0
1:0 → 8
Index: clang-tools-extra/clangd/SourceCode.cpp
===================================================================
--- clang-tools-extra/clangd/SourceCode.cpp
+++ clang-tools-extra/clangd/SourceCode.cpp
@@ -55,8 +55,14 @@
// Iterates over unicode codepoints in the (UTF-8) string. For each,
// invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true.
// Returns true if CB returned true, false if we hit the end of string.
+//
+// If the string is not valid UTF-8, we log this error and "decode" the
+// text in some arbitrary way. This is pretty sad, but this tends to happen deep
+// within indexing of headers where clang misdetected the encoding, and
+// propagating the error all the way back up is (probably?) not be worth it.
template <typename Callback>
static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
+ bool LoggedInvalid = false;
// A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
// Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
for (size_t I = 0; I < U8.size();) {
@@ -70,9 +76,20 @@
// This convenient property of UTF-8 holds for all non-ASCII characters.
size_t UTF8Length = llvm::countLeadingOnes(C);
// 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
- // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
- assert((UTF8Length >= 2 && UTF8Length <= 4) &&
- "Invalid UTF-8, or transcoding bug?");
+ // 11111xxx is not valid UTF-8 at all, maybe some ISO-8859-*.
+ if (LLVM_UNLIKELY(UTF8Length < 2 || UTF8Length > 4)) {
+ if (!LoggedInvalid) {
+ elog("File has invalid UTF-8 near offset {0}: {1}", I, llvm::toHex(U8));
+ LoggedInvalid = true;
+ }
+ // We can't give a correct result, but avoid returning something wild.
+ // Pretend this is a valid ASCII byte, for lack of better options.
+ // (Too late to get ISO-8859-* right, we've skipped some bytes already).
+ if (CB(1, 1))
+ return true;
+ ++I;
+ continue;
+ }
I += UTF8Length; // Skip over all trailing bytes.
// A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
// Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D81530.269732.patch
Type: text/x-patch
Size: 4364 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20200610/6d22dae7/attachment.bin>
More information about the cfe-commits
mailing list