[PATCH] D74731: [Clangd] Fixed assertion when processing extended ASCII characters.
Yancheng Zheng via Phabricator via cfe-commits
cfe-commits at lists.llvm.org
Mon Feb 17 14:18:47 PST 2020
AnakinZheng updated this revision to Diff 245036.
AnakinZheng added a comment.
Implement @sammccall 's suggestion and add test.
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D74731/new/
https://reviews.llvm.org/D74731
Files:
clang-tools-extra/clangd/SourceCode.cpp
clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
Index: clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@@ -57,6 +57,14 @@
EXPECT_EQ(lspLength("¥"), 1UL);
// astral
EXPECT_EQ(lspLength("😂"), 2UL);
+ // Extended ASCII
+ const char extendedASCIIStr[2] = {static_cast<char>(160U), '\0'};
+ EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+ // Invalid UTF16
+ const char invalidUTF16[6] = {
+ static_cast<char>(0xFF), static_cast<char>(0xFF), static_cast<char>(0xFF),
+ static_cast<char>(0xFF), static_cast<char>(0xFF), '\0'};
+ EXPECT_EQ(lspLength(invalidUTF16), 2UL);
WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
EXPECT_EQ(lspLength(""), 0UL);
@@ -66,6 +74,10 @@
EXPECT_EQ(lspLength("¥"), 2UL);
// astral
EXPECT_EQ(lspLength("😂"), 4UL);
+ // Extended ASCII
+ EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+ // Invalid UTF16
+ EXPECT_EQ(lspLength(invalidUTF16), 5UL);
WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
EXPECT_EQ(lspLength(""), 0UL);
@@ -75,6 +87,10 @@
EXPECT_EQ(lspLength("¥"), 1UL);
// astral
EXPECT_EQ(lspLength("😂"), 1UL);
+ // Extended ASCII
+ EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+ // Invalid UTF16
+ EXPECT_EQ(lspLength(invalidUTF16), 1UL);
}
// The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
Index: clang-tools-extra/clangd/SourceCode.cpp
===================================================================
--- clang-tools-extra/clangd/SourceCode.cpp
+++ clang-tools-extra/clangd/SourceCode.cpp
@@ -67,15 +67,20 @@
}
// This convenient property of UTF-8 holds for all non-ASCII characters.
size_t UTF8Length = llvm::countLeadingOnes(C);
- // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
- // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
- assert((UTF8Length >= 2 && UTF8Length <= 4) &&
- "Invalid UTF-8, or transcoding bug?");
I += UTF8Length; // Skip over all trailing bytes.
- // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
- // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
- if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
- return true;
+ // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
+ // 11111xxx is not valid UTF-8 at all. Log it because it's probably our bug.
+ if(UTF8Length == 1 || UTF8Length > 4) {
+ vlog("File contains invalid UTF-8, or transcoding bug? UTF8Length = {0}, string = {1}", UTF8Length, U8);
+ // If UTF8 length is 1, treat as one UTF8, if above 4, treat as one UTF16
+ if (CB(UTF8Length, UTF8Length == 1 ? 1 : 2))
+ return true;
+ } else {
+ // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
+ // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
+ if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
+ return true;
+ }
}
return false;
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D74731.245036.patch
Type: text/x-patch
Size: 3163 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20200217/042f5646/attachment.bin>
More information about the cfe-commits
mailing list