[PATCH] D74731: [Clangd] Fixed assertion when processing extended ASCII characters.

Yancheng Zheng via Phabricator via cfe-commits cfe-commits at lists.llvm.org
Mon Feb 17 14:18:47 PST 2020


AnakinZheng updated this revision to Diff 245036.
AnakinZheng added a comment.

Implement @sammccall 's suggestion and add test.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D74731/new/

https://reviews.llvm.org/D74731

Files:
  clang-tools-extra/clangd/SourceCode.cpp
  clang-tools-extra/clangd/unittests/SourceCodeTests.cpp


Index: clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@@ -57,6 +57,14 @@
   EXPECT_EQ(lspLength("¥"), 1UL);
   // astral
   EXPECT_EQ(lspLength("😂"), 2UL);
+  // Extended ASCII
+  const char extendedASCIIStr[2] = {static_cast<char>(160U), '\0'};
+  EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+  // Invalid UTF16
+  const char invalidUTF16[6] = {
+      static_cast<char>(0xFF), static_cast<char>(0xFF), static_cast<char>(0xFF),
+      static_cast<char>(0xFF), static_cast<char>(0xFF), '\0'};
+  EXPECT_EQ(lspLength(invalidUTF16), 2UL);
 
   WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
   EXPECT_EQ(lspLength(""), 0UL);
@@ -66,6 +74,10 @@
   EXPECT_EQ(lspLength("¥"), 2UL);
   // astral
   EXPECT_EQ(lspLength("😂"), 4UL);
+  // Extended ASCII
+  EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+  // Invalid UTF16
+  EXPECT_EQ(lspLength(invalidUTF16), 5UL);
 
   WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
   EXPECT_EQ(lspLength(""), 0UL);
@@ -75,6 +87,10 @@
   EXPECT_EQ(lspLength("¥"), 1UL);
   // astral
   EXPECT_EQ(lspLength("😂"), 1UL);
+  // Extended ASCII
+  EXPECT_EQ(lspLength(extendedASCIIStr), 1UL);
+  // Invalid UTF16
+  EXPECT_EQ(lspLength(invalidUTF16), 1UL);
 }
 
 // The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
Index: clang-tools-extra/clangd/SourceCode.cpp
===================================================================
--- clang-tools-extra/clangd/SourceCode.cpp
+++ clang-tools-extra/clangd/SourceCode.cpp
@@ -67,15 +67,20 @@
     }
     // This convenient property of UTF-8 holds for all non-ASCII characters.
     size_t UTF8Length = llvm::countLeadingOnes(C);
-    // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
-    // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
-    assert((UTF8Length >= 2 && UTF8Length <= 4) &&
-           "Invalid UTF-8, or transcoding bug?");
     I += UTF8Length; // Skip over all trailing bytes.
-    // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
-    // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
-    if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
-      return true;
+    // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
+    // 11111xxx is not valid UTF-8 at all. Log it because it's probably our bug.
+    if(UTF8Length == 1 || UTF8Length > 4) {
+      vlog("File contains invalid UTF-8, or transcoding bug? UTF8Length = {0}, string = {1}", UTF8Length, U8);
+      // If UTF8 length is 1, treat as one UTF8, if above 4, treat as one UTF16
+      if (CB(UTF8Length, UTF8Length == 1 ? 1 : 2))
+        return true;
+    } else {
+      // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
+      // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
+      if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
+        return true;
+    }
   }
   return false;
 }


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D74731.245036.patch
Type: text/x-patch
Size: 3163 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20200217/042f5646/attachment.bin>


More information about the cfe-commits mailing list