[llvm] d4892a1 - [Clang] Add a warning on invalid UTF-8 in comments.

Wed Jul 13 01:19:32 PDT 2022

Author: Corentin Jabot
Date: 2022-07-13T10:19:26+02:00
New Revision: d4892a168f51dabdba255af3a8d23049fcb66960

URL: https://github.com/llvm/llvm-project/commit/d4892a168f51dabdba255af3a8d23049fcb66960
DIFF: https://github.com/llvm/llvm-project/commit/d4892a168f51dabdba255af3a8d23049fcb66960.diff

LOG: [Clang] Add a warning on invalid UTF-8 in comments.

Introduce an off-by default `-Winvalid-utf8` warning
that detects invalid UTF-8 code units sequences in comments.

Invalid UTF-8 in other places is already diagnosed,
as that cannot appear in identifiers and other grammar constructs.

The warning is off by default as its likely to be somewhat disruptive
otherwise.

This warning allows clang to conform to the yet-to be approved WG21
"P2295R5 Support for UTF-8 as a portable source file encoding"
paper.

Reviewed By: aaron.ballman, #clang-language-wg

Differential Revision: https://reviews.llvm.org/D128059

Added: 
    clang/test/Lexer/comment-invalid-utf8.c
    clang/test/Lexer/comment-utf8.c

Modified: 
    clang/docs/ReleaseNotes.rst
    clang/include/clang/Basic/DiagnosticLexKinds.td
    clang/lib/Lex/Lexer.cpp
    clang/test/SemaCXX/static-assert.cpp
    llvm/include/llvm/Support/ConvertUTF.h
    llvm/lib/Support/ConvertUTF.cpp

Removed: 
    


################################################################################
diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e09a4a7c91b78..f8977d5ac720b 100644

--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -284,9 +284,11 @@ Improvements to Clang's diagnostics
   unevaluated operands of a ``typeid`` expression, as they are now
   modeled correctly in the CFG. This fixes
   `Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
-- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will 
+- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will
   suggest a fix if the decl being assigned is a parameter that shadows a data
   member of the contained class.
+- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
+  comments.
 
 Non-comprehensive list of changes in this release
 -------------------------------------------------
@@ -613,7 +615,7 @@ AST Matchers
 
 - Added ``forEachTemplateArgument`` matcher which creates a match every
   time a ``templateArgument`` matches the matcher supplied to it.
-  
+
 - Added ``objcStringLiteral`` matcher which matches ObjectiveC String
   literal expressions.
 

diff  --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index ac86076140c58..38ee022e5f04c 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -113,6 +113,8 @@ def warn_four_char_character_literal : Warning<
 // Unicode and UCNs
 def err_invalid_utf8 : Error<
   "source file is not valid UTF-8">;
+def warn_invalid_utf8_in_comment : Extension<
+  "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
 def err_character_not_allowed : Error<
   "unexpected character <U+%0>">;
 def err_character_not_allowed_identifier : Error<

diff  --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 6820057642bea..6b2767dcee3d5 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2392,13 +2392,37 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
   //
   // This loop terminates with CurPtr pointing at the newline (or end of buffer)
   // character that ends the line comment.
+
+  // C++23 [lex.phases] p1
+  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
+  // diagnostic only once per entire ill-formed subsequence to avoid
+  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
+  bool UnicodeDecodingAlreadyDiagnosed = false;
+
   char C;
   while (true) {
     C = *CurPtr;
     // Skip over characters in the fast loop.
-    while (C != 0 &&                // Potentially EOF.
-           C != '\n' && C != '\r')  // Newline or DOS-style newline.
+    while (isASCII(C) && C != 0 &&   // Potentially EOF.
+           C != '\n' && C != '\r') { // Newline or DOS-style newline.
       C = *++CurPtr;
+      UnicodeDecodingAlreadyDiagnosed = false;
+    }
+
+    if (!isASCII(C)) {
+      unsigned Length = llvm::getUTF8SequenceSize(
+          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
+      if (Length == 0) {
+        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
+          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
+        UnicodeDecodingAlreadyDiagnosed = true;
+        ++CurPtr;
+      } else {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        CurPtr += Length;
+      }
+      continue;
+    }
 
     const char *NextLine = CurPtr;
     if (C != 0) {
@@ -2665,6 +2689,12 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
   if (C == '/')
     C = *CurPtr++;
 
+  // C++23 [lex.phases] p1
+  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
+  // diagnostic only once per entire ill-formed subsequence to avoid
+  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
+  bool UnicodeDecodingAlreadyDiagnosed = false;
+
   while (true) {
     // Skip over all non-interesting characters until we find end of buffer or a
     // (probably ending) '/' character.
@@ -2673,14 +2703,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
         // doesn't check for '\0'.
         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
       // While not aligned to a 16-byte boundary.
-      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
+      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
+        if (!isASCII(C))
+          goto MultiByteUTF8;
         C = *CurPtr++;
-
+      }
       if (C == '/') goto FoundSlash;
 
 #ifdef __SSE2__
       __m128i Slashes = _mm_set1_epi8('/');
-      while (CurPtr+16 <= BufferEnd) {
+      while (CurPtr + 16 < BufferEnd) {
+        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
+        if (LLVM_UNLIKELY(Mask != 0)) {
+          goto MultiByteUTF8;
+        }
+        // look for slashes
         int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                     Slashes));
         if (cmp != 0) {
@@ -2693,21 +2730,38 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
         CurPtr += 16;
       }
 #elif __ALTIVEC__
+      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                        0x80, 0x80, 0x80, 0x80};
       __vector unsigned char Slashes = {
         '/', '/', '/', '/',  '/', '/', '/', '/',
         '/', '/', '/', '/',  '/', '/', '/', '/'
       };
-      while (CurPtr + 16 <= BufferEnd &&
-             !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
+      while (CurPtr + 16 < BufferEnd) {
+        if (LLVM_UNLIKELY(
+                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
+          goto MultiByteUTF8;
+        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
+          break;
+        }
         CurPtr += 16;
+      }
+
 #else
-      // Scan for '/' quickly.  Many block comments are very large.
-      while (CurPtr[0] != '/' &&
-             CurPtr[1] != '/' &&
-             CurPtr[2] != '/' &&
-             CurPtr[3] != '/' &&
-             CurPtr+4 < BufferEnd) {
-        CurPtr += 4;
+      while (CurPtr + 16 < BufferEnd) {
+        bool HasNonASCII = false;
+        for (unsigned I = 0; I < 16; ++I)
+          HasNonASCII |= !isASCII(CurPtr[I]);
+
+        if (LLVM_UNLIKELY(HasNonASCII))
+          goto MultiByteUTF8;
+
+        bool HasSlash = false;
+        for (unsigned I = 0; I < 16; ++I)
+          HasSlash |= CurPtr[I] == '/';
+        if (HasSlash)
+          break;
+        CurPtr += 16;
       }
 #endif
 
@@ -2715,9 +2769,30 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
       C = *CurPtr++;
     }
 
-    // Loop to scan the remainder.
-    while (C != '/' && C != '\0')
+    // Loop to scan the remainder, warning on invalid UTF-8
+    // if the corresponding warning is enabled, emitting a diagnostic only once
+    // per sequence that cannot be decoded.
+    while (C != '/' && C != '\0') {
+      if (isASCII(C)) {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        C = *CurPtr++;
+        continue;
+      }
+    MultiByteUTF8:
+      // CurPtr is 1 code unit past C, so to decode
+      // the codepoint, we need to read from the previous position.
+      unsigned Length = llvm::getUTF8SequenceSize(
+          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
+      if (Length == 0) {
+        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
+          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
+        UnicodeDecodingAlreadyDiagnosed = true;
+      } else {
+        UnicodeDecodingAlreadyDiagnosed = false;
+        CurPtr += Length - 1;
+      }
       C = *CurPtr++;
+    }
 
     if (C == '/') {
   FoundSlash:

diff  --git a/clang/test/Lexer/comment-invalid-utf8.c b/clang/test/Lexer/comment-invalid-utf8.c
new file mode 100644
index 0000000000000..b8bf551dd8564
--- /dev/null
+++ b/clang/test/Lexer/comment-invalid-utf8.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
+// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
+// nowarn-no-diagnostics
+
+// This file is purposefully encoded as windows-1252
+// be careful when modifying.
+
+//
+// expected-warning at -1 {{invalid UTF-8 in comment}}
+
+//      
+// expected-warning at -1 6{{invalid UTF-8 in comment}}
+
+/**/
+// expected-warning at -1 {{invalid UTF-8 in comment}}
+
+/*     */
+// expected-warning at -1 6{{invalid UTF-8 in comment}}
+
+/*
+
+*/
+// expected-warning at -2 {{invalid UTF-8 in comment}}
+
+// abcd
+// abcd
+// expected-warning at -1 {{invalid UTF-8 in comment}}

diff  --git a/clang/test/Lexer/comment-utf8.c b/clang/test/Lexer/comment-utf8.c
new file mode 100644
index 0000000000000..1b3009be95205
--- /dev/null
+++ b/clang/test/Lexer/comment-utf8.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify
+// expected-no-diagnostics
+
+
+//Â§ Â§ Â§ ð ä½ å¥½ Â©
+
+/*Â§ Â§ Â§ ð ä½ å¥½ Â©*/
+
+/*
+Â§ Â§ Â§ ð ä½ å¥½ Â©Â©Â©
+*/
+
+/* Â§ Â§ Â§ ð ä½ å¥½ Â© */
+/*
+    a longer comment to exerce the vectorized code path
+    ----------------------------------------------------
+    Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±Î±      // here is some unicode
+    ----------------------------------------------------
+    ----------------------------------------------------
+*/
+
+// The following test checks that a short comment is not merged
+// with a subsequent long comment containing utf-8
+enum a {
+    x  /* 01234567890ABCDEF*/
+};
+/*Î±Î±Î±Î±Î±Î±Î±Î±Î±*/

diff  --git a/clang/test/SemaCXX/static-assert.cpp b/clang/test/SemaCXX/static-assert.cpp
index 5801320f305da..2ac0dfdea9eae 100644
--- a/clang/test/SemaCXX/static-assert.cpp
+++ b/clang/test/SemaCXX/static-assert.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8
 
 int f(); // expected-note {{declared here}}
 

diff  --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h
index 662f3aca5b543..1e05cfe1f4241 100644
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@@ -181,6 +181,8 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
 
 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
 
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
+
 unsigned getNumBytesForUTF8(UTF8 firstByte);
 
 /*************************************************************************/

diff  --git a/llvm/lib/Support/ConvertUTF.cpp b/llvm/lib/Support/ConvertUTF.cpp
index e24a918c5c898..5436f557b993d 100644
--- a/llvm/lib/Support/ConvertUTF.cpp
+++ b/llvm/lib/Support/ConvertUTF.cpp
@@ -417,6 +417,16 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
     return isLegalUTF8(source, length);
 }
 
+/*
+ * Exported function to return the size of the first utf-8 code unit sequence,
+ * Or 0 if the sequence is not valid;
+ */
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
+  int length = trailingBytesForUTF8[*source] + 1;
+  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
+                                                                       : 0;
+}
+
 /* --------------------------------------------------------------------- */
 
 static unsigned