[clang] e9fe20d - Revert "[Clang] Add a warning on invalid UTF-8 in comments."

Wed Jul 6 13:52:52 PDT 2022

Author: Nico Weber
Date: 2022-07-06T22:51:52+02:00
New Revision: e9fe20dab39edb911afacdb6ed9ec2c7499a59cf

URL: https://github.com/llvm/llvm-project/commit/e9fe20dab39edb911afacdb6ed9ec2c7499a59cf
DIFF: https://github.com/llvm/llvm-project/commit/e9fe20dab39edb911afacdb6ed9ec2c7499a59cf.diff

LOG: Revert "[Clang] Add a warning on invalid UTF-8 in comments."

This reverts commit 4174f0ca618b467571b43cff12cbe4c4239670f8.

Also revert follow-up "[Clang] Fix invalid utf-8 detection"
This reverts commit bf45e27a676d87944f1f13d5f0d0f39935fc4010.

The second commit broke tests (see the comments on
https://reviews.llvm.org/D129223), and it sounds like the first
commit isn't valid without the second one, so both are reverted for now.

Added: 
    

Modified: 
    clang/docs/ReleaseNotes.rst
    clang/include/clang/Basic/DiagnosticLexKinds.td
    clang/lib/Lex/Lexer.cpp
    clang/test/SemaCXX/static-assert.cpp
    llvm/include/llvm/Support/ConvertUTF.h
    llvm/lib/Support/ConvertUTF.cpp

Removed: 
    clang/test/Lexer/comment-invalid-utf8.c


################################################################################
diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c50e1b89649cf..0f542e08b841c 100644

--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -279,8 +279,6 @@ Improvements to Clang's diagnostics
   unevaluated operands of a ``typeid`` expression, as they are now
   modeled correctly in the CFG. This fixes
   `Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
-- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
-  comments.
 
 Non-comprehensive list of changes in this release
 -------------------------------------------------
@@ -578,7 +576,7 @@ AST Matchers
 
 - Added ``forEachTemplateArgument`` matcher which creates a match every
   time a ``templateArgument`` matches the matcher supplied to it.
-
+  
 - Added ``objcStringLiteral`` matcher which matches ObjectiveC String
   literal expressions.
 

diff  --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 38ee022e5f04c..ac86076140c58 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning<
 // Unicode and UCNs
 def err_invalid_utf8 : Error<
   "source file is not valid UTF-8">;
-def warn_invalid_utf8_in_comment : Extension<
-  "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
 def err_character_not_allowed : Error<
   "unexpected character <U+%0>">;
 def err_character_not_allowed_identifier : Error<

diff  --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 351e518c7ed37..6820057642bea 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
   //
   // This loop terminates with CurPtr pointing at the newline (or end of buffer)
   // character that ends the line comment.
-
-  // C++23 [lex.phases] p1
-  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
-  // diagnostic only once per entire ill-formed subsequence to avoid
-  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
-  bool UnicodeDecodingAlreadyDiagnosed = false;
-
   char C;
   while (true) {
     C = *CurPtr;
     // Skip over characters in the fast loop.
-    while (isASCII(C) && C != 0 &&   // Potentially EOF.
-           C != '\n' && C != '\r') { // Newline or DOS-style newline.
+    while (C != 0 &&                // Potentially EOF.
+           C != '\n' && C != '\r')  // Newline or DOS-style newline.
       C = *++CurPtr;
-      UnicodeDecodingAlreadyDiagnosed = false;
-    }
-
-    if (!isASCII(C)) {
-      unsigned Length = llvm::getUTF8SequenceSize(
-          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
-      if (Length == 0) {
-        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
-          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
-        UnicodeDecodingAlreadyDiagnosed = true;
-        ++CurPtr;
-      } else {
-        UnicodeDecodingAlreadyDiagnosed = false;
-        CurPtr += Length;
-      }
-      continue;
-    }
 
     const char *NextLine = CurPtr;
     if (C != 0) {
@@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
   if (C == '/')
     C = *CurPtr++;
 
-  // C++23 [lex.phases] p1
-  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
-  // diagnostic only once per entire ill-formed subsequence to avoid
-  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
-  bool UnicodeDecodingAlreadyDiagnosed = false;
-
   while (true) {
     // Skip over all non-interesting characters until we find end of buffer or a
     // (probably ending) '/' character.
@@ -2703,24 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
         // doesn't check for '\0'.
         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
       // While not aligned to a 16-byte boundary.
-      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
-        if (!isASCII(C)) {
-          CurPtr--;
-          goto MultiByteUTF8;
-        }
+      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
         C = *CurPtr++;
-      }
+
       if (C == '/') goto FoundSlash;
 
 #ifdef __SSE2__
       __m128i Slashes = _mm_set1_epi8('/');
-      while (CurPtr + 16 < BufferEnd) {
-        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
-        if (LLVM_UNLIKELY(Mask != 0)) {
-          CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
-          goto MultiByteUTF8;
-        }
-        // look for slashes
+      while (CurPtr+16 <= BufferEnd) {
         int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                     Slashes));
         if (cmp != 0) {
@@ -2733,41 +2693,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
         CurPtr += 16;
       }
 #elif __ALTIVEC__
-      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                                        0x80, 0x80, 0x80, 0x80};
       __vector unsigned char Slashes = {
         '/', '/', '/', '/',  '/', '/', '/', '/',
         '/', '/', '/', '/',  '/', '/', '/', '/'
       };
-      while (CurPtr + 16 < BufferEnd) {
-        if (LLVM_UNLIKELY(
-                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
-          goto MultiByteUTF8;
-        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
-          C = *CurPtr++;
-          break;
-        }
+      while (CurPtr + 16 <= BufferEnd &&
+             !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
         CurPtr += 16;
-      }
-
 #else
-      while (CurPtr + 16 <= BufferEnd) {
-        bool HasNonASCII = false;
-        for (unsigned I = 0; I < 16; ++I) {
-          HasNonASCII |= !isASCII(CurPtr[I]);
-        }
-
-        if (LLVM_UNLIKELY(HasNonASCII))
-          goto MultiByteUTF8;
-
-        bool HasSlash = false;
-        for (unsigned I = 0; I < 16; ++I) {
-          HasSlash |= CurPtr[I] == '/';
-        }
-        if (HasSlash)
-          break;
-        CurPtr += 16;
+      // Scan for '/' quickly.  Many block comments are very large.
+      while (CurPtr[0] != '/' &&
+             CurPtr[1] != '/' &&
+             CurPtr[2] != '/' &&
+             CurPtr[3] != '/' &&
+             CurPtr+4 < BufferEnd) {
+        CurPtr += 4;
       }
 #endif
 
@@ -2775,28 +2715,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
       C = *CurPtr++;
     }
 
-    // Loop to scan the remainder, warning on invalid UTF-8
-    // if the corresponding warning is enabled, emitting a diagnostic only once
-    // per sequence that cannot be decoded.
-    while (C != '/' && C != '\0') {
-      if (isASCII(C)) {
-        UnicodeDecodingAlreadyDiagnosed = false;
-        C = *CurPtr++;
-        continue;
-      }
-    MultiByteUTF8:
-      unsigned Length = llvm::getUTF8SequenceSize(
-          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
-      if (Length == 0) {
-        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
-          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
-        UnicodeDecodingAlreadyDiagnosed = true;
-        C = *CurPtr++;
-        continue;
-      }
-      UnicodeDecodingAlreadyDiagnosed = false;
-      C = *(CurPtr += Length - 1);
-    }
+    // Loop to scan the remainder.
+    while (C != '/' && C != '\0')
+      C = *CurPtr++;
 
     if (C == '/') {
   FoundSlash:

diff  --git a/clang/test/Lexer/comment-invalid-utf8.c b/clang/test/Lexer/comment-invalid-utf8.c
deleted file mode 100644
index ed7405a3c079e..0000000000000
--- a/clang/test/Lexer/comment-invalid-utf8.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
-// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
-// nowarn-no-diagnostics
-
-// This file is purposefully encoded as windows-1252
-// be careful when modifying.
-
-//€
-// expected-warning@-1 {{invalid UTF-8 in comment}}
-
-// € ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž
-// expected-warning@-1 6{{invalid UTF-8 in comment}}
-
-/*€*/
-// expected-warning@-1 {{invalid UTF-8 in comment}}
-
-/*€ ‚ƒ„…†‡ˆ‰ Š ‹ Œ Ž*/
-// expected-warning@-1 6{{invalid UTF-8 in comment}}
-
-/*
-€
-*/
-// expected-warning@-2 {{invalid UTF-8 in comment}}
-
-// abcd
-// €abcd
-// expected-warning@-1 {{invalid UTF-8 in comment}}
-
-
-//Â§ Â§ Â§ ðŸ˜€ ä½ å¥½ Â©
-
-/*Â§ Â§ Â§ ðŸ˜€ ä½ å¥½ Â©*/
-
-/*
-Â§ Â§ Â§ ðŸ˜€ ä½ å¥½ Â©
-*/
-
-/* Â§ Â§ Â§ ðŸ˜€ ä½ å¥½ Â© */

diff  --git a/clang/test/SemaCXX/static-assert.cpp b/clang/test/SemaCXX/static-assert.cpp
index 2ac0dfdea9eae..5801320f305da 100644
--- a/clang/test/SemaCXX/static-assert.cpp
+++ b/clang/test/SemaCXX/static-assert.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu
 
 int f(); // expected-note {{declared here}}
 

diff  --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h
index 1e05cfe1f4241..662f3aca5b543 100644
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
 
 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
 
-unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
-
 unsigned getNumBytesForUTF8(UTF8 firstByte);
 
 /*************************************************************************/

diff  --git a/llvm/lib/Support/ConvertUTF.cpp b/llvm/lib/Support/ConvertUTF.cpp
index 25875d4c3184b..e24a918c5c898 100644
--- a/llvm/lib/Support/ConvertUTF.cpp
+++ b/llvm/lib/Support/ConvertUTF.cpp
@@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
     return isLegalUTF8(source, length);
 }
 
-/*
- * Exported function to return the size of the first utf-8 code unit sequence,
- * Or 0 if the sequence is not valid;
- */
-unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
-  int length = trailingBytesForUTF8[*source] + 1;
-  return (length < sourceEnd - source && isLegalUTF8(source, length)) ? length
-                                                                      : 0;
-}
-
 /* --------------------------------------------------------------------- */
 
 static unsigned