[llvm] a262f4d - Revert "[Clang] Add a warning on invalid UTF-8 in comments."
Jonas Devlieghere via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 12 15:22:35 PDT 2022
Author: Jonas Devlieghere
Date: 2022-07-12T15:22:29-07:00
New Revision: a262f4dbd78fc68609d230f3e9c5ca2b1d1d9437
URL: https://github.com/llvm/llvm-project/commit/a262f4dbd78fc68609d230f3e9c5ca2b1d1d9437
DIFF: https://github.com/llvm/llvm-project/commit/a262f4dbd78fc68609d230f3e9c5ca2b1d1d9437.diff
LOG: Revert "[Clang] Add a warning on invalid UTF-8 in comments."
This reverts commit cc309721d20c8e544ae7a10a66735ccf4981a11c because it
breaks the following tests on GreenDragon:
TestDataFormatterObjCCF.py
TestDataFormatterObjCExpr.py
TestDataFormatterObjCKVO.py
TestDataFormatterObjCNSBundle.py
TestDataFormatterObjCNSData.py
TestDataFormatterObjCNSError.py
TestDataFormatterObjCNSNumber.py
TestDataFormatterObjCNSURL.py
TestDataFormatterObjCPlain.py
TestDataFormatterObjNSException.py
https://green.lab.llvm.org/green/view/LLDB/job/lldb-cmake/45288/
Added:
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/DiagnosticLexKinds.td
clang/lib/Lex/Lexer.cpp
clang/test/SemaCXX/static-assert.cpp
llvm/include/llvm/Support/ConvertUTF.h
llvm/lib/Support/ConvertUTF.cpp
Removed:
clang/test/Lexer/comment-invalid-utf8.c
clang/test/Lexer/comment-utf8.c
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f8977d5ac720b..e09a4a7c91b78 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -284,11 +284,9 @@ Improvements to Clang's diagnostics
unevaluated operands of a ``typeid`` expression, as they are now
modeled correctly in the CFG. This fixes
`Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_.
-- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will
+- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will
suggest a fix if the decl being assigned is a parameter that shadows a data
member of the contained class.
-- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in
- comments.
Non-comprehensive list of changes in this release
-------------------------------------------------
@@ -615,7 +613,7 @@ AST Matchers
- Added ``forEachTemplateArgument`` matcher which creates a match every
time a ``templateArgument`` matches the matcher supplied to it.
-
+
- Added ``objcStringLiteral`` matcher which matches ObjectiveC String
literal expressions.
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 38ee022e5f04c..ac86076140c58 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning<
// Unicode and UCNs
def err_invalid_utf8 : Error<
"source file is not valid UTF-8">;
-def warn_invalid_utf8_in_comment : Extension<
- "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>;
def err_character_not_allowed : Error<
"unexpected character <U+%0>">;
def err_character_not_allowed_identifier : Error<
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 221ec2721fe00..6820057642bea 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
//
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
// character that ends the line comment.
-
- // C++23 [lex.phases] p1
- // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
- // diagnostic only once per entire ill-formed subsequence to avoid
- // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
- bool UnicodeDecodingAlreadyDiagnosed = false;
-
char C;
while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
- while (isASCII(C) && C != 0 && // Potentially EOF.
- C != '\n' && C != '\r') { // Newline or DOS-style newline.
+ while (C != 0 && // Potentially EOF.
+ C != '\n' && C != '\r') // Newline or DOS-style newline.
C = *++CurPtr;
- UnicodeDecodingAlreadyDiagnosed = false;
- }
-
- if (!isASCII(C)) {
- unsigned Length = llvm::getUTF8SequenceSize(
- (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
- if (Length == 0) {
- if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
- Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
- UnicodeDecodingAlreadyDiagnosed = true;
- ++CurPtr;
- } else {
- UnicodeDecodingAlreadyDiagnosed = false;
- CurPtr += Length;
- }
- continue;
- }
const char *NextLine = CurPtr;
if (C != 0) {
@@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
if (C == '/')
C = *CurPtr++;
- // C++23 [lex.phases] p1
- // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
- // diagnostic only once per entire ill-formed subsequence to avoid
- // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
- bool UnicodeDecodingAlreadyDiagnosed = false;
-
while (true) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
@@ -2703,22 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
// doesn't check for '\0'.
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
// While not aligned to a 16-byte boundary.
- while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
- if (!isASCII(C))
- goto MultiByteUTF8;
+ while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
C = *CurPtr++;
- }
+
if (C == '/') goto FoundSlash;
#ifdef __SSE2__
__m128i Slashes = _mm_set1_epi8('/');
- while (CurPtr + 16 < BufferEnd) {
- int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
- if (LLVM_UNLIKELY(Mask != 0)) {
- CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
- goto MultiByteUTF8;
- }
- // look for slashes
+ while (CurPtr+16 <= BufferEnd) {
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
Slashes));
if (cmp != 0) {
@@ -2731,38 +2693,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
CurPtr += 16;
}
#elif __ALTIVEC__
- __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80};
__vector unsigned char Slashes = {
'/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/'
};
- while (CurPtr + 16 < BufferEnd) {
- if (LLVM_UNLIKELY(
- vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
- goto MultiByteUTF8;
- if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
- break;
- }
+ while (CurPtr + 16 <= BufferEnd &&
+ !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
CurPtr += 16;
- }
-
#else
- while (CurPtr + 16 < BufferEnd) {
- bool HasNonASCII = false;
- for (unsigned I = 0; I < 16; ++I)
- HasNonASCII |= !isASCII(CurPtr[I]);
-
- if (LLVM_UNLIKELY(HasNonASCII))
- goto MultiByteUTF8;
-
- bool HasSlash = false;
- for (unsigned I = 0; I < 16; ++I)
- HasSlash |= CurPtr[I] == '/';
- if (HasSlash)
- break;
- CurPtr += 16;
+ // Scan for '/' quickly. Many block comments are very large.
+ while (CurPtr[0] != '/' &&
+ CurPtr[1] != '/' &&
+ CurPtr[2] != '/' &&
+ CurPtr[3] != '/' &&
+ CurPtr+4 < BufferEnd) {
+ CurPtr += 4;
}
#endif
@@ -2770,30 +2715,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
C = *CurPtr++;
}
- // Loop to scan the remainder, warning on invalid UTF-8
- // if the corresponding warning is enabled, emitting a diagnostic only once
- // per sequence that cannot be decoded.
- while (C != '/' && C != '\0') {
- if (isASCII(C)) {
- UnicodeDecodingAlreadyDiagnosed = false;
- C = *CurPtr++;
- continue;
- }
- MultiByteUTF8:
- // CurPtr is 1 code unit past C, so to decode
- // the codepoint, we need to read from the previous position.
- unsigned Length = llvm::getUTF8SequenceSize(
- (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
- if (Length == 0) {
- if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
- Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
- UnicodeDecodingAlreadyDiagnosed = true;
- } else {
- UnicodeDecodingAlreadyDiagnosed = false;
- CurPtr += Length - 1;
- }
+ // Loop to scan the remainder.
+ while (C != '/' && C != '\0')
C = *CurPtr++;
- }
if (C == '/') {
FoundSlash:
diff --git a/clang/test/Lexer/comment-invalid-utf8.c b/clang/test/Lexer/comment-invalid-utf8.c
deleted file mode 100644
index b8bf551dd8564..0000000000000
--- a/clang/test/Lexer/comment-invalid-utf8.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected
-// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn
-// nowarn-no-diagnostics
-
-// This file is purposefully encoded as windows-1252
-// be careful when modifying.
-
-//
-// expected-warning@-1 {{invalid UTF-8 in comment}}
-
-//
-// expected-warning@-1 6{{invalid UTF-8 in comment}}
-
-/**/
-// expected-warning@-1 {{invalid UTF-8 in comment}}
-
-/*
*/
-// expected-warning@-1 6{{invalid UTF-8 in comment}}
-
-/*
-
-*/
-// expected-warning@-2 {{invalid UTF-8 in comment}}
-
-// abcd
-// abcd
-// expected-warning@-1 {{invalid UTF-8 in comment}}
diff --git a/clang/test/Lexer/comment-utf8.c b/clang/test/Lexer/comment-utf8.c
deleted file mode 100644
index 87f2d1375d4c7..0000000000000
--- a/clang/test/Lexer/comment-utf8.c
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify
-// expected-no-diagnostics
-
-
-//§ § § 😀 你好 ©
-
-/*§ § § 😀 你好 ©*/
-
-/*
-§ § § 😀 你好 ©©©
-*/
-
-/* § § § 😀 你好 © */
-/*
- a longer comment to exerce the vectorized code path
- ----------------------------------------------------
- αααααααααααααααααααααα // here is some unicode
- ----------------------------------------------------
- ----------------------------------------------------
-*/
diff --git a/clang/test/SemaCXX/static-assert.cpp b/clang/test/SemaCXX/static-assert.cpp
index 2ac0dfdea9eae..5801320f305da 100644
--- a/clang/test/SemaCXX/static-assert.cpp
+++ b/clang/test/SemaCXX/static-assert.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu
int f(); // expected-note {{declared here}}
diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h
index 1e05cfe1f4241..662f3aca5b543 100644
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
-unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
-
unsigned getNumBytesForUTF8(UTF8 firstByte);
/*************************************************************************/
diff --git a/llvm/lib/Support/ConvertUTF.cpp b/llvm/lib/Support/ConvertUTF.cpp
index 5436f557b993d..e24a918c5c898 100644
--- a/llvm/lib/Support/ConvertUTF.cpp
+++ b/llvm/lib/Support/ConvertUTF.cpp
@@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
return isLegalUTF8(source, length);
}
-/*
- * Exported function to return the size of the first utf-8 code unit sequence,
- * Or 0 if the sequence is not valid;
- */
-unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
- int length = trailingBytesForUTF8[*source] + 1;
- return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
- : 0;
-}
-
/* --------------------------------------------------------------------- */
static unsigned
More information about the llvm-commits
mailing list