[clang] 3eb67d2 - [Clang] Handle non-ASCII after line splicing
Corentin Jabot via cfe-commits
cfe-commits at lists.llvm.org
Wed Sep 6 14:20:06 PDT 2023
Author: Corentin Jabot
Date: 2023-09-06T23:20:00+02:00
New Revision: 3eb67d28dee5c27f5db24a1b370f00a1a2cb456d
URL: https://github.com/llvm/llvm-project/commit/3eb67d28dee5c27f5db24a1b370f00a1a2cb456d
DIFF: https://github.com/llvm/llvm-project/commit/3eb67d28dee5c27f5db24a1b370f00a1a2cb456d.diff
LOG: [Clang] Handle non-ASCII after line splicing
int a\
ス;
Failed to be parsed as a valid identifier.
Fixes #65156
Reviewed By: tahonermann
Differential Revision: https://reviews.llvm.org/D159345
Added:
clang/test/Lexer/escape_newline_unicode.c
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Lex/Lexer.h
clang/lib/Lex/Lexer.cpp
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 142d534f55a042..271e0852781bc9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -214,6 +214,8 @@ Bug Fixes in This Version
(`#64987 <https://github.com/llvm/llvm-project/issues/64987>`_)
- Support MSVC predefined macro expressions in constant expressions and in
local structs.
+- Correctly parse non-ascii identifiers that appear immediately after a line splicing
+ (`#65156 <https://github.com/llvm/llvm-project/issues/65156>`_)
Bug Fixes to Compiler Builtins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index 98d34b783f0847..ac0ef14c591bdd 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -805,9 +805,10 @@ class Lexer : public PreprocessorLexer {
/// Try to consume an identifier character encoded in UTF-8.
/// \param CurPtr Points to the start of the (potential) UTF-8 code unit
/// sequence. On success, updated to point past the end of it.
+ /// \param Result The token being formed.
/// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
/// character was lexed, \c false otherwise.
- bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
+ bool tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result);
};
} // namespace clang
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 74a02ca4f81f8f..37c3e4175d4736 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1750,15 +1750,21 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
return true;
}
-bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
- const char *UnicodePtr = CurPtr;
+bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
llvm::UTF32 CodePoint;
- llvm::ConversionResult Result =
- llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
- (const llvm::UTF8 *)BufferEnd,
- &CodePoint,
- llvm::strictConversion);
- if (Result != llvm::conversionOK)
+
+ // If a UTF-8 codepoint appears immediately after an escaped new line,
+ // CurPtr may point to the splicing \ on the preceding line,
+ // so we need to skip it.
+ unsigned FirstCodeUnitSize;
+ getCharAndSize(CurPtr, FirstCodeUnitSize);
+ const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
+ const char *UnicodePtr = CharStart;
+
+ llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
+ (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
+ &CodePoint, llvm::strictConversion);
+ if (ConvResult != llvm::conversionOK)
return false;
bool IsExtension = false;
@@ -1771,21 +1777,26 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
!PP->isPreprocessedOutput())
diagnoseInvalidUnicodeCodepointInIdentifier(
PP->getDiagnostics(), LangOpts, CodePoint,
- makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
+ makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
// We got a unicode codepoint that is neither a space nor
// a valid identifier part. Carry on as if the codepoint was
// valid for recovery purposes.
} else if (!isLexingRawMode()) {
if (IsExtension)
- diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
- makeCharRange(*this, CurPtr, UnicodePtr));
+ diagnoseExtensionInIdentifier(
+ PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, CharStart, UnicodePtr));
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
- makeCharRange(*this, CurPtr, UnicodePtr),
+ makeCharRange(*this, CharStart, UnicodePtr),
/*IsFirst=*/false);
maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
- makeCharRange(*this, CurPtr, UnicodePtr));
+ makeCharRange(*this, CharStart, UnicodePtr));
}
+ // Once we successfully parsed some UTF-8,
+ // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
+ // being lexed, and that warnings about trailing spaces are emitted.
+ ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
CurPtr = UnicodePtr;
return true;
}
@@ -1865,7 +1876,7 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
}
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
continue;
- if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+ if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
continue;
// Neither an expected Unicode codepoint nor a UCN.
break;
@@ -1985,7 +1996,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
return LexNumericConstant(Result, CurPtr);
- if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+ if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
return LexNumericConstant(Result, CurPtr);
// Update the location of token as well as BufferPtr.
@@ -2009,7 +2020,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
if (!isAsciiIdentifierStart(C)) {
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
Consumed = true;
- else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+ else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
Consumed = true;
else
return CurPtr;
@@ -2079,7 +2090,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
if (isAsciiIdentifierContinue(C)) {
CurPtr = ConsumeChar(CurPtr, Size, Result);
} else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
- } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
+ } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
} else
break;
}
diff --git a/clang/test/Lexer/escape_newline_unicode.c b/clang/test/Lexer/escape_newline_unicode.c
new file mode 100644
index 00000000000000..81a6429df48418
--- /dev/null
+++ b/clang/test/Lexer/escape_newline_unicode.c
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -verify=expected,c -x c -Wunused %s
+// RUN: %clang_cc1 -verify=expected,cpp -x c++ -Wunused %s
+
+void gh65156(void) {
+
+int a\
+ス = 42;
+// expected-warning@-2 {{unused variable 'aス'}}
+
+int b\
+\
+ス = 42;
+// expected-warning@-2 {{backslash and newline separated by space}}
+// expected-warning@-4 {{backslash and newline separated by space}}
+// expected-warning@-5 {{unused variable 'bス'}}
+
+int ス\
+ス = 42;
+// expected-warning@-2 {{unused variable 'スス'}}
+
+int \
+ス = 42;
+// expected-warning@-2 {{unused variable 'ス'}}
+
+}
+
+void gh65156_err(void) {
+
+int \
+❌ = 0;
+// cpp-error@-2 {{expected unqualified-id}}
+// c-error@-3 {{expected identifier}}
+
+
+int a\
+❌ = 0;
+// expected-error@-1 {{character <U+274C> not allowed in an identifier}}
+}
More information about the cfe-commits
mailing list