[clang] dbfe446 - [Clang] Implement CWG2640 Allow more characters in an n-char sequence
Corentin Jabot via cfe-commits
cfe-commits at lists.llvm.org
Tue Dec 13 00:02:59 PST 2022
Author: Corentin Jabot
Date: 2022-12-13T09:02:52+01:00
New Revision: dbfe446ef3b230e8d8421a6e79793fe6f405267f
URL: https://github.com/llvm/llvm-project/commit/dbfe446ef3b230e8d8421a6e79793fe6f405267f
DIFF: https://github.com/llvm/llvm-project/commit/dbfe446ef3b230e8d8421a6e79793fe6f405267f.diff
LOG: [Clang] Implement CWG2640 Allow more characters in an n-char sequence
Reviewed By: #clang-language-wg, aaron.ballman, tahonermann
Differential Revision: https://reviews.llvm.org/D138861
Added:
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Lex/Lexer.h
clang/lib/Lex/Lexer.cpp
clang/lib/Lex/LiteralSupport.cpp
clang/test/CXX/drs/dr26xx.cpp
clang/test/Lexer/char-escapes-delimited.c
clang/test/Lexer/unicode.c
clang/test/Preprocessor/ucn-pp-identifier.c
clang/www/cxx_dr_status.html
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3aef8c93f6ff..cad59b0ce5de 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -705,6 +705,7 @@ C++2b Feature Support
- Implemented "char8_t Compatibility and Portability Fix" (`P2513R3 <https://wg21.link/P2513R3>`_).
This change was applied to C++20 as a Defect Report.
- Implemented "Permitting static constexpr variables in constexpr functions" (`P2647R1 <https://wg21.link/P2647R1>_`).
+- Implemented `CWG2640 Allow more characters in an n-char sequence <https://wg21.link/CWG2640>`_.
CUDA/HIP Language Changes in Clang
----------------------------------
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index 748a112b7d57..35c6a7bdd5ca 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -772,7 +772,7 @@ class Lexer : public PreprocessorLexer {
llvm::Optional<uint32_t>
tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
- Token *Result);
+ const char *SlashLoc, Token *Result);
/// Read a universal character name.
///
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index fc0b42f4688c..3866c2c85f18 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1194,15 +1194,16 @@ static char GetTrigraphCharForLetter(char Letter) {
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
char Res = GetTrigraphCharForLetter(*CP);
- if (!Res || !L) return Res;
+ if (!Res)
+ return Res;
if (!Trigraphs) {
- if (!L->isLexingRawMode())
+ if (L && !L->isLexingRawMode())
L->Diag(CP-2, diag::trigraph_ignored);
return 0;
}
- if (!L->isLexingRawMode())
+ if (L && !L->isLexingRawMode())
L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
return Res;
}
@@ -3241,7 +3242,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
if (!Delimited)
break;
if (Diagnose)
- Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
+ Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
<< StringRef(KindLoc, 1);
return std::nullopt;
}
@@ -3260,7 +3261,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
if (Count == 0) {
if (Diagnose)
- Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
+ Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
: diag::warn_ucn_escape_no_digits)
<< StringRef(KindLoc, 1);
return std::nullopt;
@@ -3268,13 +3269,13 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
if (Delimited && Kind == 'U') {
if (Diagnose)
- Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
+ Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
return std::nullopt;
}
if (!Delimited && Count != NumHexDigits) {
if (Diagnose) {
- Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
+ Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
// If the user wrote \U1234, suggest a fixit to \u.
if (Count == 4 && NumHexDigits == 8) {
CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
@@ -3286,15 +3287,18 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
}
if (Delimited && PP) {
- Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
- ? diag::warn_cxx2b_delimited_escape_sequence
- : diag::ext_delimited_escape_sequence)
+ Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b
+ ? diag::warn_cxx2b_delimited_escape_sequence
+ : diag::ext_delimited_escape_sequence)
<< /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
}
if (Result) {
Result->setFlag(Token::HasUCN);
- if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
+ // If the UCN contains either a trigraph or a line splicing,
+ // we need to call getAndAdvanceChar again to set the appropriate flags
+ // on Result.
+ if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
StartPtr = CurPtr;
else
while (StartPtr != CurPtr)
@@ -3306,6 +3310,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
}
llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
+ const char *SlashLoc,
Token *Result) {
unsigned CharSize;
bool Diagnose = Result && !isLexingRawMode();
@@ -3319,7 +3324,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
C = getCharAndSize(CurPtr, CharSize);
if (C != '{') {
if (Diagnose)
- Diag(StartPtr, diag::warn_ucn_escape_incomplete);
+ Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
return std::nullopt;
}
CurPtr += CharSize;
@@ -3334,28 +3339,29 @@ llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
break;
}
- if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
+ if (isVerticalWhitespace(C))
break;
Buffer.push_back(C);
}
if (!FoundEndDelimiter || Buffer.empty()) {
if (Diagnose)
- Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
+ Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
: diag::warn_delimited_ucn_incomplete)
<< StringRef(KindLoc, 1);
return std::nullopt;
}
StringRef Name(Buffer.data(), Buffer.size());
- llvm::Optional<char32_t> Res =
+ llvm::Optional<char32_t> Match =
llvm::sys::unicode::nameToCodepointStrict(Name);
llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
- if (!Res) {
- if (!isLexingRawMode()) {
- Diag(StartPtr, diag::err_invalid_ucn_name)
- << StringRef(Buffer.data(), Buffer.size());
- LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
+ if (!Match) {
+ LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
+ if (Diagnose) {
+ Diag(StartName, diag::err_invalid_ucn_name)
+ << StringRef(Buffer.data(), Buffer.size())
+ << makeCharRange(*this, StartName, CurPtr - CharSize);
if (LooseMatch) {
Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
<< FixItHint::CreateReplacement(
@@ -3363,27 +3369,30 @@ llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
LooseMatch->Name);
}
}
- // When finding a match using Unicode loose matching rules
- // recover after having emitted a diagnostic.
- if (!LooseMatch)
- return std::nullopt;
// We do not offer misspelled character names suggestions here
// as the set of what would be a valid suggestion depends on context,
// and we should not make invalid suggestions.
}
- if (Diagnose && PP && !LooseMatch)
- Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
- ? diag::warn_cxx2b_delimited_escape_sequence
- : diag::ext_delimited_escape_sequence)
+ if (Diagnose && Match)
+ Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b
+ ? diag::warn_cxx2b_delimited_escape_sequence
+ : diag::ext_delimited_escape_sequence)
<< /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
- if (LooseMatch)
- Res = LooseMatch->CodePoint;
+ // If no diagnostic has been emitted yet, likely because we are doing a
+ // tentative lexing, we do not want to recover here to make sure the token
+ // will not be incorrectly considered valid. This function will be called
+ // again and a diagnostic emitted then.
+ if (LooseMatch && Diagnose)
+ Match = LooseMatch->CodePoint;
if (Result) {
Result->setFlag(Token::HasUCN);
- if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4))
+ // If the UCN contains either a trigraph or a line splicing,
+ // we need to call getAndAdvanceChar again to set the appropriate flags
+ // on Result.
+ if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
StartPtr = CurPtr;
else
while (StartPtr != CurPtr)
@@ -3391,7 +3400,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
} else {
StartPtr = CurPtr;
}
- return *Res;
+ return Match ? llvm::Optional<uint32_t>(*Match) : std::nullopt;
}
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
@@ -3403,7 +3412,7 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
if (Kind == 'u' || Kind == 'U')
CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
else if (Kind == 'N')
- CodePointOpt = tryReadNamedUCN(StartPtr, Result);
+ CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
if (!CodePointOpt)
return 0;
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 160240e49dd7..4ba4d94a600f 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -548,11 +548,10 @@ static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
return false;
}
ThisTokBuf++;
- const char *ClosingBrace =
- std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
- return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
- });
- bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
+ const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
+ return C == '}' || isVerticalWhitespace(C);
+ });
+ bool Incomplete = ClosingBrace == ThisTokEnd;
bool Empty = ClosingBrace == ThisTokBuf;
if (Incomplete || Empty) {
if (Diags) {
diff --git a/clang/test/CXX/drs/dr26xx.cpp b/clang/test/CXX/drs/dr26xx.cpp
index 6aa0e5053bc9..987904913897 100644
--- a/clang/test/CXX/drs/dr26xx.cpp
+++ b/clang/test/CXX/drs/dr26xx.cpp
@@ -59,6 +59,21 @@ void TemplUse() {
// dr2636: na
+namespace dr2640 { // dr2640: 16
+
+int \N{Λ} = 0; //expected-error {{'Λ' is not a valid Unicode character name}} \
+ //expected-error {{expected unqualified-id}}
+const char* emoji = "\N{🤡}"; // expected-error {{'🤡' is not a valid Unicode character name}} \
+ // expected-note 5{{did you mean}}
+
+#define z(x) 0
+#define dr2640_a z(
+int x = dr2640_a\N{abc}); // expected-error {{'abc' is not a valid Unicode character name}}
+int y = dr2640_a\N{LOTUS}); // expected-error {{character <U+1FAB7> not allowed in an identifier}} \
+ // expected-error {{use of undeclared identifier 'dr2640_a🪷'}} \
+ // expected-error {{extraneous ')' before ';'}}
+}
+
// dr2642: na
namespace dr2644 { // dr2644: yes
diff --git a/clang/test/Lexer/char-escapes-delimited.c b/clang/test/Lexer/char-escapes-delimited.c
index 43ade65a5830..8e7094bc2ca5 100644
--- a/clang/test/Lexer/char-escapes-delimited.c
+++ b/clang/test/Lexer/char-escapes-delimited.c
@@ -96,6 +96,11 @@ void named(void) {
unsigned i = u'\N{GREEK CAPITAL LETTER DELTA}'; // ext-warning {{extension}} cxx2b-warning {{C++2b}}
char j = '\NN'; // expected-error {{expected '{' after '\N' escape sequence}} expected-warning {{multi-character character constant}}
unsigned k = u'\N{LOTUS'; // expected-error {{incomplete universal character name}}
+
+ const char* emoji = "\N{🤡}"; // expected-error {{'🤡' is not a valid Unicode character name}} \
+ // expected-note 5{{did you mean}}
+ const char* nested = "\N{\N{SPARKLE}}"; // expected-error {{'\N{SPARKLE' is not a valid Unicode character name}} \
+ // expected-note 5{{did you mean}}
}
void separators(void) {
diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c
index 98a2e86aea55..d79a6ed50415 100644
--- a/clang/test/Lexer/unicode.c
+++ b/clang/test/Lexer/unicode.c
@@ -43,6 +43,7 @@ extern int \U00016AA2; // TANGSA LETTER GA - Added in Unicode 14
extern int \U0001E4D0; // 𞓐 NAG MUNDARI LETTER O - Added in Unicode 15
extern int _\N{TANGSA LETTER GA};
extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \
+ // expected-error {{expected ';' after top level declarator}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c
index ec61537898af..8d30a6a2bb23 100644
--- a/clang/test/Preprocessor/ucn-pp-identifier.c
+++ b/clang/test/Preprocessor/ucn-pp-identifier.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef
-// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef
-// RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++2b -pedantic -ftrigraphs -verify=expected,cxx2b -Wundef -Wpre-c++2b-compat
+// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1
+// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -fno-trigraphs
+// RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++2b -pedantic -ftrigraphs -DTRIGRAPHS=1 -verify=expected,cxx2b -Wundef -Wpre-c++2b-compat
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -ftrigraphs -DTRIGRAPHS=1
// RUN: not %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -Wundef 2>&1 | FileCheck -strict-whitespace %s
@@ -40,7 +40,6 @@
// ext-warning {{extension}} cxx2b-warning {{before C++2b}}
#define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \
// ext-warning {{extension}} cxx2b-warning {{before C++2b}}
-
#define a\u0024
#if \u0110 // expected-warning {{is not defined, evaluates to 0}}
@@ -121,20 +120,39 @@ C 1
#define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}}
#define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
-#define \N{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}} expected-error {{macro name must be an identifier}}
+#define \N{
+// expected-warning@-1 {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}}
+// expected-error@-2 {{macro name must be an identifier}}
#define \N{} // expected-warning {{empty delimited universal character name; treating as '\' 'N' '{' '}'}} expected-error {{macro name must be an identifier}}
#define \N{NOTATHING} // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
// expected-error {{macro name must be an identifier}}
#define \NN // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}}
#define \N{GREEK_SMALL-LETTERALPHA} // expected-error {{'GREEK_SMALL-LETTERALPHA' is not a valid Unicode character name}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
+#define \N{🤡} // expected-error {{'🤡' is not a valid Unicode character name}} \
+ // expected-error {{macro name must be an identifier}}
#define CONCAT(A, B) A##B
-int CONCAT(\N{GREEK, CAPITALLETTERALPHA}); // expected-error{{expected}} \
- // expected-warning {{incomplete delimited universal character name}}
+int CONCAT(\N{GREEK
+, CAPITALLETTERALPHA});
+// expected-error@-2 {{expected}} \
+// expected-warning@-2 {{incomplete delimited universal character name}}
+
+int \N{\
+LATIN CAPITAL LETTER A WITH GRAVE};
+//ext-warning@-2 {{extension}} cxx2b-warning@-2 {{before C++2b}}
#ifdef TRIGRAPHS
-int \N??<GREEK CAPITAL LETTER ALPHA??> = 0; // expected-warning{{extension}} cxx2b-warning {{before C++2b}} \
+int \N??<GREEK CAPITAL LETTER ALPHA??> = 0; // cxx2b-warning {{before C++2b}} \
+ //ext-warning {{extension}}\
// expected-warning 2{{trigraph converted}}
+int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>; // expected-warning {{trigraph converted}}
+#endif
+
+#ifndef TRIGRAPHS
+int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>;
+// expected-warning@-1 {{trigraph ignored}}\
+// expected-warning@-1 {{incomplete}}\
+// expected-error@-1 {{expected ';' after top level declarator}}
#endif
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index e171ef442e7b..11c90cc9e940 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -15647,7 +15647,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
<td><a href="https://wg21.link/cwg2640">2640</a></td>
<td>accepted</td>
<td>Allow more characters in an n-char sequence</td>
- <td class="none" align="center">Unknown</td>
+ <td class="unreleased" align="center">Clang 16</td>
</tr>
<tr id="2641">
<td><a href="https://wg21.link/cwg2641">2641</a></td>
More information about the cfe-commits
mailing list