[clang] Improve diagnostics for invalid named-universal-characters (PR #206326)
Jan Schultke via cfe-commits
cfe-commits at lists.llvm.org
Tue Jun 30 01:14:22 PDT 2026
https://github.com/eisenwave updated https://github.com/llvm/llvm-project/pull/206326
>From e2a20d710a1de9b67bcefe162ec3034249722c35 Mon Sep 17 00:00:00 2001
From: Eisenwave <me at eisenwave.net>
Date: Sun, 28 Jun 2026 14:08:36 +0200
Subject: [PATCH 1/5] [clang] Improve diagnostics for invalid
named-universal-characters
1. Fix typo in note_invalid_ucn_name_loose_matching message.
2. Fix unprintable characters appearing in diagnostic messages.
3. Stop offering low-value fix suggestions when illegal characters appear in the name.
---
.../include/clang/Basic/DiagnosticLexKinds.td | 6 +-
clang/lib/Lex/LiteralSupport.cpp | 60 ++++++++++++++++---
clang/test/CXX/drs/cwg26xx.cpp | 6 +-
clang/test/Lexer/char-escapes-delimited.c | 10 +++-
clang/test/Lexer/unicode.c | 2 +-
clang/test/Preprocessor/ucn-pp-identifier.c | 2 +-
6 files changed, 65 insertions(+), 21 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 383bf1a7fdb3f..decfd6c781dbf 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -170,9 +170,11 @@ def err_hex_escape_no_digits : Error<
def err_invalid_ucn_name : Error<
"'%0' is not a valid Unicode character name">;
def note_invalid_ucn_name_loose_matching : Note<
- "characters names in Unicode escape sequences are sensitive to case and whitespaces">;
+ "character names in Unicode escape sequences are sensitive to case and whitespaces">;
def note_invalid_ucn_name_candidate : Note<
- "did you mean %0 ('%2' U+%1)?">;
+ "did you mean %0 (%1)?">;
+def note_invalid_ucn_name_character : Note<
+ "character %0 cannot appear in a Unicode character name">;
def warn_ucn_escape_no_digits : Warning<
"\\%0 used with no following hex digits; "
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 482146ccf8654..e941e5eecbe8a 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -539,6 +539,12 @@ static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
return !HasError;
}
+static bool AllowedInCharacterName(llvm::UTF32 CodePoint) {
+ return (CodePoint >= U'A' && CodePoint <= U'Z') ||
+ (CodePoint >= U'0' && CodePoint <= U'9') || CodePoint == U'-' ||
+ CodePoint == U' ';
+}
+
static void DiagnoseInvalidUnicodeCharacterName(
DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
@@ -550,6 +556,44 @@ static void DiagnoseInvalidUnicodeCharacterName(
namespace u = llvm::sys::unicode;
+ auto StringifyCodePoint = [](llvm::UTF32 CodePoint) -> llvm::SmallString<16> {
+ llvm::SmallString<16> Result;
+ if (u::isPrintable(CodePoint)) {
+ std::string CharUTF8;
+ llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&CodePoint, 1),
+ CharUTF8);
+ Result.append("'");
+ Result.append(CharUTF8);
+ Result.append("' U+");
+ } else {
+ Result.append("U+");
+ }
+ llvm::raw_svector_ostream OS(Result);
+ llvm::write_hex(OS, CodePoint, llvm::HexPrintStyle::Upper, 4);
+ return Result;
+ };
+
+ bool HasIllegalCharacter = false;
+ for (const char *P = Name.begin(), *E = Name.end(); P != E;) {
+ const auto *Src = reinterpret_cast<const llvm::UTF8 *>(P);
+ const auto *SrcEnd = reinterpret_cast<const llvm::UTF8 *>(E);
+ llvm::UTF32 CodePoint = 0;
+ if (llvm::convertUTF8Sequence(&Src, SrcEnd, &CodePoint,
+ llvm::strictConversion) != llvm::conversionOK)
+ break;
+ if (AllowedInCharacterName(CodePoint)) {
+ P = reinterpret_cast<const char *>(Src);
+ continue;
+ }
+ SourceLocation CharLoc = Lexer::AdvanceToTokenCharacter(
+ Loc, (TokRangeBegin - TokBegin) + (P - Name.begin()), Loc.getManager(),
+ Features);
+ Diags->Report(CharLoc, diag::note_invalid_ucn_name_character)
+ << StringifyCodePoint(CodePoint);
+ HasIllegalCharacter = true;
+ break;
+ }
+
std::optional<u::LooseMatchingResult> Res =
u::nameToCodepointLooseMatching(Name);
if (Res) {
@@ -562,6 +606,12 @@ static void DiagnoseInvalidUnicodeCharacterName(
return;
}
+ // Providing illegal characters suggests a fundamental misuse of the feature,
+ // like providing emoji in \N{}. Offering alternative suggestions is often
+ // unhelpful in that scenario.
+ if (HasIllegalCharacter)
+ return;
+
unsigned Distance = 0;
SmallVector<u::MatchForCodepointName> Matches =
u::nearestMatchesForCodepointName(Name, 5);
@@ -576,17 +626,9 @@ static void DiagnoseInvalidUnicodeCharacterName(
break;
Distance = Match.Distance;
- std::string Str;
- llvm::UTF32 V = Match.Value;
- bool Converted =
- llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
- (void)Converted;
- assert(Converted && "Found a match wich is not a unicode character");
-
Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
diag::note_invalid_ucn_name_candidate)
- << Match.Name << llvm::utohexstr(Match.Value)
- << Str // FIXME: Fix the rendering of non printable characters
+ << Match.Name << StringifyCodePoint(Match.Value)
<< FixItHint::CreateReplacement(
MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
TokRangeEnd),
diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp
index 45653743ae574..6a15513879bbd 100644
--- a/clang/test/CXX/drs/cwg26xx.cpp
+++ b/clang/test/CXX/drs/cwg26xx.cpp
@@ -214,11 +214,7 @@ int \N{Λ} = 0;
// expected-error at -2 {{expected unqualified-id}}
const char* emoji = "\N{🤡}";
// expected-error at -1 {{'🤡' is not a valid Unicode character name}}
-// expected-note at -2 {{did you mean OX ('🐂' U+1F402)?}}
-// expected-note at -3 {{did you mean ANT ('🐜' U+1F41C)?}}
-// expected-note at -4 {{did you mean ARC ('⌒' U+2312)?}}
-// expected-note at -5 {{did you mean AXE ('🪓' U+1FA93)?}}
-// expected-note at -6 {{did you mean BAT ('🦇' U+1F987)?}}
+// expected-note at -2 {{character '🤡' U+1F921 cannot appear in a Unicode character name}}
#define z(x) 0
#define cwg2640_a z(
diff --git a/clang/test/Lexer/char-escapes-delimited.c b/clang/test/Lexer/char-escapes-delimited.c
index 7a8986bc5f867..1c20456701d75 100644
--- a/clang/test/Lexer/char-escapes-delimited.c
+++ b/clang/test/Lexer/char-escapes-delimited.c
@@ -82,7 +82,8 @@ void named(void) {
char b = '\N{DOLLAR SIGN}'; // ext-warning {{extension}} cxx23-warning {{C++23}}
char b_ = '\N{ DOL-LAR _SIGN }'; // expected-error {{' DOL-LAR _SIGN ' is not a valid Unicode character name}} \
- // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
+ // expected-note {{character names in Unicode escape sequences are sensitive to case and whitespaces}} \
+ // expected-note {{character '_' U+005F cannot appear in a Unicode character name}}
char c = '\N{NOTATHING}'; // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
// expected-note 5{{did you mean}}
@@ -100,9 +101,12 @@ void named(void) {
unsigned k = u'\N{LOTUS'; // expected-error {{incomplete universal character name}}
const char* emoji = "\N{🤡}"; // expected-error {{'🤡' is not a valid Unicode character name}} \
- // expected-note 5{{did you mean}}
+ // expected-note {{character '🤡' U+1F921 cannot appear in a Unicode character name}}
const char* nested = "\N{\N{SPARKLE}}"; // expected-error {{'\N{SPARKLE' is not a valid Unicode character name}} \
- // expected-note 5{{did you mean}}
+ // expected-note {{cannot appear in a Unicode character name}}
+ const char* line_feed = "\N{LINE FEE}"; // expected-error {{'LINE FEE' is not a valid Unicode character name}} \
+ // expected-note {{did you mean LINE FEED (U+000A)}} \
+ // expected-note 4{{did you mean}}
}
void separators(void) {
diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c
index e0489e11b9da9..2d1edc77a1550 100644
--- a/clang/test/Lexer/unicode.c
+++ b/clang/test/Lexer/unicode.c
@@ -49,7 +49,7 @@ extern int \u{16D80}; // CHISOI LETTER A - Added in Unicode 18.0
extern int a\N{TANGSA LETTER GA};
extern int a\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \
// expected-error {{expected ';' after top level declarator}} \
- // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
+ // expected-note {{character names in Unicode escape sequences are sensitive to case and whitespace}}
extern int 𝛛; // expected-warning {{mathematical notation character <U+1D6DB> in an identifier is a Clang extension}}
extern int ₉; // expected-error {{character <U+2089> not allowed at the start of an identifier}} \\
diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c
index 5efcfe48f638a..ee008a73eb882 100644
--- a/clang/test/Preprocessor/ucn-pp-identifier.c
+++ b/clang/test/Preprocessor/ucn-pp-identifier.c
@@ -131,7 +131,7 @@ C 1
// expected-error {{macro name must be an identifier}}
#define \NN // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}}
#define \N{GREEK_SMALL-LETTERALPHA} // expected-error {{'GREEK_SMALL-LETTERALPHA' is not a valid Unicode character name}} \
- // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
+ // expected-note {{character names in Unicode escape sequences are sensitive to case and whitespaces}}
#define \N{🤡} // expected-error {{'🤡' is not a valid Unicode character name}} \
// expected-error {{macro name must be an identifier}}
>From e8edc90cb9edd8aaaf0e1b5c7824a144e666b1ad Mon Sep 17 00:00:00 2001
From: Eisenwave <me at eisenwave.net>
Date: Tue, 30 Jun 2026 06:28:03 +0200
Subject: [PATCH 2/5] Avoid unconditional UTF-8 decode, make
DisplayCodePointForDiagnostic function
---
clang/include/clang/Basic/Diagnostic.h | 2 ++
clang/lib/Basic/Diagnostic.cpp | 20 +++++++++++++++
clang/lib/Lex/LiteralSupport.cpp | 35 ++++++--------------------
3 files changed, 30 insertions(+), 27 deletions(-)
diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h
index 826b747f2c751..c033320687078 100644
--- a/clang/include/clang/Basic/Diagnostic.h
+++ b/clang/include/clang/Basic/Diagnostic.h
@@ -28,6 +28,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConvertUTF.h"
#include <cassert>
#include <cstdint>
#include <limits>
@@ -1876,6 +1877,7 @@ void ProcessWarningOptions(DiagnosticsEngine &Diags,
const DiagnosticOptions &Opts,
llvm::vfs::FileSystem &VFS, bool ReportDiags = true);
void EscapeStringForDiagnostic(StringRef Str, SmallVectorImpl<char> &OutStr);
+llvm::SmallString<16> DisplayCodePointForDiagnostic(llvm::UTF32 CodePoint);
} // namespace clang
#endif // LLVM_CLANG_BASIC_DIAGNOSTIC_H
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 4802478c379bb..a4d1efab6437f 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -1085,6 +1085,26 @@ void clang::EscapeStringForDiagnostic(StringRef Str,
}
}
+/// DisplayCodePointForDiagnostic - Display CodePoint in U+NNNN notation,
+/// optionally prepending the CodePoint itself if it is printable.
+llvm::SmallString<16>
+clang::DisplayCodePointForDiagnostic(llvm::UTF32 CodePoint) {
+ llvm::SmallString<16> Result;
+ if (llvm::sys::unicode::isPrintable(CodePoint)) {
+ std::string CharUTF8;
+ llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&CodePoint, 1),
+ CharUTF8);
+ Result.append("'");
+ Result.append(CharUTF8);
+ Result.append("' U+");
+ } else {
+ Result.append("U+");
+ }
+ llvm::raw_svector_ostream OS(Result);
+ llvm::write_hex(OS, CodePoint, llvm::HexPrintStyle::Upper, 4);
+ return Result;
+}
+
void Diagnostic::FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
SmallVectorImpl<char> &OutStr) const {
// When the diagnostic string is only "%0", the entire string is being given
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index e941e5eecbe8a..193a23eab75c6 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -539,10 +539,8 @@ static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
return !HasError;
}
-static bool AllowedInCharacterName(llvm::UTF32 CodePoint) {
- return (CodePoint >= U'A' && CodePoint <= U'Z') ||
- (CodePoint >= U'0' && CodePoint <= U'9') || CodePoint == U'-' ||
- CodePoint == U' ';
+static bool AllowedInCharacterName(char C) {
+ return (C >= 'A' && C < 'Z') || (C >= '0' && C < '9') || C == '-' || C == ' ';
}
static void DiagnoseInvalidUnicodeCharacterName(
@@ -556,40 +554,23 @@ static void DiagnoseInvalidUnicodeCharacterName(
namespace u = llvm::sys::unicode;
- auto StringifyCodePoint = [](llvm::UTF32 CodePoint) -> llvm::SmallString<16> {
- llvm::SmallString<16> Result;
- if (u::isPrintable(CodePoint)) {
- std::string CharUTF8;
- llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&CodePoint, 1),
- CharUTF8);
- Result.append("'");
- Result.append(CharUTF8);
- Result.append("' U+");
- } else {
- Result.append("U+");
- }
- llvm::raw_svector_ostream OS(Result);
- llvm::write_hex(OS, CodePoint, llvm::HexPrintStyle::Upper, 4);
- return Result;
- };
-
bool HasIllegalCharacter = false;
for (const char *P = Name.begin(), *E = Name.end(); P != E;) {
+ if (AllowedInCharacterName(*P)) {
+ ++P;
+ continue;
+ }
const auto *Src = reinterpret_cast<const llvm::UTF8 *>(P);
const auto *SrcEnd = reinterpret_cast<const llvm::UTF8 *>(E);
llvm::UTF32 CodePoint = 0;
if (llvm::convertUTF8Sequence(&Src, SrcEnd, &CodePoint,
llvm::strictConversion) != llvm::conversionOK)
break;
- if (AllowedInCharacterName(CodePoint)) {
- P = reinterpret_cast<const char *>(Src);
- continue;
- }
SourceLocation CharLoc = Lexer::AdvanceToTokenCharacter(
Loc, (TokRangeBegin - TokBegin) + (P - Name.begin()), Loc.getManager(),
Features);
Diags->Report(CharLoc, diag::note_invalid_ucn_name_character)
- << StringifyCodePoint(CodePoint);
+ << DisplayCodePointForDiagnostic(CodePoint);
HasIllegalCharacter = true;
break;
}
@@ -628,7 +609,7 @@ static void DiagnoseInvalidUnicodeCharacterName(
Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
diag::note_invalid_ucn_name_candidate)
- << Match.Name << StringifyCodePoint(Match.Value)
+ << Match.Name << DisplayCodePointForDiagnostic(Match.Value)
<< FixItHint::CreateReplacement(
MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
TokRangeEnd),
>From f8e310fe6c42b59b5d43aac07946fc5c509124b9 Mon Sep 17 00:00:00 2001
From: Eisenwave <me at eisenwave.net>
Date: Tue, 30 Jun 2026 07:22:26 +0200
Subject: [PATCH 3/5] Implement review suggestions, fix wrong ranges in
allowedInCharacterName
---
clang/lib/Basic/Diagnostic.cpp | 4 ++--
clang/lib/Lex/LiteralSupport.cpp | 7 ++++---
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index a4d1efab6437f..7165eb734c4df 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -1085,8 +1085,8 @@ void clang::EscapeStringForDiagnostic(StringRef Str,
}
}
-/// DisplayCodePointForDiagnostic - Display CodePoint in U+NNNN notation,
-/// optionally prepending the CodePoint itself if it is printable.
+/// Displays CodePoint in U+NNNN notation, optionally prepending the quoted
+/// CodePoint itself if printable.
llvm::SmallString<16>
clang::DisplayCodePointForDiagnostic(llvm::UTF32 CodePoint) {
llvm::SmallString<16> Result;
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 193a23eab75c6..72a65dd156b19 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -539,8 +539,9 @@ static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
return !HasError;
}
-static bool AllowedInCharacterName(char C) {
- return (C >= 'A' && C < 'Z') || (C >= '0' && C < '9') || C == '-' || C == ' ';
+static bool allowedInCharacterName(char C) {
+ return (C >= 'A' && C <= 'Z') || (C >= '0' && C <= '9') || C == '-' ||
+ C == ' ';
}
static void DiagnoseInvalidUnicodeCharacterName(
@@ -556,7 +557,7 @@ static void DiagnoseInvalidUnicodeCharacterName(
bool HasIllegalCharacter = false;
for (const char *P = Name.begin(), *E = Name.end(); P != E;) {
- if (AllowedInCharacterName(*P)) {
+ if (allowedInCharacterName(*P)) {
++P;
continue;
}
>From 6a082e950358d925b109445ef9fdd834dd351740 Mon Sep 17 00:00:00 2001
From: Eisenwave <me at eisenwave.net>
Date: Tue, 30 Jun 2026 10:12:30 +0200
Subject: [PATCH 4/5] Add unit tests for DisplayCodePointForDiagnostic
---
clang/unittests/Basic/DiagnosticTest.cpp | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/clang/unittests/Basic/DiagnosticTest.cpp b/clang/unittests/Basic/DiagnosticTest.cpp
index 4ced52c8f715f..5cc0c39e5cb51 100644
--- a/clang/unittests/Basic/DiagnosticTest.cpp
+++ b/clang/unittests/Basic/DiagnosticTest.cpp
@@ -449,4 +449,24 @@ TEST_F(SuppressionMappingTest, CanonicalizesSlashesOnWindows) {
}
#endif
+TEST(DisplayCodePointForDiagnosticTest, printableDisplaysQuoted) {
+ EXPECT_EQ(DisplayCodePointForDiagnostic(U'A'), "'A' U+0041");
+ EXPECT_EQ(DisplayCodePointForDiagnostic(U'🤡'), "'🤡' U+1F921");
+ EXPECT_EQ(DisplayCodePointForDiagnostic(U' '), "' ' U+0020");
+}
+
+TEST(DisplayCodePointForDiagnosticTest, nonPrintableDisplaysNoQuoted) {
+ EXPECT_EQ(DisplayCodePointForDiagnostic(U'\n'), "U+000A");
+ EXPECT_EQ(DisplayCodePointForDiagnostic(U'\0'), "U+0000");
+ EXPECT_EQ(DisplayCodePointForDiagnostic(U'\x1B'), "U+001B");
+}
+
+TEST(DisplayCodePointForDiagnosticTest, nonScalarValues) {
+ // Low and high surrogates:
+ EXPECT_EQ(DisplayCodePointForDiagnostic(0xD800), "U+D800");
+ EXPECT_EQ(DisplayCodePointForDiagnostic(0xDFFF), "U+DFFF");
+ // Overly large values:
+ EXPECT_EQ(DisplayCodePointForDiagnostic(0x110000), "U+110000");
+}
+
} // namespace
>From 5ba95599f857ae0c172e648dc17daa4c600d6e8f Mon Sep 17 00:00:00 2001
From: Eisenwave <me at eisenwave.net>
Date: Tue, 30 Jun 2026 10:13:59 +0200
Subject: [PATCH 5/5] Adjust comment wording
---
clang/lib/Basic/Diagnostic.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 7165eb734c4df..57fa8a16d3a50 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -1085,8 +1085,8 @@ void clang::EscapeStringForDiagnostic(StringRef Str,
}
}
-/// Displays CodePoint in U+NNNN notation, optionally prepending the quoted
-/// CodePoint itself if printable.
+/// Displays a single Unicode codepoint in U+NNNN notation, optionally
+/// prepending the quoted codepoint itself if printable.
llvm::SmallString<16>
clang::DisplayCodePointForDiagnostic(llvm::UTF32 CodePoint) {
llvm::SmallString<16> Result;
More information about the cfe-commits
mailing list