[clang] 31f4859 - [Clang] Allow additional mathematical symbols in identifiers.
Corentin Jabot via cfe-commits
cfe-commits at lists.llvm.org
Fri Dec 16 01:20:54 PST 2022
Author: Corentin Jabot
Date: 2022-12-16T10:20:49+01:00
New Revision: 31f4859c3e4d261d4a45118bb77d453138a6f7a9
URL: https://github.com/llvm/llvm-project/commit/31f4859c3e4d261d4a45118bb77d453138a6f7a9
DIFF: https://github.com/llvm/llvm-project/commit/31f4859c3e4d261d4a45118bb77d453138a6f7a9.diff
LOG: [Clang] Allow additional mathematical symbols in identifiers.
Implement the proposed UAX Profile
"Mathematical notation profile for default identifiers".
This implements a not-yet-approved Unicode proposal for a vetted
UAX31 identifier profile:
https://www.unicode.org/L2/L2022/22230-math-profile.pdf
This change mitigates the reported disruption caused
by the implementation of UAX31 in C++ and C2x,
as these mathematical symbols are commonly used in the
scientific community.
Fixes #54732
Reviewed By: tahonermann, #clang-language-wg
Differential Revision: https://reviews.llvm.org/D137051
Added:
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/DiagnosticLexKinds.td
clang/lib/Lex/Lexer.cpp
clang/lib/Lex/UnicodeCharSets.h
clang/test/Driver/autocomplete.c
clang/test/Lexer/unicode.c
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d9b44b629220..09705a6b5b57 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -441,6 +441,11 @@ Non-comprehensive list of changes in this release
- Unicode support has been updated to support Unicode 15.0.
New unicode codepoints are supported as appropriate in diagnostics,
C and C++ identifiers, and escape sequences.
+- In identifiers, Clang allows a restricted set of additional mathematical symbols
+ as an extension. These symbols correspond to a proposed Unicode
+ `Mathematical notation profile for default identifiers
+ <https://www.unicode.org/L2/L2022/22230-math-profile.pdf>`_.
+ This resolves `Issue 54732 <https://github.com/llvm/llvm-project/issues/54732>`_.
- Clang now supports loading multiple configuration files. The files from
default configuration paths are loaded first, unless ``--no-default-config``
option is used. All files explicitly specified using ``--config=`` option
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index a915f75a8ccb..3b1b466e7602 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -132,6 +132,9 @@ def warn_utf8_symbol_homoglyph : Warning<
def warn_utf8_symbol_zero_width : Warning<
"identifier contains Unicode character <U+%0> that is invisible in "
"some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
+def ext_mathematical_notation : ExtWarn<
+ "mathematical notation character <U+%0> in an identifier is a Clang extension">,
+ InGroup<DiagGroup<"mathematical-notation-identifier-extension">>;
def ext_delimited_escape_sequence : Extension<
"%select{delimited|named}0 escape sequences are a "
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index c93d3349c9ac..d1af455fbb91 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1459,7 +1459,35 @@ static bool isUnicodeWhitespace(uint32_t Codepoint) {
return UnicodeWhitespaceChars.contains(Codepoint);
}
-static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
+static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
+ llvm::SmallString<5> CharBuf;
+ llvm::raw_svector_ostream CharOS(CharBuf);
+ llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
+ return CharBuf;
+}
+
+// To mitigate https://github.com/llvm/llvm-project/issues/54732,
+// we allow "Mathematical Notation Characters" in identifiers.
+// This is a proposed profile that extends XID_Start/XID_Continue
+// with mathematical symbols and superscript and subscript digits
+// found in some production software.
+// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
+static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
+ bool IsStart, bool &IsExtension) {
+ static const llvm::sys::UnicodeCharSet MathStartChars(
+ MathematicalNotationProfileIDStartRanges);
+ static const llvm::sys::UnicodeCharSet MathContinueChars(
+ MathematicalNotationProfileIDContinueRanges);
+ if (MathStartChars.contains(C) ||
+ (!IsStart && MathContinueChars.contains(C))) {
+ IsExtension = true;
+ return true;
+ }
+ return false;
+}
+
+static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
+ bool &IsExtension) {
if (LangOpts.AsmPreprocessor) {
return false;
} else if (LangOpts.DollarIdents && '$' == C) {
@@ -1471,8 +1499,10 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
// '_' doesn't have the XID_Continue property but is allowed in C and C++.
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
- return C == '_' || XIDStartChars.contains(C) ||
- XIDContinueChars.contains(C);
+ if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
+ return true;
+ return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
+ IsExtension);
} else if (LangOpts.C11) {
static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
C11AllowedIDCharRanges);
@@ -1484,16 +1514,21 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
}
}
-static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
+static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
+ bool &IsExtension) {
assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
+ IsExtension = false;
if (LangOpts.AsmPreprocessor) {
return false;
}
if (LangOpts.CPlusPlus || LangOpts.C2x) {
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
- return XIDStartChars.contains(C);
+ if (XIDStartChars.contains(C))
+ return true;
+ return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
+ IsExtension);
}
- if (!isAllowedIDChar(C, LangOpts))
+ if (!isAllowedIDChar(C, LangOpts, IsExtension))
return false;
if (LangOpts.C11) {
static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
@@ -1505,6 +1540,20 @@ static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
return !C99DisallowedInitialIDChars.contains(C);
}
+static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
+ CharSourceRange Range) {
+
+ static const llvm::sys::UnicodeCharSet MathStartChars(
+ MathematicalNotationProfileIDStartRanges);
+ static const llvm::sys::UnicodeCharSet MathContinueChars(
+ MathematicalNotationProfileIDContinueRanges);
+
+ assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
+ "Unexpected mathematical notation codepoint");
+ Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
+ << codepointAsHexString(C) << Range;
+}
+
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
const char *End) {
return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
@@ -1604,18 +1653,13 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
std::lower_bound(std::begin(SortedHomoglyphs),
std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
if (Homoglyph->Character == C) {
- llvm::SmallString<5> CharBuf;
- {
- llvm::raw_svector_ostream CharOS(CharBuf);
- llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
- }
if (Homoglyph->LooksLike) {
const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
- << Range << CharBuf << LooksLikeStr;
+ << Range << codepointAsHexString(C) << LooksLikeStr;
} else {
Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
- << Range << CharBuf;
+ << Range << codepointAsHexString(C);
}
}
}
@@ -1626,25 +1670,24 @@ static void diagnoseInvalidUnicodeCodepointInIdentifier(
if (isASCII(CodePoint))
return;
- bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
- bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);
+ bool IsExtension;
+ bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
+ bool IsIDContinue =
+ IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
return;
bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
- llvm::SmallString<5> CharBuf;
- llvm::raw_svector_ostream CharOS(CharBuf);
- llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);
-
if (!IsFirst || InvalidOnlyAtStart) {
Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
- << Range << CharBuf << int(InvalidOnlyAtStart)
+ << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
<< FixItHint::CreateRemoval(Range);
} else {
Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
- << Range << CharBuf << FixItHint::CreateRemoval(Range);
+ << Range << codepointAsHexString(CodePoint)
+ << FixItHint::CreateRemoval(Range);
}
}
@@ -1655,8 +1698,8 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
if (CodePoint == 0) {
return false;
}
-
- if (!isAllowedIDChar(CodePoint, LangOpts)) {
+ bool IsExtension = false;
+ if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
return false;
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
@@ -1669,10 +1712,15 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
// We got a unicode codepoint that is neither a space nor a
// a valid identifier part.
// Carry on as if the codepoint was valid for recovery purposes.
- } else if (!isLexingRawMode())
+ } else if (!isLexingRawMode()) {
+ if (IsExtension)
+ diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, CurPtr, UCNPtr));
+
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UCNPtr),
/*IsFirst=*/false);
+ }
Result.setFlag(Token::HasUCN);
if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
@@ -1695,7 +1743,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
if (Result != llvm::conversionOK)
return false;
- if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
+ bool IsExtension = false;
+ if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
+ IsExtension)) {
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
return false;
@@ -1708,6 +1758,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
// a valid identifier part. Carry on as if the codepoint was
// valid for recovery purposes.
} else if (!isLexingRawMode()) {
+ if (IsExtension)
+ diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, CurPtr, UnicodePtr));
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
/*IsFirst=*/false);
@@ -1721,9 +1774,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
const char *CurPtr) {
- if (isAllowedInitiallyIDChar(C, LangOpts)) {
+ bool IsExtension = false;
+ if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
!PP->isPreprocessedOutput()) {
+ if (IsExtension)
+ diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
+ makeCharRange(*this, BufferPtr, CurPtr));
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
makeCharRange(*this, BufferPtr, CurPtr),
/*IsFirst=*/true);
@@ -1737,7 +1794,7 @@ bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
!PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
- !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
+ !isUnicodeWhitespace(C)) {
// Non-ASCII characters tend to creep into source code unintentionally.
// Instead of letting the parser complain about the unknown token,
// just drop the character.
diff --git a/clang/lib/Lex/UnicodeCharSets.h b/clang/lib/Lex/UnicodeCharSets.h
index f827217d6dbf..5316d2540b76 100644
--- a/clang/lib/Lex/UnicodeCharSets.h
+++ b/clang/lib/Lex/UnicodeCharSets.h
@@ -366,6 +366,36 @@ static const llvm::sys::UnicodeCharRange XIDContinueRanges[] = {
{0x1E4EC, 0x1E4F9}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A},
{0x1E950, 0x1E959}, {0x1FBF0, 0x1FBF9}, {0xE0100, 0xE01EF}};
+// Clang supports the "Mathematical notation profile" as an extension,
+// as described in https://www.unicode.org/L2/L2022/22230-math-profile.pdf
+// Math_Start
+static const llvm::sys::UnicodeCharRange
+ MathematicalNotationProfileIDStartRanges[] = {
+ {0x02202, 0x02202}, // ∂
+ {0x02207, 0x02207}, // ∇
+ {0x0221E, 0x0221E}, // ∞
+ {0x1D6C1, 0x1D6C1}, // 𝛁
+ {0x1D6DB, 0x1D6DB}, // 𝛛
+ {0x1D6FB, 0x1D6FB}, // 𝛻
+ {0x1D715, 0x1D715}, // 𝜕
+ {0x1D735, 0x1D735}, // 𝜵
+ {0x1D74F, 0x1D74F}, // 𝝏
+ {0x1D76F, 0x1D76F}, // 𝝯
+ {0x1D789, 0x1D789}, // 𝞉
+ {0x1D7A9, 0x1D7A9}, // 𝞩
+ {0x1D7C3, 0x1D7C3}, // 𝟃
+};
+
+// Math_Continue
+static const llvm::sys::UnicodeCharRange
+ MathematicalNotationProfileIDContinueRanges[] = {
+ {0x000B2, 0x000B3}, // ²-³
+ {0x000B9, 0x000B9}, // ¹
+ {0x02070, 0x02070}, // ⁰
+ {0x02074, 0x0207E}, // ⁴-⁾
+ {0x02080, 0x0208E}, // ₀-₎
+};
+
// C11 D.1, C++11 [charname.allowed]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = {
// 1
diff --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c
index 59055efac2ce..502eee107d0b 100644
--- a/clang/test/Driver/autocomplete.c
+++ b/clang/test/Driver/autocomplete.c
@@ -111,6 +111,7 @@
// WARNING-NEXT: -Wmain-return-type
// WARNING-NEXT: -Wmalformed-warning-check
// WARNING-NEXT: -Wmany-braces-around-scalar-init
+// WARNING-NEXT: -Wmathematical-notation-identifier-extension
// WARNING-NEXT: -Wmax-tokens
// WARNING-NEXT: -Wmax-unsigned-zero
// RUN: %clang --autocomplete=-Wno-invalid-pp- | FileCheck %s -check-prefix=NOWARNING
diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c
index d79a6ed50415..d120d6ca8517 100644
--- a/clang/test/Lexer/unicode.c
+++ b/clang/test/Lexer/unicode.c
@@ -46,7 +46,17 @@ extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a va
// expected-error {{expected ';' after top level declarator}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
+extern int 𝛛; // expected-warning {{mathematical notation character <U+1D6DB> in an identifier is a Clang extension}}
+extern int ₉; // expected-error {{character <U+2089> not allowed at the start of an identifier}} \\
+ expected-warning {{declaration does not declare anything}}
+int aยนbโโโโโ; // expected-warning 6{{mathematical notation character}}
+
+int \u{221E} = 1; // expected-warning {{mathematical notation character}}
+int \N{MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL} = 1;
+ // expected-warning@-1 {{mathematical notation character}}
+
+int a\N{SUBSCRIPT EQUALS SIGN} = 1; // expected-warning {{mathematical notation character}}
// This character doesn't have the XID_Start property
extern int \U00016AC0; // TANGSA DIGIT ZERO // cxx-error {{expected unqualified-id}} \
More information about the cfe-commits
mailing list