[clang] 329e7c8 - [Clang] [C23] Implement N2653: u8 strings are char8_t[] (#97208)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jul 17 05:14:35 PDT 2024
Author: Mital Ashok
Date: 2024-07-17T14:14:31+02:00
New Revision: 329e7c80ac2dbc16c267390da5f1baaf1cd438b1
URL: https://github.com/llvm/llvm-project/commit/329e7c80ac2dbc16c267390da5f1baaf1cd438b1
DIFF: https://github.com/llvm/llvm-project/commit/329e7c80ac2dbc16c267390da5f1baaf1cd438b1.diff
LOG: [Clang] [C23] Implement N2653: u8 strings are char8_t[] (#97208)
https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm
Closes #97202
---------
Co-authored-by: cor3ntin <corentinjabot at gmail.com>
Added:
clang/test/C/C23/n2653.c
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/DiagnosticSemaKinds.td
clang/lib/Frontend/InitPreprocessor.cpp
clang/lib/Headers/stdatomic.h
clang/lib/Sema/SemaExpr.cpp
clang/www/c_status.html
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6dc45956a9afb..923f3d0a46164 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -362,6 +362,12 @@ C23 Feature Support
- Added the ``FLT_NORM_MAX``, ``DBL_NORM_MAX``, and ``LDBL_NORM_MAX`` to the
freestanding implementation of ``<float.h>`` that ships with Clang.
+- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings
+ <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm>`_: ``u8`` string
+ literals are now of type ``char8_t[N]`` in C23 and expose
+ ``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to
+ implement the corresponding macro in ``<stdatomic.h>``.
+
Non-comprehensive list of changes in this release
-------------------------------------------------
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 52ff4b026a60e..de3d94155a9a0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7249,7 +7249,10 @@ def err_array_init_utf8_string_into_char : Error<
def warn_cxx20_compat_utf8_string : Warning<
"type of UTF-8 string literal will change from array of const char to "
"array of const char8_t in C++20">, InGroup<CXX20Compat>, DefaultIgnore;
-def note_cxx20_compat_utf8_string_remove_u8 : Note<
+def warn_c23_compat_utf8_string : Warning<
+ "type of UTF-8 string literal will change from array of char to "
+ "array of char8_t in C23">, InGroup<C23Compat>, DefaultIgnore;
+def note_cxx20_c23_compat_utf8_string_remove_u8 : Note<
"remove 'u8' prefix to avoid a change of behavior; "
"Clang encodes unprefixed narrow string literals as UTF-8">;
def err_array_init_different_type : Error<
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index d40d78a38540b..920ddf7e59913 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1170,6 +1170,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder);
+ if (LangOpts.C23)
+ DefineType("__CHAR8_TYPE__", TI.UnsignedChar, Builder);
DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);
@@ -1349,8 +1351,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
getLockFreeValue(TI.get##Type##Width(), TI));
DEFINE_LOCK_FREE_MACRO(BOOL, Bool);
DEFINE_LOCK_FREE_MACRO(CHAR, Char);
- if (LangOpts.Char8)
- DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); // Treat char8_t like char.
+ // char8_t has the same representation / width as unsigned
+ // char in C++ and is a typedef for unsigned char in C23
+ if (LangOpts.Char8 || LangOpts.C23)
+ DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char);
DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16);
DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32);
DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar);
diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index 9c103d98af8c5..2027055f38796 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -35,6 +35,9 @@ extern "C" {
#define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE
#define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+#define ATOMIC_CHAR8_T_LOCK_FREE __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
+#endif
#define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
@@ -104,6 +107,9 @@ typedef _Atomic(long) atomic_long;
typedef _Atomic(unsigned long) atomic_ulong;
typedef _Atomic(long long) atomic_llong;
typedef _Atomic(unsigned long long) atomic_ullong;
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+typedef _Atomic(unsigned char) atomic_char8_t;
+#endif
typedef _Atomic(uint_least16_t) atomic_char16_t;
typedef _Atomic(uint_least32_t) atomic_char32_t;
typedef _Atomic(wchar_t) atomic_wchar_t;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 0698c3fbe98d2..d47db14d5dd3b 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2051,6 +2051,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
} else if (Literal.isUTF8()) {
if (getLangOpts().Char8)
CharTy = Context.Char8Ty;
+ else if (getLangOpts().C23)
+ CharTy = Context.UnsignedCharTy;
Kind = StringLiteralKind::UTF8;
} else if (Literal.isUTF16()) {
CharTy = Context.Char16Ty;
@@ -2062,17 +2064,23 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
CharTy = Context.UnsignedCharTy;
}
- // Warn on initializing an array of char from a u8 string literal; this
- // becomes ill-formed in C++2a.
- if (getLangOpts().CPlusPlus && !getLangOpts().CPlusPlus20 &&
- !getLangOpts().Char8 && Kind == StringLiteralKind::UTF8) {
- Diag(StringTokLocs.front(), diag::warn_cxx20_compat_utf8_string);
+ // Warn on u8 string literals before C++20 and C23, whose type
+ // was an array of char before but becomes an array of char8_t.
+ // In C++20, it cannot be used where a pointer to char is expected.
+ // In C23, it might have an unexpected value if char was signed.
+ if (Kind == StringLiteralKind::UTF8 &&
+ (getLangOpts().CPlusPlus
+ ? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8
+ : !getLangOpts().C23)) {
+ Diag(StringTokLocs.front(), getLangOpts().CPlusPlus
+ ? diag::warn_cxx20_compat_utf8_string
+ : diag::warn_c23_compat_utf8_string);
// Create removals for all 'u8' prefixes in the string literal(s). This
- // ensures C++2a compatibility (but may change the program behavior when
+ // ensures C++20/C23 compatibility (but may change the program behavior when
// built by non-Clang compilers for which the execution character set is
// not always UTF-8).
- auto RemovalDiag = PDiag(diag::note_cxx20_compat_utf8_string_remove_u8);
+ auto RemovalDiag = PDiag(diag::note_cxx20_c23_compat_utf8_string_remove_u8);
SourceLocation RemovalDiagLoc;
for (const Token &Tok : StringToks) {
if (Tok.getKind() == tok::utf8_string_literal) {
diff --git a/clang/test/C/C23/n2653.c b/clang/test/C/C23/n2653.c
new file mode 100644
index 0000000000000..0c07c9a46eb64
--- /dev/null
+++ b/clang/test/C/C23/n2653.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -ffreestanding -verify=c23 -std=c23 %s
+// RUN: %clang_cc1 -ffreestanding -verify=c17 -std=c17 %s
+
+// c23-no-diagnostics
+
+#include <stdatomic.h>
+
+#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x))
+#define __is_same(a, b) (__extension__ _Generic(a, b: 1, default: 0) && __extension__ _Generic(b, a: 1, default: 0))
+
+#ifndef ATOMIC_CHAR8_T_LOCK_FREE
+#error missing
+#endif
+// c17-error@-2 {{missing}}
+
+_Static_assert(__is_same(atomic_char8_t, unsigned char _Atomic), "");
+// c17-error@-1 {{use of undeclared identifier 'atomic_char8_t'}}
+// c17-error@-2 {{unknown type name 'atomic_char8_t'}}
+
+_Static_assert(_Generic(u8"", unsigned char*: 1, char*: 0), "");
+// c17-error@-1 {{static assertion failed}}
+
+// -fsigned-char is the default
+#define M(X) __enable_constant_folding((X) >= 0x80)
+
+_Static_assert(M(u8"\U000000E9"[0]), "");
+// c17-error@-1 {{static assertion failed}}
+#if __STDC_VERSION__ >= 202311L
+_Static_assert(M(u8'\xC3'), "");
+#endif
+
+const char cu8[] = u8"text";
+const signed char scu8[] = u8"text";
+const unsigned char ucu8[] = u8"text";
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 669448635837e..3ea70b0163c70 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -1066,7 +1066,7 @@ <h2 id="c2x">C23 implementation status</h2>
<tr>
<td>char8_t: A type for UTF-8 characters and strings</td>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm">N2653</a></td>
- <td class="none" align="center">No</td>
+ <td class="unreleased" align="center">Clang 19</td>
</tr>
<tr>
<td>Clarification for max exponent macros-update</td>
More information about the cfe-commits
mailing list