[clang] [Clang] [C23] Implement N2653: u8 strings are char8_t[] (PR #97208)

Mital Ashok via cfe-commits cfe-commits at lists.llvm.org
Sat Jul 6 07:42:14 PDT 2024


https://github.com/MitalAshok updated https://github.com/llvm/llvm-project/pull/97208

>From ef0072d1fc9b14f7ee657fa95f44a686b78b525a Mon Sep 17 00:00:00 2001
From: Mital Ashok <mital at mitalashok.co.uk>
Date: Sun, 30 Jun 2024 12:07:54 +0100
Subject: [PATCH 1/6] [Clang] [C23] Implement N2653: u8 strings are char8_t[]

---
 clang/docs/ReleaseNotes.rst                   |  6 ++++
 .../clang/Basic/DiagnosticSemaKinds.td        |  5 +++-
 clang/lib/Frontend/InitPreprocessor.cpp       |  6 ++--
 clang/lib/Headers/stdatomic.h                 |  5 ++++
 clang/lib/Sema/SemaExpr.cpp                   | 23 ++++++++++-----
 clang/test/C/C2x/n2653.c                      | 29 +++++++++++++++++++
 clang/www/c_status.html                       |  2 +-
 7 files changed, 65 insertions(+), 11 deletions(-)
 create mode 100644 clang/test/C/C2x/n2653.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c720e47dbe35b..e51be81d8b11a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -337,6 +337,12 @@ C23 Feature Support
 - Properly promote bit-fields of bit-precise integer types to the field's type
   rather than to ``int``. #GH87641
 
+- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings
+  <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm>`_: ``u8`` string
+  literals are now of type ``char8_t[N]`` in C23, and Clang now exposes
+  ``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to
+  implement the corresponding macro in ``<stdatomic.h>``.
+
 Non-comprehensive list of changes in this release
 -------------------------------------------------
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5dc36c594bcb7..6a00b92df1c36 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7252,7 +7252,10 @@ def err_array_init_utf8_string_into_char : Error<
 def warn_cxx20_compat_utf8_string : Warning<
   "type of UTF-8 string literal will change from array of const char to "
   "array of const char8_t in C++20">, InGroup<CXX20Compat>, DefaultIgnore;
-def note_cxx20_compat_utf8_string_remove_u8 : Note<
+def warn_c23_compat_utf8_string : Warning<
+  "type of UTF-8 string literal will change from array of char to "
+  "array of char8_t in C23">, InGroup<C23Compat>, DefaultIgnore;
+def note_cxx20_c23_compat_utf8_string_remove_u8 : Note<
   "remove 'u8' prefix to avoid a change of behavior; "
   "Clang encodes unprefixed narrow string literals as UTF-8">;
 def err_array_init_different_type : Error<
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 55ec460064830..6270c37342bcf 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1342,8 +1342,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
                       getLockFreeValue(TI.get##Type##Width(), TI));
     DEFINE_LOCK_FREE_MACRO(BOOL, Bool);
     DEFINE_LOCK_FREE_MACRO(CHAR, Char);
-    if (LangOpts.Char8)
-      DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); // Treat char8_t like char.
+    // char8_t has the same representation / width as unsigned
+    // char in C++ and is a typedef for unsigned char in C23
+    if (LangOpts.Char8 || LangOpts.C23)
+      DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char);
     DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16);
     DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32);
     DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar);
diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index 9c103d98af8c5..c33cd8083525c 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -35,6 +35,10 @@ extern "C" {
 
 #define ATOMIC_BOOL_LOCK_FREE       __CLANG_ATOMIC_BOOL_LOCK_FREE
 #define ATOMIC_CHAR_LOCK_FREE       __CLANG_ATOMIC_CHAR_LOCK_FREE
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
+    defined(__cplusplus)
+#define ATOMIC_CHAR8_T_LOCK_FREE    __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
+#endif
 #define ATOMIC_CHAR16_T_LOCK_FREE   __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
 #define ATOMIC_CHAR32_T_LOCK_FREE   __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
 #define ATOMIC_WCHAR_T_LOCK_FREE    __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
@@ -104,6 +108,7 @@ typedef _Atomic(long)               atomic_long;
 typedef _Atomic(unsigned long)      atomic_ulong;
 typedef _Atomic(long long)          atomic_llong;
 typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(unsigned char)      atomic_char8_t;
 typedef _Atomic(uint_least16_t)     atomic_char16_t;
 typedef _Atomic(uint_least32_t)     atomic_char32_t;
 typedef _Atomic(wchar_t)            atomic_wchar_t;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index db44cfe1288b6..a1b060f7f1510 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2082,6 +2082,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
   } else if (Literal.isUTF8()) {
     if (getLangOpts().Char8)
       CharTy = Context.Char8Ty;
+    else if (getLangOpts().C23)
+      CharTy = Context.UnsignedCharTy;
     Kind = StringLiteralKind::UTF8;
   } else if (Literal.isUTF16()) {
     CharTy = Context.Char16Ty;
@@ -2093,17 +2095,24 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
     CharTy = Context.UnsignedCharTy;
   }
 
-  // Warn on initializing an array of char from a u8 string literal; this
-  // becomes ill-formed in C++2a.
-  if (getLangOpts().CPlusPlus && !getLangOpts().CPlusPlus20 &&
-      !getLangOpts().Char8 && Kind == StringLiteralKind::UTF8) {
-    Diag(StringTokLocs.front(), diag::warn_cxx20_compat_utf8_string);
+  // Warn on u8 string literals before C++20 and C23, whose type
+  // was an array of char before but becomes an array of char8_t.
+  // In C++20, initializing an array of char from a u8 string literal
+  // becomes ill-formed. In C23, it might have an unexpected value if
+  // char was signed.
+  if (Kind == StringLiteralKind::UTF8 &&
+      (getLangOpts().CPlusPlus
+           ? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8
+           : !getLangOpts().C23)) {
+    Diag(StringTokLocs.front(), getLangOpts().CPlusPlus
+                                    ? diag::warn_cxx20_compat_utf8_string
+                                    : diag::warn_c23_compat_utf8_string);
 
     // Create removals for all 'u8' prefixes in the string literal(s). This
-    // ensures C++2a compatibility (but may change the program behavior when
+    // ensures C++20/C23 compatibility (but may change the program behavior when
     // built by non-Clang compilers for which the execution character set is
     // not always UTF-8).
-    auto RemovalDiag = PDiag(diag::note_cxx20_compat_utf8_string_remove_u8);
+    auto RemovalDiag = PDiag(diag::note_cxx20_c23_compat_utf8_string_remove_u8);
     SourceLocation RemovalDiagLoc;
     for (const Token &Tok : StringToks) {
       if (Tok.getKind() == tok::utf8_string_literal) {
diff --git a/clang/test/C/C2x/n2653.c b/clang/test/C/C2x/n2653.c
new file mode 100644
index 0000000000000..1abd61947de7e
--- /dev/null
+++ b/clang/test/C/C2x/n2653.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -verify=c23 -std=c23 %s
+// RUN: %clang_cc1 -verify=c17 -std=c17 %s
+
+// c23-no-diagnostics
+
+#include <stdatomic.h>
+
+#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x))
+
+#ifndef ATOMIC_CHAR8_T_LOCK_FREE
+#error missing
+#endif
+// c17-error at -2 {{missing}}
+
+_Static_assert(_Generic(u8"", unsigned char*: 1, char*: 0), "");
+// c17-error at -1 {{static assertion failed}}
+
+// -fsigned-char is the default
+#define M(X) __enable_constant_folding((X) >= 0x80)
+
+_Static_assert(M(u8"\U000000E9"[0]), "");
+// c17-error at -1 {{static assertion failed}}
+#if __STDC_VERSION__ >= 202311L
+_Static_assert(M(u8'\xC3'), "");
+#endif
+
+const          char cu8[]  = u8"text";
+const signed   char scu8[] = u8"text";
+const unsigned char ucu8[] = u8"text";
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 84cd8e836006c..81bb51a58e5cb 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -1061,7 +1061,7 @@ <h2 id="c2x">C23 implementation status</h2>
     <tr>
       <td>char8_t: A type for UTF-8 characters and strings</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm">N2653</a></td>
-      <td class="none" align="center">No</td>
+      <td class="unreleased" align="center">Clang 19</td>
     </tr>
     <tr>
       <td>Clarification for max exponent macros-update</td>

>From d2594adb3ced3b5ecbb64a2c999715e06139f90b Mon Sep 17 00:00:00 2001
From: Mital Ashok <mital at mitalashok.co.uk>
Date: Mon, 1 Jul 2024 18:19:30 +0100
Subject: [PATCH 2/6] Char array initialized from u8 string was fixed in C++20
 as a DR

https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2513r4.html
---
 clang/lib/Sema/SemaExpr.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index a1b060f7f1510..8692ca9e1e628 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2097,9 +2097,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
 
   // Warn on u8 string literals before C++20 and C23, whose type
   // was an array of char before but becomes an array of char8_t.
-  // In C++20, initializing an array of char from a u8 string literal
-  // becomes ill-formed. In C23, it might have an unexpected value if
-  // char was signed.
+  // In C++20, it cannot be used where a pointer to char is expected.
+  // In C23, it might have an unexpected value if char was signed.
   if (Kind == StringLiteralKind::UTF8 &&
       (getLangOpts().CPlusPlus
            ? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8

>From 6816f7f63d3def751cc63fef5e4fa2978d735521 Mon Sep 17 00:00:00 2001
From: Mital Ashok <mital at mitalashok.co.uk>
Date: Mon, 1 Jul 2024 18:20:11 +0100
Subject: [PATCH 3/6] Define ATOMIC_CHAR8_T_LOCK_FREE only when available

This should be equivalent to:

\#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \
    (defined(__cpp_char8_t) && __cpp_char8_t >= 201811L)
---
 clang/lib/Headers/stdatomic.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index c33cd8083525c..ea07a58ec17a4 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -35,8 +35,7 @@ extern "C" {
 
 #define ATOMIC_BOOL_LOCK_FREE       __CLANG_ATOMIC_BOOL_LOCK_FREE
 #define ATOMIC_CHAR_LOCK_FREE       __CLANG_ATOMIC_CHAR_LOCK_FREE
-#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
-    defined(__cplusplus)
+#ifdef __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
 #define ATOMIC_CHAR8_T_LOCK_FREE    __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
 #endif
 #define ATOMIC_CHAR16_T_LOCK_FREE   __CLANG_ATOMIC_CHAR16_T_LOCK_FREE

>From 2480d5a864731ccf5c8949aed12c8377377c258b Mon Sep 17 00:00:00 2001
From: Mital Ashok <mital at mitalashok.co.uk>
Date: Wed, 3 Jul 2024 14:16:27 +0100
Subject: [PATCH 4/6] [Headers] Gate atomic_char8_t behind C23/C++20[-fchar8_t]

---
 clang/lib/Headers/stdatomic.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index ea07a58ec17a4..79a0652c401b1 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -107,7 +107,9 @@ typedef _Atomic(long)               atomic_long;
 typedef _Atomic(unsigned long)      atomic_ulong;
 typedef _Atomic(long long)          atomic_llong;
 typedef _Atomic(unsigned long long) atomic_ullong;
+#ifdef __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
 typedef _Atomic(unsigned char)      atomic_char8_t;
+#endif
 typedef _Atomic(uint_least16_t)     atomic_char16_t;
 typedef _Atomic(uint_least32_t)     atomic_char32_t;
 typedef _Atomic(wchar_t)            atomic_wchar_t;

>From 8c6b2d09e24f0437083d4614626e40a8552cdb9f Mon Sep 17 00:00:00 2001
From: Mital Ashok <mital at mitalashok.co.uk>
Date: Wed, 3 Jul 2024 14:16:49 +0100
Subject: [PATCH 5/6] Add __CHAR8_TYPE__ predefined macro to be compatible with
 GCC

---
 clang/lib/Frontend/InitPreprocessor.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 81ec67ebaf7b1..92b1542265c40 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1165,6 +1165,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
   DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
   DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder);
+  if (LangOpts.Char8 || LangOpts.C23)
+    DefineType("__CHAR8_TYPE__", TI.UnsignedChar, Builder);
   DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
   DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);
 

>From 9253467882b8b665021f02f593f7dbfaaae8582e Mon Sep 17 00:00:00 2001
From: Mital Ashok <mital at mitalashok.co.uk>
Date: Sat, 6 Jul 2024 15:40:22 +0100
Subject: [PATCH 6/6] Tentatively remove macros/atomic_char8_t typedef in C++

---
 clang/lib/Frontend/InitPreprocessor.cpp | 2 +-
 clang/lib/Headers/stdatomic.h           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 92b1542265c40..7228f283994d5 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1165,7 +1165,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
   DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
   DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder);
-  if (LangOpts.Char8 || LangOpts.C23)
+  if (LangOpts.C23)
     DefineType("__CHAR8_TYPE__", TI.UnsignedChar, Builder);
   DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
   DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);
diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index 79a0652c401b1..33dc3a1491d24 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -35,7 +35,7 @@ extern "C" {
 
 #define ATOMIC_BOOL_LOCK_FREE       __CLANG_ATOMIC_BOOL_LOCK_FREE
 #define ATOMIC_CHAR_LOCK_FREE       __CLANG_ATOMIC_CHAR_LOCK_FREE
-#ifdef __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 202311L
 #define ATOMIC_CHAR8_T_LOCK_FREE    __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
 #endif
 #define ATOMIC_CHAR16_T_LOCK_FREE   __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
@@ -107,7 +107,7 @@ typedef _Atomic(long)               atomic_long;
 typedef _Atomic(unsigned long)      atomic_ulong;
 typedef _Atomic(long long)          atomic_llong;
 typedef _Atomic(unsigned long long) atomic_ullong;
-#ifdef __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 202311L
 typedef _Atomic(unsigned char)      atomic_char8_t;
 #endif
 typedef _Atomic(uint_least16_t)     atomic_char16_t;



More information about the cfe-commits mailing list