[clang] [C99] Claim conformance to WG14 N717 (PR #87228)

Fri Apr 5 10:50:25 PDT 2024

https://github.com/AaronBallman updated https://github.com/llvm/llvm-project/pull/87228

>From f8e130df9e602662444280346b71a82347ae8a29 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Mon, 1 Apr 2024 07:27:32 -0400
Subject: [PATCH 1/5] [C99] Claim conformance to WG14 N717

This was the paper that added Universal Character Names to C.
---
 clang/test/C/C99/n717.c  | 69 ++++++++++++++++++++++++++++++++++++++++
 clang/test/C/C99/n717.py | 39 +++++++++++++++++++++++
 clang/www/c_status.html  |  2 +-
 3 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/C/C99/n717.c
 create mode 100644 clang/test/C/C99/n717.py

diff --git a/clang/test/C/C99/n717.c b/clang/test/C/C99/n717.c
new file mode 100644
index 00000000000000..cc1aa0fd5d53cf
--- /dev/null
+++ b/clang/test/C/C99/n717.c
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -std=c99 %s
+// RUN: %clang_cc1 -verify -std=c99 -fno-dollars-in-identifiers %s
+
+/* WG14 N717: Clang 17
+ * Extended identifiers
+ */
+
+// Used as a sink for UCNs.
+#define M(arg)
+
+// C99 6.4.3p1 specifies the grammar for UCNs. A \u must be followed by exactly
+// four hex digits, and \U must be followed by exactly eight.
+M(\u1)    // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\u12)   // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\u123)  // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\u1234) // Okay
+M(\u12345)// Okay, two tokens (UCN followed by 5)
+
+M(\U1)         // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U12)        // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U123)       // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U1234)      // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} \
+                  expected-note {{did you mean to use '\u'?}}
+M(\U12345)     // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U123456)    // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U1234567)   // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U12345678)  // Okay
+M(\U123456789) // Okay-ish, two tokens (valid-per-spec-but-actually-invalid UCN followed by 9)
+
+// C99 6.4.3p2:
+// A universal character name shall not specify a character whose short
+// identifier is less than 00A0 other than 0024 ($), 0040 (@), or 0060 (`), nor
+// one in the range D800 through DFFF inclusive.
+//
+// We use a python script to generate the test contents for the large ranges
+// without edge cases.
+// RUN: %python %S/n717.py >%t.inc
+// RUN: %clang_cc1 -verify -std=c99 -Wno-unicode-whitespace -Wno-unicode-homoglyph -Wno-unicode-zero-width -Wno-mathematical-notation-identifier-extension %t.inc
+
+// Now test the ones that should work. Note, these work in C17 and earlier but
+// are part of the basic character set in C23 and thus should be diagnosed in
+// that mode. They're valid in a character constant, but not valid in an
+// identifier, except for U+0024 which is allowed if -fdollars-in-identifiers
+// is enabled.
+// FIXME: These three should be handled the same way, and should be accepted
+// when dollar signs are allowed in identifiers, rather than rejected, see
+// GH87106.
+M(\u0024) // expected-error {{character '$' cannot be specified by a universal character name}}
+M(\U00000024) // expected-error {{character '$' cannot be specified by a universal character name}}
+M($)
+
+// These should always be rejected because they're not valid identifier
+// characters.
+// FIXME: the diagnostic could be improved to make it clear this is an issue
+// with forming an identifier rather than a UCN.
+M(\u0040) // expected-error {{character '@' cannot be specified by a universal character name}}
+M(\u0060) // expected-error {{character '`' cannot be specified by a universal character name}}
+M(\U00000040) // expected-error {{character '@' cannot be specified by a universal character name}}
+M(\U00000060) // expected-error {{character '`' cannot be specified by a universal character name}}
+
+// These should always be accepted because they're a valid in a character
+// constant.
+M('\u0024')
+M('\u0040')
+M('\u0060')
+
+M('\U00000024')
+M('\U00000040')
+M('\U00000060')
diff --git a/clang/test/C/C99/n717.py b/clang/test/C/C99/n717.py
new file mode 100644
index 00000000000000..8c02d336ff6f60
--- /dev/null
+++ b/clang/test/C/C99/n717.py
@@ -0,0 +1,39 @@
+print("#define M(arg)")
+
+def test(size):
+  Prefix = 'U' if size == 8 else 'u'
+  # [0x0000 to 0x00A0) excluding [0x0020, 0x007F)
+  for val in [val for val in range(0x0000, 0x00A0) if val < 0x0020 or val >= 0x007F]:
+     print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{universal character name refers to a control character}}}}')
+  print('')
+  
+  # [0x0020 to 0x007F), excluding 0x0024, 0x0040, and 0x0060
+  for val in [val for val in range(0x0020, 0x007F) if val != 0x0024 and val != 0x0040 and val != 0x0060]:
+     print(f"M(\\{Prefix}{val:0{size}X}) // expected-error {{{{character '{chr(val)}' cannot be specified by a universal character name}}}}")
+  print('')
+  
+  # [0xD800 to 0xDFFF]
+  for val in range(0xD800, 0xDFFF + 1):
+    print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{invalid universal character}}}}')
+  print('')
+  
+  # Everything in this range should be accepted, though it may produce a
+  # warning diagnostic for things like homoglyphs, whitespace, etc.
+  for val in range(0x00A1, 0xD800):
+    print(f'M(\\{Prefix}{val:0{size}X})')
+  print('')
+
+# Print \u tests
+test(4)
+# Print \U tests
+test(8)
+
+# Validate that the \U characters have the same identity as the \u characters
+# within the valid (short) range.
+# This is disabled because enabling the test 1) requires using L because u and
+# U don't exist until C11, 2) is questionable in terms of value because the
+# code points could be different if L isn't using a Unicode encoding, and 3)
+# this addition to the test adds 10x the execution time when running the test.
+#for val in range(0x00A1, 0xD800):
+#  print(f"_Static_assert(L'\\u{val:04X}' == L'\\U{val:08X}', \"\");")
+#print('')
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 028234a8961db2..a14bfa2c1efb3d 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -203,7 +203,7 @@ <h2 id="c99">C99 implementation status</h2>
     <tr>
       <td>extended identifiers</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n717.htm">N717</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="full" align="center">Clang 17</td>
     </tr>
     <tr>
       <td>hexadecimal floating-point constants</td>

>From 9638a52a203c095f1b86622b26dcb25bd00a1366 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Fri, 5 Apr 2024 12:05:49 -0400
Subject: [PATCH 2/5] Update based on review comments

This removes the Python script as overkill and adds an additional test
case.
---
 clang/test/C/C99/n717.c  | 13 +++----------
 clang/test/C/C99/n717.py | 39 ---------------------------------------
 2 files changed, 3 insertions(+), 49 deletions(-)
 delete mode 100644 clang/test/C/C99/n717.py

diff --git a/clang/test/C/C99/n717.c b/clang/test/C/C99/n717.c
index cc1aa0fd5d53cf..3c9764bf2fb21c 100644
--- a/clang/test/C/C99/n717.c
+++ b/clang/test/C/C99/n717.c
@@ -27,16 +27,6 @@ M(\U1234567)   // expected-warning {{incomplete universal character name; treati
 M(\U12345678)  // Okay
 M(\U123456789) // Okay-ish, two tokens (valid-per-spec-but-actually-invalid UCN followed by 9)
 
-// C99 6.4.3p2:
-// A universal character name shall not specify a character whose short
-// identifier is less than 00A0 other than 0024 ($), 0040 (@), or 0060 (`), nor
-// one in the range D800 through DFFF inclusive.
-//
-// We use a python script to generate the test contents for the large ranges
-// without edge cases.
-// RUN: %python %S/n717.py >%t.inc
-// RUN: %clang_cc1 -verify -std=c99 -Wno-unicode-whitespace -Wno-unicode-homoglyph -Wno-unicode-zero-width -Wno-mathematical-notation-identifier-extension %t.inc
-
 // Now test the ones that should work. Note, these work in C17 and earlier but
 // are part of the basic character set in C23 and thus should be diagnosed in
 // that mode. They're valid in a character constant, but not valid in an
@@ -58,6 +48,9 @@ M(\u0060) // expected-error {{character '`' cannot be specified by a universal c
 M(\U00000040) // expected-error {{character '@' cannot be specified by a universal character name}}
 M(\U00000060) // expected-error {{character '`' cannot be specified by a universal character name}}
 
+// This falls outside of the expected range.
+M(\U110000) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+
 // These should always be accepted because they're a valid in a character
 // constant.
 M('\u0024')
diff --git a/clang/test/C/C99/n717.py b/clang/test/C/C99/n717.py
deleted file mode 100644
index 8c02d336ff6f60..00000000000000
--- a/clang/test/C/C99/n717.py
+++ /dev/null
@@ -1,39 +0,0 @@
-print("#define M(arg)")
-
-def test(size):
-  Prefix = 'U' if size == 8 else 'u'
-  # [0x0000 to 0x00A0) excluding [0x0020, 0x007F)
-  for val in [val for val in range(0x0000, 0x00A0) if val < 0x0020 or val >= 0x007F]:
-     print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{universal character name refers to a control character}}}}')
-  print('')
-  
-  # [0x0020 to 0x007F), excluding 0x0024, 0x0040, and 0x0060
-  for val in [val for val in range(0x0020, 0x007F) if val != 0x0024 and val != 0x0040 and val != 0x0060]:
-     print(f"M(\\{Prefix}{val:0{size}X}) // expected-error {{{{character '{chr(val)}' cannot be specified by a universal character name}}}}")
-  print('')
-  
-  # [0xD800 to 0xDFFF]
-  for val in range(0xD800, 0xDFFF + 1):
-    print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{invalid universal character}}}}')
-  print('')
-  
-  # Everything in this range should be accepted, though it may produce a
-  # warning diagnostic for things like homoglyphs, whitespace, etc.
-  for val in range(0x00A1, 0xD800):
-    print(f'M(\\{Prefix}{val:0{size}X})')
-  print('')
-
-# Print \u tests
-test(4)
-# Print \U tests
-test(8)
-
-# Validate that the \U characters have the same identity as the \u characters
-# within the valid (short) range.
-# This is disabled because enabling the test 1) requires using L because u and
-# U don't exist until C11, 2) is questionable in terms of value because the
-# code points could be different if L isn't using a Unicode encoding, and 3)
-# this addition to the test adds 10x the execution time when running the test.
-#for val in range(0x00A1, 0xD800):
-#  print(f"_Static_assert(L'\\u{val:04X}' == L'\\U{val:08X}', \"\");")
-#print('')

>From 2237fa18f0ef04332b14c7940b6931d7bd02b7ff Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Fri, 5 Apr 2024 13:18:17 -0400
Subject: [PATCH 3/5] Update the test based on offline discussions

Corentin pointed out that UCNs are converted in Phase 5 when they're
not part of an identifier, so I can't use my macro trick to test them.

Also added some tests for edge cases. Still claiming full support as of
Clang 17 for this despite there being some rough edges.
---
 clang/test/C/C99/n717.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/clang/test/C/C99/n717.c b/clang/test/C/C99/n717.c
index 3c9764bf2fb21c..da1c6dedb81bed 100644
--- a/clang/test/C/C99/n717.c
+++ b/clang/test/C/C99/n717.c
@@ -48,15 +48,21 @@ M(\u0060) // expected-error {{character '`' cannot be specified by a universal c
 M(\U00000040) // expected-error {{character '@' cannot be specified by a universal character name}}
 M(\U00000060) // expected-error {{character '`' cannot be specified by a universal character name}}
 
-// This falls outside of the expected range.
-M(\U110000) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+// UCNs outside of identifiers are handled in Phase 5 of translation, so we
+// cannot use the macro expansion to test their behavior.
+
+// This is outside of the range of values specified by ISO 10646.
+int c1 = '\U00110000'; // expected-error {{invalid universal character}}
+// FIXME: this does not fall outside of the range and should work fine. This
+// character constant in C has type 'int' which can hold that value.
+int c2 = '\U0010FFFF'; // expected-error {{character too large for enclosing character literal type}}
 
 // These should always be accepted because they're a valid in a character
 // constant.
-M('\u0024')
-M('\u0040')
-M('\u0060')
+int c3 = '\u0024';
+int c4 = '\u0040';
+int c5 = '\u0060';
 
-M('\U00000024')
-M('\U00000040')
-M('\U00000060')
+int c6 = '\U00000024';
+int c7 = '\U00000040';
+int c8 = '\U00000060';

>From b870898c151334376401e28ffb0a6b79cd94a6b4 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Fri, 5 Apr 2024 13:29:01 -0400
Subject: [PATCH 4/5] Switch from character constant to string constant

This removes the FIXME, which is better handled via filing an issue.
---
 clang/test/C/C99/n717.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/clang/test/C/C99/n717.c b/clang/test/C/C99/n717.c
index da1c6dedb81bed..f13df3fc9cf89f 100644
--- a/clang/test/C/C99/n717.c
+++ b/clang/test/C/C99/n717.c
@@ -52,10 +52,9 @@ M(\U00000060) // expected-error {{character '`' cannot be specified by a univers
 // cannot use the macro expansion to test their behavior.
 
 // This is outside of the range of values specified by ISO 10646.
-int c1 = '\U00110000'; // expected-error {{invalid universal character}}
-// FIXME: this does not fall outside of the range and should work fine. This
-// character constant in C has type 'int' which can hold that value.
-int c2 = '\U0010FFFF'; // expected-error {{character too large for enclosing character literal type}}
+const char *c1 = "\U00110000"; // expected-error {{invalid universal character}}
+// This does not fall outside of the range
+const char *c2 = "\U0010FFFF";
 
 // These should always be accepted because they're a valid in a character
 // constant.

>From 10db0df1f7e0fabc5ab6fe0773f761bcc7230e9d Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Fri, 5 Apr 2024 13:50:00 -0400
Subject: [PATCH 5/5] Add some more tests based on offline discussions

---
 clang/test/C/C99/n717.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/test/C/C99/n717.c b/clang/test/C/C99/n717.c
index f13df3fc9cf89f..25010b41370655 100644
--- a/clang/test/C/C99/n717.c
+++ b/clang/test/C/C99/n717.c
@@ -65,3 +65,11 @@ int c5 = '\u0060';
 int c6 = '\U00000024';
 int c7 = '\U00000040';
 int c8 = '\U00000060';
+
+// Valid lone surrogates.
+M(\uD799)
+const char *c9 = "\U0000E000";
+
+// Invalid lone surrogates, which are excluded explicitly by 6.4.3p2.
+M(\uD800) // expected-error {{invalid universal character}}
+const char *c10  = "\U0000DFFF"; // expected-error {{invalid universal character}}