[clang] [C99] Claim conformance to WG14 N717 (PR #87228)

Mon Apr 1 04:29:34 PDT 2024

https://github.com/AaronBallman created https://github.com/llvm/llvm-project/pull/87228

This was the paper that added Universal Character Names to C.

>From f8e130df9e602662444280346b71a82347ae8a29 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Mon, 1 Apr 2024 07:27:32 -0400
Subject: [PATCH] [C99] Claim conformance to WG14 N717

This was the paper that added Universal Character Names to C.
---
 clang/test/C/C99/n717.c  | 69 ++++++++++++++++++++++++++++++++++++++++
 clang/test/C/C99/n717.py | 39 +++++++++++++++++++++++
 clang/www/c_status.html  |  2 +-
 3 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/C/C99/n717.c
 create mode 100644 clang/test/C/C99/n717.py

diff --git a/clang/test/C/C99/n717.c b/clang/test/C/C99/n717.c
new file mode 100644
index 00000000000000..cc1aa0fd5d53cf
--- /dev/null
+++ b/clang/test/C/C99/n717.c
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -std=c99 %s
+// RUN: %clang_cc1 -verify -std=c99 -fno-dollars-in-identifiers %s
+
+/* WG14 N717: Clang 17
+ * Extended identifiers
+ */
+
+// Used as a sink for UCNs.
+#define M(arg)
+
+// C99 6.4.3p1 specifies the grammar for UCNs. A \u must be followed by exactly
+// four hex digits, and \U must be followed by exactly eight.
+M(\u1)    // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\u12)   // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\u123)  // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\u1234) // Okay
+M(\u12345)// Okay, two tokens (UCN followed by 5)
+
+M(\U1)         // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U12)        // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U123)       // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U1234)      // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} \
+                  expected-note {{did you mean to use '\u'?}}
+M(\U12345)     // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U123456)    // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U1234567)   // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}}
+M(\U12345678)  // Okay
+M(\U123456789) // Okay-ish, two tokens (valid-per-spec-but-actually-invalid UCN followed by 9)
+
+// C99 6.4.3p2:
+// A universal character name shall not specify a character whose short
+// identifier is less than 00A0 other than 0024 ($), 0040 (@), or 0060 (�), nor
+// one in the range D800 through DFFF inclusive.
+//
+// We use a python script to generate the test contents for the large ranges
+// without edge cases.
+// RUN: %python %S/n717.py >%t.inc
+// RUN: %clang_cc1 -verify -std=c99 -Wno-unicode-whitespace -Wno-unicode-homoglyph -Wno-unicode-zero-width -Wno-mathematical-notation-identifier-extension %t.inc
+
+// Now test the ones that should work. Note, these work in C17 and earlier but
+// are part of the basic character set in C23 and thus should be diagnosed in
+// that mode. They're valid in a character constant, but not valid in an
+// identifier, except for U+0024 which is allowed if -fdollars-in-identifiers
+// is enabled.
+// FIXME: These three should be handled the same way, and should be accepted
+// when dollar signs are allowed in identifiers, rather than rejected, see
+// GH87106.
+M(\u0024) // expected-error {{character '$' cannot be specified by a universal character name}}
+M(\U00000024) // expected-error {{character '$' cannot be specified by a universal character name}}
+M($)
+
+// These should always be rejected because they're not valid identifier
+// characters.
+// FIXME: the diagnostic could be improved to make it clear this is an issue
+// with forming an identifier rather than a UCN.
+M(\u0040) // expected-error {{character '@' cannot be specified by a universal character name}}
+M(\u0060) // expected-error {{character '`' cannot be specified by a universal character name}}
+M(\U00000040) // expected-error {{character '@' cannot be specified by a universal character name}}
+M(\U00000060) // expected-error {{character '`' cannot be specified by a universal character name}}
+
+// These should always be accepted because they're a valid in a character
+// constant.
+M('\u0024')
+M('\u0040')
+M('\u0060')
+
+M('\U00000024')
+M('\U00000040')
+M('\U00000060')
diff --git a/clang/test/C/C99/n717.py b/clang/test/C/C99/n717.py
new file mode 100644
index 00000000000000..8c02d336ff6f60
--- /dev/null
+++ b/clang/test/C/C99/n717.py
@@ -0,0 +1,39 @@
+print("#define M(arg)")
+
+def test(size):
+  Prefix = 'U' if size == 8 else 'u'
+  # [0x0000 to 0x00A0) excluding [0x0020, 0x007F)
+  for val in [val for val in range(0x0000, 0x00A0) if val < 0x0020 or val >= 0x007F]:
+     print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{universal character name refers to a control character}}}}')
+  print('')
+  
+  # [0x0020 to 0x007F), excluding 0x0024, 0x0040, and 0x0060
+  for val in [val for val in range(0x0020, 0x007F) if val != 0x0024 and val != 0x0040 and val != 0x0060]:
+     print(f"M(\\{Prefix}{val:0{size}X}) // expected-error {{{{character '{chr(val)}' cannot be specified by a universal character name}}}}")
+  print('')
+  
+  # [0xD800 to 0xDFFF]
+  for val in range(0xD800, 0xDFFF + 1):
+    print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{invalid universal character}}}}')
+  print('')
+  
+  # Everything in this range should be accepted, though it may produce a
+  # warning diagnostic for things like homoglyphs, whitespace, etc.
+  for val in range(0x00A1, 0xD800):
+    print(f'M(\\{Prefix}{val:0{size}X})')
+  print('')
+
+# Print \u tests
+test(4)
+# Print \U tests
+test(8)
+
+# Validate that the \U characters have the same identity as the \u characters
+# within the valid (short) range.
+# This is disabled because enabling the test 1) requires using L because u and
+# U don't exist until C11, 2) is questionable in terms of value because the
+# code points could be different if L isn't using a Unicode encoding, and 3)
+# this addition to the test adds 10x the execution time when running the test.
+#for val in range(0x00A1, 0xD800):
+#  print(f"_Static_assert(L'\\u{val:04X}' == L'\\U{val:08X}', \"\");")
+#print('')
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 028234a8961db2..a14bfa2c1efb3d 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -203,7 +203,7 @@ <h2 id="c99">C99 implementation status</h2>
     <tr>
       <td>extended identifiers</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n717.htm">N717</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="full" align="center">Clang 17</td>
     </tr>
     <tr>
       <td>hexadecimal floating-point constants</td>