[flang-commits] [flang] [flang] Support \u Unicode escape sequences (PR #76757)

Wed Jan 3 15:18:35 PST 2024

https://github.com/klausler updated https://github.com/llvm/llvm-project/pull/76757

>From 83f9711a76975a068e82e1f74ba0814e2450335e Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler at nvidia.com>
Date: Tue, 2 Jan 2024 14:13:04 -0800
Subject: [PATCH] [flang] Support \u Unicode escape sequences

Support \uNNNN and \uNNNNNNNN escape sequences for
CHARACTER(KIND=2) and CHARACTER(KIND=4) literal constants
for better GNU Fortran compatibility.

Fixes llvm-test-suite/Fortran/gfortran/regression/achar_6.F90
and .../widechar_1.f90.
---
 flang/docs/Extensions.md                |  3 ++-
 flang/include/flang/Parser/characters.h | 17 +++++++++++++++++
 flang/lib/Evaluate/character.h          | 13 +++----------
 flang/lib/Parser/characters.cpp         | 25 ++++++++++++++++++++++++-
 flang/runtime/edit-input.cpp            | 14 ++++++++++++--
 flang/test/Semantics/modfile60.f90      | 19 +++++++++++++++++++
 6 files changed, 77 insertions(+), 14 deletions(-)
 create mode 100644 flang/test/Semantics/modfile60.f90

diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 16eb67f2e27c81..02ccd51dcb686a 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -322,7 +322,8 @@ end
 ### Extensions supported when enabled by options
 
 * C-style backslash escape sequences in quoted CHARACTER literals
-  (but not Hollerith) [-fbackslash]
+  (but not Hollerith) [-fbackslash], including Unicode escapes
+  with `\U`.
 * Logical abbreviations `.T.`, `.F.`, `.N.`, `.A.`, `.O.`, and `.X.`
   [-flogical-abbreviations]
 * `.XOR.` as a synonym for `.NEQV.` [-fxor-operator]
diff --git a/flang/include/flang/Parser/characters.h b/flang/include/flang/Parser/characters.h
index b3b82a4f0b9f7f..dae0d3e2a0cfff 100644
--- a/flang/include/flang/Parser/characters.h
+++ b/flang/include/flang/Parser/characters.h
@@ -237,6 +237,23 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
   }};
   if (ch <= 0x7f) {
     emitOneByte(ch);
+  } else if (useHexadecimalEscapeSequences) {
+    insert('\\');
+    insert('u');
+    if (ch > 0xffff) {
+      unsigned c1{(ch >> 28) & 0xf}, c2{(ch >> 24) & 0xf}, c3{(ch >> 20) & 0xf},
+          c4{(ch >> 16) & 0xf};
+      insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
+      insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
+      insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
+      insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
+    }
+    unsigned c1{(ch >> 12) & 0xf}, c2{(ch >> 8) & 0xf}, c3{(ch >> 4) & 0xf},
+        c4{ch & 0xf};
+    insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
+    insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
+    insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
+    insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
   } else {
     EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
     for (int j{0}; j < encoded.bytes; ++j) {
diff --git a/flang/lib/Evaluate/character.h b/flang/lib/Evaluate/character.h
index ca24dd8e9413d2..2d6747741161bf 100644
--- a/flang/lib/Evaluate/character.h
+++ b/flang/lib/Evaluate/character.h
@@ -13,9 +13,7 @@
 #include <string>
 
 // Provides implementations of intrinsic functions operating on character
-// scalars. No assumption is made regarding character encodings other than they
-// must be compatible with ASCII (else, NEW_LINE, ACHAR and IACHAR need to be
-// adapted).
+// scalars.
 
 namespace Fortran::evaluate {
 
@@ -34,13 +32,8 @@ template <int KIND> class CharacterUtils {
   // contain ASCII
   static std::int64_t ICHAR(const Character &c) {
     CHECK(c.length() == 1);
-    if constexpr (std::is_same_v<CharT, char>) {
-      // char may be signed, so cast it first to unsigned to avoid having
-      // ichar(char(128_4)) returning -128
-      return static_cast<unsigned char>(c[0]);
-    } else {
-      return c[0];
-    }
+    // Convert first to an unsigned integer type to avoid sign extension
+    return static_cast<common::HostUnsignedIntType<(8 * KIND)>>(c[0]);
   }
 
   static Character NEW_LINE() { return Character{{NewLine()}}; }
diff --git a/flang/lib/Parser/characters.cpp b/flang/lib/Parser/characters.cpp
index dce20a4e5fe478..f6ac777ea874ca 100644
--- a/flang/lib/Parser/characters.cpp
+++ b/flang/lib/Parser/characters.cpp
@@ -235,7 +235,30 @@ template <Encoding ENCODING>
 DecodedCharacter DecodeCharacter(
     const char *cp, std::size_t bytes, bool backslashEscapes) {
   if (backslashEscapes && bytes >= 2 && *cp == '\\') {
-    return DecodeEscapedCharacters<ENCODING>(cp, bytes);
+    if (ENCODING == Encoding::UTF_8 && bytes >= 6 &&
+        ToLowerCaseLetter(cp[1]) == 'u' && IsHexadecimalDigit(cp[2]) &&
+        IsHexadecimalDigit(cp[3]) && IsHexadecimalDigit(cp[4]) &&
+        IsHexadecimalDigit(cp[5])) {
+      char32_t ch{
+          static_cast<char32_t>(4096 * HexadecimalDigitValue(cp[2]) +
+              256 * HexadecimalDigitValue(cp[3]) +
+              16 * HexadecimalDigitValue(cp[4]) + HexadecimalDigitValue(cp[5])),
+      };
+      if (bytes >= 10 && IsHexadecimalDigit(cp[6]) &&
+          IsHexadecimalDigit(cp[7]) && IsHexadecimalDigit(cp[8]) &&
+          IsHexadecimalDigit(cp[9])) {
+        return {(ch << 16) |
+                (4096 * HexadecimalDigitValue(cp[6]) +
+                    256 * HexadecimalDigitValue(cp[7]) +
+                    16 * HexadecimalDigitValue(cp[8]) +
+                    HexadecimalDigitValue(cp[9])),
+            10};
+      } else {
+        return {ch, 6};
+      }
+    } else {
+      return DecodeEscapedCharacters<ENCODING>(cp, bytes);
+    }
   } else {
     return DecodeRawCharacter<ENCODING>(cp, bytes);
   }
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index 6d4fa588cbf60d..71e7f4edbd0e19 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -976,7 +976,12 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x,
       if (skipping) {
         --skipChars;
       } else if (auto ucs{DecodeUTF8(input)}) {
-        *x++ = *ucs;
+        if ((sizeof *x == 1 && *ucs > 0xff) ||
+            (sizeof *x == 2 && *ucs > 0xffff)) {
+          *x++ = '?';
+        } else {
+          *x++ = *ucs;
+        }
         --lengthChars;
       } else if (chunkBytes == 0) {
         // error recovery: skip bad encoding
@@ -990,7 +995,12 @@ bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, CHAR *x,
       } else {
         char32_t buffer{0};
         std::memcpy(&buffer, input, chunkBytes);
-        *x++ = buffer;
+        if ((sizeof *x == 1 && buffer > 0xff) ||
+            (sizeof *x == 2 && buffer > 0xffff)) {
+          *x++ = '?';
+        } else {
+          *x++ = buffer;
+        }
         --lengthChars;
       }
     } else if constexpr (sizeof *x > 1) {
diff --git a/flang/test/Semantics/modfile60.f90 b/flang/test/Semantics/modfile60.f90
new file mode 100644
index 00000000000000..fdb0f8930fe063
--- /dev/null
+++ b/flang/test/Semantics/modfile60.f90
@@ -0,0 +1,19 @@
+! RUN: %python %S/test_modfile.py %s %flang_fc1 -fbackslash
+! Test Unicode escape sequences
+module m
+  integer, parameter :: wide = 4
+  character(kind=wide, len=20), parameter :: ch = wide_"\u1234 \u56789abc"
+  integer, parameter :: check(2) = [ iachar(ch(1:1)), iachar(ch(3:3)) ]
+  logical, parameter :: valid = all(check == [int(z'1234'), int(z'56789abc')])
+end
+
+!Expect: m.mod
+!module m
+!integer(4),parameter::wide=4_4
+!character(20_4,4),parameter::ch=4_"\341\210\264 \375\226\236\211\252\274                 "
+!integer(4),parameter::check(1_8:2_8)=[INTEGER(4)::4660_4,1450744508_4]
+!intrinsic::iachar
+!logical(4),parameter::valid=.true._4
+!intrinsic::all
+!intrinsic::int
+!end