[flang-commits] [flang] fdd989d - [flang] Fix UTF-8 minimality checks (#159142)

via flang-commits flang-commits at lists.llvm.org
Wed Sep 17 09:16:59 PDT 2025


Author: Peter Klausler
Date: 2025-09-17T09:16:55-07:00
New Revision: fdd989dac0f2bf06501dd115917f403cbfb9dc35

URL: https://github.com/llvm/llvm-project/commit/fdd989dac0f2bf06501dd115917f403cbfb9dc35
DIFF: https://github.com/llvm/llvm-project/commit/fdd989dac0f2bf06501dd115917f403cbfb9dc35.diff

LOG: [flang] Fix UTF-8 minimality checks (#159142)

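UTF-8 encodings are required to be minimal, but the checks for
minimality of 3-byte and 4-byte sequences were incorrect: they rejected
every sequence whose lead byte was 0xE0 or 0xF0, even when the second
byte made the encoding minimal (e.g. F0 9F 8D 8C for U+1F34C). Fix by
also examining the second byte when the lead byte is 0xE0 or 0xF0.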

Added: 
    flang/test/Parser/utf8-01.f90

Modified: 
    flang/lib/Parser/characters.cpp

Removed: 
    


################################################################################
diff  --git a/flang/lib/Parser/characters.cpp b/flang/lib/Parser/characters.cpp
index 1a00b16eefe9d..69b6d2ed5fafb 100644
--- a/flang/lib/Parser/characters.cpp
+++ b/flang/lib/Parser/characters.cpp
@@ -158,21 +158,24 @@ DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
     const char *cp, std::size_t bytes) {
   auto p{reinterpret_cast<const std::uint8_t *>(cp)};
   char32_t ch{*p};
-  if (ch <= 0x7f) {
+  // Valid UTF-8 encodings must be minimal.
+  if (ch <= 0x7f) { // 1 byte: 7 bits of payload
     return {ch, 1};
-  } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
-      ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
+  } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 &&
+      ((p[1] | p[2] | p[3]) & 0xc0) == 0x80 && (ch > 0xf0 || p[1] > 0x8f)) {
+    // 4 bytes: 3+6+6+6=21 bits of payload
     ch = ((ch & 7) << 6) | (p[1] & 0x3f);
     ch = (ch << 6) | (p[2] & 0x3f);
     ch = (ch << 6) | (p[3] & 0x3f);
     return {ch, 4};
-  } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
-      ((p[1] | p[2]) & 0xc0) == 0x80) {
+  } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 &&
+      ((p[1] | p[2]) & 0xc0) == 0x80 && (ch > 0xe0 || p[1] > 0x9f)) {
+    // 3 bytes: 4+6+6=16 bits of payload
     ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
     ch = (ch << 6) | (p[2] & 0x3f);
     return {ch, 3};
   } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
-      (p[1] & 0xc0) == 0x80) {
+      (p[1] & 0xc0) == 0x80) { // 2 bytes: 5+6=11 bits of payload
     ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
     return {ch, 2};
   } else {

diff  --git a/flang/test/Parser/utf8-01.f90 b/flang/test/Parser/utf8-01.f90
new file mode 100644
index 0000000000000..3a3745524d807
--- /dev/null
+++ b/flang/test/Parser/utf8-01.f90
@@ -0,0 +1,15 @@
+!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+
+character(kind=4), parameter :: c(2) = [character(kind=4) :: &
+4_'🍌', 4_'水' ]
+print *, '🍌'
+print *, 4_'🍌'
+print *, '水'
+print *, 4_'水'
+end
+
+!CHECK: CHARACTER(KIND=4_4), PARAMETER :: c(2_4) = [CHARACTER(KIND=4,LEN=1)::4_"\360\237\215\214",4_"\346\260\264"]
+!CHECK: PRINT *, "\360\237\215\214"
+!CHECK: PRINT *, 4_"\360\237\215\214"
+!CHECK: PRINT *, "\346\260\264"
+!CHECK: PRINT *, 4_"\346\260\264"


        


More information about the flang-commits mailing list