[flang-commits] [flang] [flang] Fix UTF-8 minimality checks (PR #159142)

Peter Klausler via flang-commits flang-commits at lists.llvm.org
Tue Sep 16 10:53:52 PDT 2025


https://github.com/klausler created https://github.com/llvm/llvm-project/pull/159142

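UTF-8 encodings are required to be minimal, but the checks for minimality of 3-byte and 4-byte sequences were incorrect: they rejected every sequence whose lead byte was 0xE0 or 0xF0, even though such a sequence is overlong only when its second byte is no greater than 0x9F or 0x8F, respectively. This patch fixes those checks and adds a regression test.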

>From 11d2a9b7d3df66782d373c1446f945cebfb2ff74 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler at nvidia.com>
Date: Tue, 16 Sep 2025 10:51:07 -0700
Subject: [PATCH] [flang] Fix UTF-8 minimality checks

UTF-8 encodings are required to be minimal, but the checks for
minimality of 3-byte and 4-byte sequences were incorrect.  Fix.
---
 flang/lib/Parser/characters.cpp | 15 +++++++++------
 flang/test/Parser/utf8-01.f90   | 15 +++++++++++++++
 2 files changed, 24 insertions(+), 6 deletions(-)
 create mode 100644 flang/test/Parser/utf8-01.f90

diff --git a/flang/lib/Parser/characters.cpp b/flang/lib/Parser/characters.cpp
index 1a00b16eefe9d..69b6d2ed5fafb 100644
--- a/flang/lib/Parser/characters.cpp
+++ b/flang/lib/Parser/characters.cpp
@@ -158,21 +158,24 @@ DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
     const char *cp, std::size_t bytes) {
   auto p{reinterpret_cast<const std::uint8_t *>(cp)};
   char32_t ch{*p};
-  if (ch <= 0x7f) {
+  // Valid UTF-8 encodings must be minimal.
+  if (ch <= 0x7f) { // 1 byte: 7 bits of payload
     return {ch, 1};
-  } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
-      ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
+  } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 &&
+      ((p[1] | p[2] | p[3]) & 0xc0) == 0x80 && (ch > 0xf0 || p[1] > 0x8f)) {
+    // 4 bytes: 3+6+6+6=21 bits of payload
     ch = ((ch & 7) << 6) | (p[1] & 0x3f);
     ch = (ch << 6) | (p[2] & 0x3f);
     ch = (ch << 6) | (p[3] & 0x3f);
     return {ch, 4};
-  } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
-      ((p[1] | p[2]) & 0xc0) == 0x80) {
+  } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 &&
+      ((p[1] | p[2]) & 0xc0) == 0x80 && (ch > 0xe0 || p[1] > 0x9f)) {
+    // 3 bytes: 4+6+6=16 bits of payload
     ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
     ch = (ch << 6) | (p[2] & 0x3f);
     return {ch, 3};
   } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
-      (p[1] & 0xc0) == 0x80) {
+      (p[1] & 0xc0) == 0x80) { // 2 bytes: 5+6=11 bits of payload
     ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
     return {ch, 2};
   } else {
diff --git a/flang/test/Parser/utf8-01.f90 b/flang/test/Parser/utf8-01.f90
new file mode 100644
index 0000000000000..3a3745524d807
--- /dev/null
+++ b/flang/test/Parser/utf8-01.f90
@@ -0,0 +1,15 @@
+!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+
+character(kind=4), parameter :: c(2) = [character(kind=4) :: &
+4_'🍌', 4_'水' ]
+print *, '🍌'
+print *, 4_'🍌'
+print *, '水'
+print *, 4_'水'
+end
+
+!CHECK: CHARACTER(KIND=4_4), PARAMETER :: c(2_4) = [CHARACTER(KIND=4,LEN=1)::4_"\360\237\215\214",4_"\346\260\264"]
+!CHECK: PRINT *, "\360\237\215\214"
+!CHECK: PRINT *, 4_"\360\237\215\214"
+!CHECK: PRINT *, "\346\260\264"
+!CHECK: PRINT *, 4_"\346\260\264"



More information about the flang-commits mailing list