[clang] b91073d - [clang][preprocessor] Fix unsigned-ness of utf8 char literals

Timm Bäder via cfe-commits cfe-commits at lists.llvm.org
Thu May 12 23:05:13 PDT 2022


Author: Timm Bäder
Date: 2022-05-13T07:57:10+02:00
New Revision: b91073db6ac3b9abefcf6211ea755e55e5879991

URL: https://github.com/llvm/llvm-project/commit/b91073db6ac3b9abefcf6211ea755e55e5879991
DIFF: https://github.com/llvm/llvm-project/commit/b91073db6ac3b9abefcf6211ea755e55e5879991.diff

LOG: [clang][preprocessor] Fix unsigned-ness of utf8 char literals

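UTF-8 character literals are always unsigned in C, and in C++ when char8_t is enabled (-fchar8_t, the default in C++20); otherwise they follow the signedness of char. The preprocessor now evaluates them accordingly.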

Fixes https://github.com/llvm/llvm-project/issues/54886

Differential Revision: https://reviews.llvm.org/D124996

Added: 
    

Modified: 
    clang/docs/ReleaseNotes.rst
    clang/lib/Lex/PPExpressions.cpp
    clang/test/Lexer/utf8-char-literal.cpp

Removed: 
    


################################################################################
diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7c25456adb25..0b0aba080e12 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -347,6 +347,8 @@ C++ Language Changes in Clang
   template parameter, to conform to the Itanium C++ ABI and be compatible with
   GCC. This breaks binary compatibility with code compiled with earlier versions
   of clang; use the ``-fclang-abi-compat=14`` option to get the old mangling.
+- Preprocessor character literals with a ``u8`` prefix are now correctly treated as
+  unsigned character literals. This fixes `Issue 54886 <https://github.com/llvm/llvm-project/issues/54886>`_.
 
 C++20 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^

diff  --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp
index 3c33369ed5f2..bd35689f18e7 100644
--- a/clang/lib/Lex/PPExpressions.cpp
+++ b/clang/lib/Lex/PPExpressions.cpp
@@ -408,9 +408,18 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
     // Set the value.
     Val = Literal.getValue();
     // Set the signedness. UTF-16 and UTF-32 are always unsigned
+    // UTF-8 is always unsigned in C, and in C++ if -fchar8_t is specified.
     if (Literal.isWide())
       Val.setIsUnsigned(!TargetInfo::isTypeSigned(TI.getWCharType()));
-    else if (!Literal.isUTF16() && !Literal.isUTF32())
+    else if (Literal.isUTF16() || Literal.isUTF32())
+      Val.setIsUnsigned(true);
+    else if (Literal.isUTF8()) {
+      if (PP.getLangOpts().CPlusPlus)
+        Val.setIsUnsigned(
+            PP.getLangOpts().Char8 ? true : !PP.getLangOpts().CharIsSigned);
+      else
+        Val.setIsUnsigned(true);
+    } else
       Val.setIsUnsigned(!PP.getLangOpts().CharIsSigned);
 
     if (Result.Val.getBitWidth() > Val.getBitWidth()) {

diff  --git a/clang/test/Lexer/utf8-char-literal.cpp b/clang/test/Lexer/utf8-char-literal.cpp
index ababc8b9c21b..70df447e2a33 100644
--- a/clang/test/Lexer/utf8-char-literal.cpp
+++ b/clang/test/Lexer/utf8-char-literal.cpp
@@ -1,7 +1,10 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -fsyntax-only -verify %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -fsyntax-only -verify %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c2x -x c -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++1z -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++17 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++17 -fsyntax-only -fchar8_t -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++20 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++20 -fsyntax-only -fno-char8_t -verify %s
 
 int array0[u'ñ' == u'\xf1'? 1 : -1];
 int array1['\xF1' !=  u'\xf1'? 1 : -1];
@@ -13,7 +16,7 @@ char c = u8'\u0080'; // expected-error {{character too large for enclosing chara
 char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}}
 char e = u8'ሴ'; // expected-error {{character too large for enclosing character literal type}}
 char f = u8'ab'; // expected-error {{Unicode character literals may not contain multiple characters}}
-#elif __STDC_VERSION__ > 202000L
+#elif __STDC_VERSION__ >= 202000L
 char a = u8'ñ';      // expected-error {{character too large for enclosing character literal type}}
 char b = u8'\x80';   // ok
 char c = u8'\u0080'; // expected-error {{universal character name refers to a control character}}
@@ -26,3 +29,29 @@ _Static_assert(
              unsigned char : 1),
     "Surprise!");
 #endif
+
+
+// UTF-8 character literals are enabled in C++17 and later. If `-fchar8_t` is not enabled
+// (as is the case in C++17), then UTF-8 character literals may produce signed or
+// unsigned values depending on whether char is a signed type. If `-fchar8_t` is enabled
+// (which is the default behavior for C++20), then UTF-8 character literals always
+// produce unsigned values. The tests below depend on the target having a signed
+// 8-bit char so that '\xff' produces a negative value.
+#if __cplusplus >= 201703L
+#  if !defined(__cpp_char8_t)
+#    if !(u8'\xff' == '\xff')
+#      error UTF-8 character value did not match ordinary character literal; this is unexpected
+#    endif
+#  else
+#    if u8'\xff' == '\xff' // expected-warning {{right side of operator converted from negative value to unsigned}}
+#      error UTF-8 character value matched ordinary character literal; this is unexpected
+#    endif
+#  endif
+#endif
+
+/// In C2x, u8 char literals are always unsigned.
+#if __STDC_VERSION__ >= 202000L
+#  if u8'\xff' == '\xff' // expected-warning {{right side of operator converted from negative value to unsigned}}
+#    error u8 char literal is not unsigned
+#  endif
+#endif


        


More information about the cfe-commits mailing list