[PATCH] D141392: Avoid u8"" literals in tests, their type changes in C++20

Jens Massberg via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 12 01:22:07 PST 2023


This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGd1e0fd2b1326: Avoid u8"" literals in tests, their type changes in C++20 (authored by massberg).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141392/new/

https://reviews.llvm.org/D141392

Files:
  llvm/include/llvm/Support/raw_ostream.h
  llvm/unittests/Support/formatted_raw_ostream_test.cpp


Index: llvm/unittests/Support/formatted_raw_ostream_test.cpp
===================================================================
--- llvm/unittests/Support/formatted_raw_ostream_test.cpp
+++ llvm/unittests/Support/formatted_raw_ostream_test.cpp
@@ -92,34 +92,34 @@
   formatted_raw_ostream C(B);
 
   // U+00A0 Non-breaking space: encoded as two bytes, but only one column wide.
-  C << u8"\u00a0";
+  C << "\xc2\xa0";
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(1U, C.getColumn());
   EXPECT_EQ(2U, C.GetNumBytesInBuffer());
 
   // U+2468 CIRCLED DIGIT NINE: encoded as three bytes, but only one column
   // wide.
-  C << u8"\u2468";
+  C << reinterpret_cast<const char *>(u8"\u2468");
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(2U, C.getColumn());
   EXPECT_EQ(5U, C.GetNumBytesInBuffer());
 
   // U+00010000 LINEAR B SYLLABLE B008 A: encoded as four bytes, but only one
   // column wide.
-  C << u8"\U00010000";
+  C << reinterpret_cast<const char *>(u8"\U00010000");
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(3U, C.getColumn());
   EXPECT_EQ(9U, C.GetNumBytesInBuffer());
 
   // U+55B5, CJK character, encodes as three bytes, takes up two columns.
-  C << u8"\u55b5";
+  C << reinterpret_cast<const char *>(u8"\u55b5");
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(5U, C.getColumn());
   EXPECT_EQ(12U, C.GetNumBytesInBuffer());
 
   // U+200B, zero-width space, encoded as three bytes but has no effect on the
   // column or line number.
-  C << u8"\u200b";
+  C << reinterpret_cast<const char *>(u8"\u200b");
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(5U, C.getColumn());
   EXPECT_EQ(15U, C.GetNumBytesInBuffer());
@@ -137,7 +137,7 @@
   // the remaining two bytes are written, at which point we can check the
   // display width. In this case the display width is 1, so we end at column 4,
   // with 6 bytes written into total, 2 of which are in the buffer.
-  C << u8"123\u2468";
+  C << reinterpret_cast<const char *>(u8"123\u2468");
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(4U, C.getColumn());
   EXPECT_EQ(2U, C.GetNumBytesInBuffer());
@@ -145,7 +145,7 @@
   EXPECT_EQ(6U, A.size());
 
   // Same as above, but with a CJK character which displays as two columns.
-  C << u8"123\u55b5";
+  C << reinterpret_cast<const char *>(u8"123\u55b5");
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(9U, C.getColumn());
   EXPECT_EQ(2U, C.GetNumBytesInBuffer());
@@ -161,7 +161,7 @@
 
   // The stream has a one-byte buffer, so it gets flushed multiple times while
   // printing a single Unicode character.
-  C << u8"\u2468";
+  C << "\xe2\x91\xa8";
   EXPECT_EQ(0U, C.getLine());
   EXPECT_EQ(1U, C.getColumn());
   EXPECT_EQ(0U, C.GetNumBytesInBuffer());
Index: llvm/include/llvm/Support/raw_ostream.h
===================================================================
--- llvm/include/llvm/Support/raw_ostream.h
+++ llvm/include/llvm/Support/raw_ostream.h
@@ -224,6 +224,20 @@
     return *this;
   }
 
+#if defined(__cpp_char8_t)
+  // When streaming a `char8_t *`, integers or pointers are written to the
+  // ostream instead of UTF-8 text as one might expect. This might lead to
+  // unexpected behavior, especially as `u8""` literals are arrays of
+  // `const char8_t` instead of `const char` from C++20 onwards. Thus we
+  // disallow using them with raw_ostreams.
+  // If you have u8"" literals to stream, you can rewrite them as ordinary
+  // literals with escape sequences
+  // e.g.  replace `u8"\u00a0"` by `"\xc2\xa0"`
+  // or use `reinterpret_cast`:
+  // e.g. replace `u8"\u00a0"` by `reinterpret_cast<const char *>(u8"\u00a0")`
+  raw_ostream &operator<<(const char8_t *Str) = delete;
+#endif
+
   raw_ostream &operator<<(const char *Str) {
     // Inline fast path, particularly for constant strings where a sufficiently
     // smart compiler will simplify strlen.


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D141392.488528.patch
Type: text/x-patch
Size: 3791 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20230112/dcf20526/attachment.bin>


More information about the llvm-commits mailing list