[libcxx-commits] [libcxx] 20341c3 - [libc++][format] Adds a UTF transcoder.

Mark de Wever via libcxx-commits libcxx-commits at lists.llvm.org
Tue Jul 11 11:28:26 PDT 2023


Author: Mark de Wever
Date: 2023-07-11T20:28:19+02:00
New Revision: 20341c3ad6f64a2a61d0e38d0cdafd356a5b6cbb

URL: https://github.com/llvm/llvm-project/commit/20341c3ad6f64a2a61d0e38d0cdafd356a5b6cbb
DIFF: https://github.com/llvm/llvm-project/commit/20341c3ad6f64a2a61d0e38d0cdafd356a5b6cbb.diff

LOG: [libc++][format] Adds a UTF transcoder.

This is a preparation for

  P2093R14 Formatted output

When the output of print is to the terminal it needs to use the native
API. This means transcoding UTF-8 to UTF-16 on Windows. The encoder's
interface is modeled after

 P2728 Unicode in the Library, Part 1: UTF Transcoding

But only the required part for P2093R14 is implemented.

On Windows wchar_t is 16 bits. In order to test on platforms where
wchar_t is 32 bits, the transcoder has support for char16_t. It also
adds a UTF-8 to UTF-32 encoder, which is useful for other tests.

Note it is possible to use <codecvt> for transcoding, but that header is
deprecated. It is better to write new code that is not deprecated; the
hard part, decoding, has already been done. The <codecvt> header also
requires locale support, while the new code works without including
<locale>.

Note the current transcoder implementation can be optimized since it
basically does UTF-8 -> UTF-32 -> UTF-16. The first goal is to have a
working implementation. Since it's not part of the ABI it's possible to
do the optimization later.

Depends on D149672

Reviewed By: ldionne, tahonermann, #libc

Differential Revision: https://reviews.llvm.org/D150031

Added: 
    libcxx/include/print
    libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp

Modified: 
    libcxx/include/CMakeLists.txt
    libcxx/include/module.modulemap.in
    libcxx/modules/std/print.cppm
    libcxx/test/libcxx/transitive_includes/cxx03.csv
    libcxx/test/libcxx/transitive_includes/cxx11.csv
    libcxx/test/libcxx/transitive_includes/cxx14.csv
    libcxx/test/libcxx/transitive_includes/cxx17.csv
    libcxx/test/libcxx/transitive_includes/cxx20.csv
    libcxx/test/libcxx/transitive_includes/cxx23.csv
    libcxx/test/libcxx/transitive_includes/cxx26.csv
    libcxx/utils/ci/run-buildbot

Removed: 
    


################################################################################
diff  --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 8d2cfc8f61ef87..29f86d96f678f7 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -956,6 +956,7 @@ set(files
   numeric
   optional
   ostream
+  print
   queue
   random
   ranges

diff  --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 62e391df5dc2eb..1992f36884e8cc 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -1368,6 +1368,10 @@ module std [system] {
     // FIXME: should re-export ios, streambuf?
     export *
   }
+  module print {
+    header "print"
+    export *
+  }
   module queue {
     header "queue"
     export *

diff  --git a/libcxx/include/print b/libcxx/include/print
new file mode 100644
index 00000000000000..6fcd53c5db65a6
--- /dev/null
+++ b/libcxx/include/print
@@ -0,0 +1,119 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_PRINT
+#define _LIBCPP_PRINT
+
+#include <__assert> // all public C++ headers provide the assertion handler
+#include <__concepts/same_as.h>
+#include <__config>
+#include <__format/unicode.h>
+#include <string_view>
+#include <version>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 23
+
+#  ifndef _LIBCPP_HAS_NO_UNICODE
+// This is the code to transcode UTF-8 to UTF-16. This is used on
+// Windows for the native Unicode API. The code is modeled to make it
+// easier to extend to
+//
+//  P2728R0 Unicode in the Library, Part 1: UTF Transcoding
+//
+// This paper is still under heavy development so it makes no sense yet
+// to strictly follow the paper.
+namespace __unicode {
+
+// The names of these concepts are modelled after P2728R0, but the
+// implementation is not. char16_t may contain 32-bits so depending on the
+// number of bits is an issue.
+#    ifdef _LIBCPP_SHORT_WCHAR
+template <class _Tp>
+concept __utf16_code_unit =
+    same_as<_Tp, char16_t>
+#      ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+    || same_as<_Tp, wchar_t>
+#      endif
+    ;
+template <class _Tp>
+concept __utf32_code_unit = same_as<_Tp, char32_t>;
+#    else // _LIBCPP_SHORT_WCHAR
+template <class _Tp>
+concept __utf16_code_unit = same_as<_Tp, char16_t>;
+template <class _Tp>
+concept __utf32_code_unit =
+    same_as<_Tp, char32_t>
+#      ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+    || same_as<_Tp, wchar_t>
+#      endif
+    ;
+#    endif // _LIBCPP_SHORT_WCHAR
+
+// Pass by reference since an output_iterator may not be copyable.
+template <class _OutIt>
+_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt&, char32_t) = delete;
+
+template <class _OutIt>
+  requires __utf16_code_unit<iter_value_t<_OutIt>>
+_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
+  _LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-16");
+
+  if (__value < 0x10000) {
+    *__out_it++ = __value;
+    return;
+  }
+
+  __value -= 0x10000;
+  *__out_it++ = 0xd800 + (__value >> 10);
+  *__out_it++ = 0xdc00 + (__value & 0x3FF);
+}
+
+template <class _OutIt>
+  requires __utf32_code_unit<iter_value_t<_OutIt>>
+_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
+  _LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-32");
+  *__out_it++ = __value;
+}
+
+template <class _OutIt, input_iterator _InIt>
+  requires output_iterator<_OutIt, const iter_value_t<_OutIt>&> && (!same_as<iter_value_t<_OutIt>, iter_value_t<_InIt>>)
+_LIBCPP_HIDE_FROM_ABI constexpr _OutIt __transcode(_InIt __first, _InIt __last, _OutIt __out_it) {
+  // The __code_point_view has a basic_string_view interface.
+  // When transcoding becomes part of the standard we probably want to
+  // look at smarter algorithms.
+  // For example, when processing a code point that is encoded in
+  // 1 to 3 code units in UTF-8, the result will always be encoded
+  // in 1 code unit in UTF-16 (code points that require 4 code
+  // units in UTF-8 will require 2 code units in UTF-16).
+  //
+  // Note if P2728 is accepted types like int may become valid. In that case
+  // the __code_point_view should use a span. Libc++ will remove support for
+  // char_traits<int>.
+  basic_string_view<iter_value_t<_InIt>> __data{__first, __last};
+  __code_point_view<iter_value_t<_InIt>> __view{__data.begin(), __data.end()};
+  while (!__view.__at_end())
+    __unicode::__encode(__out_it, __view.__consume().__code_point);
+  return __out_it;
+}
+
+} // namespace __unicode
+
+#  endif //  _LIBCPP_HAS_NO_UNICODE
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_PRINT

diff  --git a/libcxx/modules/std/print.cppm b/libcxx/modules/std/print.cppm
index 8ec495a7434e22..21de1cd7019be5 100644
--- a/libcxx/modules/std/print.cppm
+++ b/libcxx/modules/std/print.cppm
@@ -8,10 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 module;
-#if __has_include(<print>)
-#  error "include this header unconditionally and uncomment the exported symbols"
-#  include <print>
-#endif
+#include <print>
 
 export module std:print;
 export namespace std {

diff  --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
new file mode 100644
index 00000000000000..cde66ef7bcd90d
--- /dev/null
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
@@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
+
+// <print>
+
+// Tests the UTF-8 to UTF-16/32 encoding.
+// UTF-16 is used on Windows to write to the Unicode API.
+// UTF-32 is used to test the Windows behaviour on Linux using 32-bit wchar_t.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <print>
+#include <string_view>
+
+#include "test_macros.h"
+#include "make_string.h"
+
+#define SV(S) MAKE_STRING_VIEW(CharT, S)
+
+template <class CharT>
+constexpr void test(std::basic_string_view<CharT> expected, std::string_view input) {
+  assert(expected.size() < 1024);
+  std::array<CharT, 1024> buffer;
+  std::ranges::fill(buffer, CharT('*'));
+
+  CharT* out = std::__unicode::__transcode(input.begin(), input.end(), buffer.data());
+
+  assert(std::basic_string_view<CharT>(buffer.data(), out) == expected);
+
+  out = std::find_if(out, buffer.end(), [](CharT c) { return c != CharT('*'); });
+  assert(out == buffer.end());
+}
+
+template <class CharT>
+constexpr void test() {
+  // *** Test valid UTF-8 ***
+#define TEST(S) test(SV(S), S)
+  TEST("hello world");
+  // copied from benchmarks/std_format_spec_string_unicode.bench.cpp
+  TEST("Lorem ipsum dolor sit amet, ne sensibus evertitur aliquando his. Iuvaret fabulas qui ex.");
+  TEST("Lōrem ipsūm dolor sīt æmeÞ, ea vel nostrud feuġǣit, muciūs tēmporiȝusrefērrēnÞur no mel.");
+  TEST("Лорем ипсум долор сит амет, еу диам тамяуам принципес вис, еяуидем цонцептам диспутандо");
+  TEST("入ト年媛ろ舗学ラロ準募ケカ社金スノ屋検れう策他セヲシ引口ぎ集7独ぱクふ出車ぽでぱ円輪ルノ受打わ。");
+  TEST("\U0001f636\u200d\U0001f32b\ufe0f");
+#undef TEST
+
+  // *** Test invalid UTF-8 ***
+  test(SV("\ufffd"), "\xc3");
+  test(SV("\ufffd("), "\xc3\x28");
+
+  // Surrogate range
+  test(SV("\ufffd"), "\xed\xa0\x80"); // U+D800
+  test(SV("\ufffd"), "\xed\xaf\xbf"); // U+DBFF
+  test(SV("\ufffd"), "\xed\xbf\x80"); // U+DC00
+  test(SV("\ufffd"), "\xed\xbf\xbf"); // U+DFFF
+
+  // Beyond valid values
+  test(SV("\ufffd"), "\xf4\x90\x80\x80"); // U+110000
+  test(SV("\ufffd"), "\xf4\xbf\xbf\xbf"); // U+11FFFF
+
+  // Validates http://unicode.org/review/pr-121.html option 3.
+  test(SV("\u0061\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0062"), "\x61\xF1\x80\x80\xE1\x80\xC2\x62");
+}
+
+constexpr bool test() {
+  test<char16_t>();
+  test<char32_t>();
+#if !defined(TEST_HAS_NO_WIDE_CHARACTERS)
+  test<wchar_t>();
+#endif
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index 90c0885f64982f..a833d81498aa52 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -635,6 +635,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index d122ee7edd4230..9b6dab02c40877 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -636,6 +636,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index 0ca9ff6aa1d8fa..f80e0f3946dc26 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -638,6 +638,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index 0ca9ff6aa1d8fa..f80e0f3946dc26 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -638,6 +638,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index a1b9ccc99b74e4..4cfa114b8ecf23 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -644,6 +644,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 5d68887113e895..0509207f621282 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -453,6 +453,12 @@ ostream streambuf
 ostream string
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue cstddef
 queue cstdint

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 5d68887113e895..0509207f621282 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -453,6 +453,12 @@ ostream streambuf
 ostream string
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue cstddef
 queue cstdint

diff  --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index 9d8c65f2a10f61..e1b7809df55cc4 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -247,6 +247,7 @@ check-generated-output)
            --exclude 'locale-specific_form.pass.cpp' \
            --exclude 'ostream.pass.cpp' \
            --exclude 'std_format_spec_string_unicode.bench.cpp' \
+           --exclude 'transcoding.pass.cpp' \
            --exclude 'underflow.pass.cpp' \
            || false
 


        


More information about the libcxx-commits mailing list