[libcxx-commits] [libcxx] [libc++] Mostly Implement P1885R12: `<text_encoding>` (PR #141312)
William Tran-Viet via libcxx-commits
libcxx-commits at lists.llvm.org
Wed Mar 25 20:18:25 PDT 2026
================
@@ -0,0 +1,313 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <__config>
+#include <locale>
+#include <text_encoding>
+
+#if defined(_LIBCPP_WIN32API)
+# include <__algorithm/max.h>
+# include <cwchar>
+# include <windows.h>
+#else
+# include <__locale_dir/locale_base_api.h>
+# include <__utility/scope_guard.h>
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if defined(_LIBCPP_WIN32API)
+// Translates a Windows code page identifier into the matching
+// `text_encoding::id`.
+//
+// __codepage: numeric Windows code page identifier (e.g. the value returned by
+//             `GetACP()` or parsed from `GetLocaleInfoEx`).
+// Returns:    a text_encoding built from the corresponding id, or from
+//             `id::unknown` when the code page is 0 or is not listed below.
+//
+// The function has internal linkage, so no separate (external-linkage)
+// prototype is declared: redeclaring a name as `static` after a non-static
+// declaration is ill-formed.
+_LIBCPP_HIDDEN static text_encoding __get_win32_acp(unsigned int __codepage) {
+  switch (__codepage) {
+  case 0:
+    // If no ANSI code page is available, only Unicode can be used for the locale.
+    // In this case, the value is CP_ACP (0).
+    // Such a locale cannot be set as the system locale.
+    // Applications that do not support Unicode do not work correctly with locales
+    // marked as "Unicode only".
+    return std::text_encoding::id::unknown;
+  // Must be spelled in decimal: a leading zero would make this an octal
+  // literal (037 == 31) and code page 37 would never match.
+  case 37:
+    return std::text_encoding::id::IBM037;
+  case 437:
+    return std::text_encoding::id::PC8CodePage437;
+  case 500:
+    return std::text_encoding::id::IBM500;
+  case 708:
+    return std::text_encoding::id::ISOLatinArabic;
+  case 709:
+    return std::text_encoding::id::ISO89ASMO449;
+  case 775:
+    return std::text_encoding::id::PC775Baltic;
+  case 850:
+    return std::text_encoding::id::PC850Multilingual;
+  case 852:
+    return std::text_encoding::id::PCp852;
+  case 855:
+    return std::text_encoding::id::IBM855;
+  case 857:
+    return std::text_encoding::id::IBM857;
+  case 858:
+    return std::text_encoding::id::IBM00858;
+  case 860:
+    return std::text_encoding::id::IBM860;
+  case 861:
+    return std::text_encoding::id::IBM861;
+  case 862:
+    return std::text_encoding::id::PC862LatinHebrew;
+  case 863:
+    return std::text_encoding::id::IBM863;
+  case 864:
+    return std::text_encoding::id::IBM864;
+  case 865:
+    return std::text_encoding::id::IBM865;
+  case 866:
+    return std::text_encoding::id::IBM866;
+  case 869:
+    return std::text_encoding::id::IBM869;
+  case 870:
+    return std::text_encoding::id::IBM870;
+  case 874:
+    return std::text_encoding::id::windows874;
+  case 932:
+    return std::text_encoding::id::ShiftJIS;
+  // NOTE(review): the IANA registry lists CP936/windows-936 as aliases of GBK;
+  // confirm whether GB2312 is the intended mapping here.
+  case 936:
+    return std::text_encoding::id::GB2312;
+  case 949:
+    return std::text_encoding::id::KSC56011987;
+  case 950:
+    return std::text_encoding::id::Big5;
+  case 1026:
+    return std::text_encoding::id::IBM1026;
+  case 1047:
+    return std::text_encoding::id::IBM1047;
+  case 1140:
+    return std::text_encoding::id::IBM01140;
+  case 1141:
+    return std::text_encoding::id::IBM01141;
+  case 1142:
+    return std::text_encoding::id::IBM01142;
+  case 1143:
+    return std::text_encoding::id::IBM01143;
+  case 1144:
+    return std::text_encoding::id::IBM01144;
+  case 1145:
+    return std::text_encoding::id::IBM01145;
+  case 1146:
+    return std::text_encoding::id::IBM01146;
+  case 1147:
+    return std::text_encoding::id::IBM01147;
+  case 1148:
+    return std::text_encoding::id::IBM01148;
+  case 1149:
+    return std::text_encoding::id::IBM01149;
+  case 1200:
+    return std::text_encoding::id::UTF16LE;
+  case 1201:
+    return std::text_encoding::id::UTF16BE;
+  case 1250:
+    return std::text_encoding::id::windows1250;
+  case 1251:
+    return std::text_encoding::id::windows1251;
+  case 1252:
+    return std::text_encoding::id::windows1252;
+  case 1253:
+    return std::text_encoding::id::windows1253;
+  case 1254:
+    return std::text_encoding::id::windows1254;
+  case 1255:
+    return std::text_encoding::id::windows1255;
+  case 1256:
+    return std::text_encoding::id::windows1256;
+  case 1257:
+    return std::text_encoding::id::windows1257;
+  case 1258:
+    return std::text_encoding::id::windows1258;
+  case 10000:
+    return std::text_encoding::id::Macintosh;
+  case 12000:
+    return std::text_encoding::id::UTF32LE;
+  case 12001:
+    return std::text_encoding::id::UTF32BE;
+  case 20127:
+    return std::text_encoding::id::ASCII;
+  case 20273:
+    return std::text_encoding::id::IBM273;
+  case 20277:
+    return std::text_encoding::id::IBM277;
+  case 20278:
+    return std::text_encoding::id::IBM278;
+  case 20280:
+    return std::text_encoding::id::IBM280;
+  case 20284:
+    return std::text_encoding::id::IBM284;
+  case 20285:
+    return std::text_encoding::id::IBM285;
+  case 20290:
+    return std::text_encoding::id::IBM290;
+  case 20297:
+    return std::text_encoding::id::IBM297;
+  case 20420:
+    return std::text_encoding::id::IBM420;
+  case 20423:
+    return std::text_encoding::id::IBM423;
+  case 20424:
+    return std::text_encoding::id::IBM424;
+  case 20838:
+    return std::text_encoding::id::IBMThai;
+  case 20866:
+    return std::text_encoding::id::KOI8R;
+  case 20871:
+    return std::text_encoding::id::IBM871;
+  case 20880:
+    return std::text_encoding::id::IBM880;
+  case 20905:
+    return std::text_encoding::id::IBM905;
+  case 20924:
+    return std::text_encoding::id::IBM00924;
+  case 20932:
+    return std::text_encoding::id::EUCPkdFmtJapanese;
+  case 21866:
+    return std::text_encoding::id::KOI8U;
+  // Code pages 2859x are ISO 8859-x. Note that the IANA "ISOLatinN" names do
+  // not track the part number: 8859-5 is Cyrillic, 8859-6 is Arabic, and
+  // 8859-9 is Latin-5.
+  case 28591:
+    return std::text_encoding::id::ISOLatin1;
+  case 28592:
+    return std::text_encoding::id::ISOLatin2;
+  case 28593:
+    return std::text_encoding::id::ISOLatin3;
+  case 28594:
+    return std::text_encoding::id::ISOLatin4;
+  case 28595:
+    return std::text_encoding::id::ISOLatinCyrillic;
+  case 28596:
+    return std::text_encoding::id::ISOLatinArabic;
+  case 28597:
+    return std::text_encoding::id::ISOLatinGreek;
+  case 28598:
+    return std::text_encoding::id::ISOLatinHebrew;
+  case 28599:
+    return std::text_encoding::id::ISOLatin5;
+  case 28603:
+    return std::text_encoding::id::ISO885913;
+  case 28605:
+    return std::text_encoding::id::ISO885915;
+  case 38598:
+    return std::text_encoding::id::ISO88598I;
+  // All three ISO-2022-JP variants share a single id.
+  case 50220:
+  case 50221:
+  case 50222:
+    return std::text_encoding::id::ISO2022JP;
+  case 51932:
+    return std::text_encoding::id::EUCPkdFmtJapanese;
+  case 51936:
+    return std::text_encoding::id::GB2312;
+  case 51949:
+    return std::text_encoding::id::EUCKR;
+  case 52936:
+    return std::text_encoding::id::HZGB2312;
+  case 54936:
+    return std::text_encoding::id::GB18030;
+  case 65000:
+    return std::text_encoding::id::UTF7;
+  case 65001:
+    return std::text_encoding::id::UTF8;
+  default:
+    return std::text_encoding::id::unknown;
+  }
+}
+
+// Determines the encoding of the Windows locale called `__name`.
+//
+// The narrow locale name is widened, the locale's default code page is
+// queried as decimal text via GetLocaleInfoEx, and the parsed number is
+// handed to __get_win32_acp. A default-constructed text_encoding is returned
+// when either the widening or the query fails.
+_LIBCPP_HIDDEN static std::text_encoding __get_locale_encoding(const char* __name) {
+  // Whether the ANSI or the OEM code page is in effect selects both the code
+  // page used for widening and the locale-info field queried below.
+  const bool __ansi_apis = ::AreFileApisANSI();
+
+  // One extra zero-initialized element keeps the widened name null-terminated,
+  // since the terminator of `__name` is not part of the converted range.
+  wchar_t __wide_name[LOCALE_NAME_MAX_LENGTH + 1]{};
+  string_view __narrow_name(__name);
+  auto __conversion_cp = __ansi_apis ? CP_ACP : CP_OEMCP;
+  int __converted      = ::MultiByteToWideChar(
+      __conversion_cp, MB_ERR_INVALID_CHARS, __name, __narrow_name.size(), __wide_name, LOCALE_NAME_MAX_LENGTH);
+  if (__converted <= 0)
+    return std::text_encoding();
+
+  // GetLocaleInfoEx writes the code page number as text; only 10 of the 11
+  // zero-initialized elements are offered so the digits stay terminated.
+  wchar_t __codepage_text[11]{};
+  auto __info_kind = __ansi_apis ? LOCALE_IDEFAULTANSICODEPAGE : LOCALE_IDEFAULTCODEPAGE;
+  int __written    = ::GetLocaleInfoEx(__wide_name, __info_kind, __codepage_text, 10);
+  if (__written <= 0)
+    return std::text_encoding();
+
+  unsigned int __codepage_number = std::wcstoul(__codepage_text, nullptr, 10);
+  return __get_win32_acp(__codepage_number);
+}
+
+// The environment encoding on Windows is derived from the code page reported
+// by GetACP().
+_LIBCPP_HIDDEN static std::text_encoding __get_env_encoding() {
+  const unsigned int __active_codepage = ::GetACP();
+  return __get_win32_acp(__active_codepage);
+}
+
+#elif defined(__ANDROID__)
+// Android has minimal libc support for locale, and doesn't support any other locale
+// than the ones checked for below.
+//
+// Returns a UTF-8 text_encoding when `__name` is "", "*", "C", "POSIX", or
+// contains "UTF-8"; otherwise returns a default-constructed text_encoding.
+_LIBCPP_HIDDEN static std::text_encoding __get_locale_encoding(const char* __name) {
+  string_view __sv(__name);
+  // "*" must be a string literal: string_view cannot be compared with a single char.
+  if (__sv == "" || __sv == "*" || __sv == "C" || __sv == "POSIX" || __sv.contains("UTF-8")) {
+    return std::text_encoding(std::text_encoding::id::UTF8);
+  }
+
+  return std::text_encoding();
+}
+
+// On Android the environment encoding is not queried: it is pretty much
+// assumed to always be UTF-8.
+_LIBCPP_HIDDEN static std::text_encoding __get_env_encoding() {
+  constexpr auto __utf8_id = std::text_encoding::id::UTF8;
+  return std::text_encoding(__utf8_id);
+}
+
+#else // POSIX
+// Builds a text_encoding from the codeset name of the locale called `__name`.
+//
+// A temporary locale object is created for the CTYPE category and its
+// CODESET string is looked up. A default-constructed text_encoding is
+// returned when the locale cannot be created, when no codeset string is
+// available, or when the codeset name is longer than
+// text_encoding::max_name_length.
+_LIBCPP_HIDDEN static std::text_encoding __get_locale_encoding(const char* __name) {
+  __locale::__locale_t __loc = __locale::__newlocale(_LIBCPP_CTYPE_MASK, __name, static_cast<__locale::__locale_t>(0));
+  if (!__loc)
+    return std::text_encoding();
+
+  // From here on the locale exists, so release it on every exit path.
+  __scope_guard __release_loc([&__loc] { __locale::__freelocale(__loc); });
+
+  const char* __codeset_name = __locale::__nl_langinfo(_LIBCPP_NL_CODESET, __loc);
+  if (!__codeset_name)
+    return std::text_encoding();
+
+  // Only names that fit within max_name_length are passed to the constructor.
+  string_view __codeset_view(__codeset_name);
+  if (__codeset_view.size() > std::text_encoding::max_name_length)
+    return std::text_encoding();
+
+  return std::text_encoding(__codeset_view);
+}
+
+// The environment encoding is the encoding of the locale named by the empty
+// string.
+_LIBCPP_HIDDEN static std::text_encoding __get_env_encoding() {
+  const char* const __env_locale_name = "";
+  return __get_locale_encoding(__env_locale_name);
+}
+#endif // _LIBCPP_WIN32API
+
+// Out-of-line definition of text_encoding::environment(): forwards to the
+// platform-specific __get_env_encoding() selected by the preprocessor above.
+_LIBCPP_AVAILABILITY_TE_ENVIRONMENT _LIBCPP_EXPORTED_FROM_ABI std::text_encoding std::text_encoding::environment() {
+  std::text_encoding __env_encoding = __get_env_encoding();
+  return __env_encoding;
+}
+
+std::text_encoding locale::encoding() const {
----------------
smallp-o-p wrote:
I wonder whether we want to expose `locale::encoding()` or the implementation detail `__get_locale_encoding`.
https://github.com/llvm/llvm-project/pull/141312
More information about the libcxx-commits
mailing list