[Patch] libc++: Add a default rune table

Ed Schouten ed at 80386.nl
Thu Dec 11 02:21:50 PST 2014


2014-12-10 19:56 GMT+01:00 Dan Albert <danalbert at google.com>:
> Yep, this is what I had in mind, just kept forgetting to do it.

Great!

So I decided to spend some time taking a closer look at the code and
polishing it up a bit. Essentially, I did the following:

- I noticed there is quite a bit of duplication: we have functions that
test a single character, but also functions that scan over ranges.
The range versions can be implemented on top of the single-character
ones. That way the platform-dependent bits become smaller.

- I checked n3797.pdf and it seems that table_size has to be at least
256, meaning that it is completely safe for us to index the tables
directly, as long as the character is first converted to unsigned
char. I've patched up include/__locale to no longer call isascii().
I've also changed classic_table() to return a table of size
table_size.

- For tolower() and toupper(): these functions seem to be
locale-agnostic. They only support ASCII. Instead of requiring a
per-platform snippet of code, we can just replace it with a simple
manual conversion. That way we can also eliminate the
__classic_{upper,lower}_table() code.

URL: http://80386.nl/pub/20141211-libcxx-locale-cleanup.txt

-- 
Ed Schouten <ed at 80386.nl>
-------------- next part --------------
Index: include/__locale
===================================================================
--- include/__locale	(revision 224007)
+++ include/__locale	(working copy)
@@ -528,7 +528,7 @@
     _LIBCPP_ALWAYS_INLINE
     bool is(mask __m, char_type __c) const
     {
-        return isascii(__c) ? (__tab_[static_cast<int>(__c)] & __m) !=0 : false;
+        return (__tab_[static_cast<unsigned char>(__c)] & __m) != 0;
     }
 
     _LIBCPP_ALWAYS_INLINE
@@ -535,15 +535,15 @@
     const char_type* is(const char_type* __low, const char_type* __high, mask* __vec) const
     {
         for (; __low != __high; ++__low, ++__vec)
-            *__vec = isascii(*__low) ? __tab_[static_cast<int>(*__low)] : 0;
+            *__vec = __tab_[static_cast<unsigned char>(*__low)];
         return __low;
     }
 
     _LIBCPP_ALWAYS_INLINE
-    const char_type* scan_is (mask __m, const char_type* __low, const char_type* __high) const
+    const char_type* scan_is(mask __m, const char_type* __low, const char_type* __high) const
     {
         for (; __low != __high; ++__low)
-            if (isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m))
+            if (is(__m, *__low))
                 break;
         return __low;
     }
@@ -552,7 +552,7 @@
     const char_type* scan_not(mask __m, const char_type* __low, const char_type* __high) const
     {
         for (; __low != __high; ++__low)
-            if (!(isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m)))
+            if (!is(__m, *__low))
                 break;
         return __low;
     }
@@ -614,14 +614,6 @@
 #endif
     _LIBCPP_ALWAYS_INLINE const mask* table() const  _NOEXCEPT {return __tab_;}
     static const mask* classic_table()  _NOEXCEPT;
-#if defined(__GLIBC__) || defined(__EMSCRIPTEN__)
-    static const int* __classic_upper_table() _NOEXCEPT;
-    static const int* __classic_lower_table() _NOEXCEPT;
-#endif
-#if defined(__NetBSD__)
-    static const short* __classic_upper_table() _NOEXCEPT;
-    static const short* __classic_lower_table() _NOEXCEPT;
-#endif
 
 protected:
     ~ctype();
Index: src/locale.cpp
===================================================================
--- src/locale.cpp	(revision 224007)
+++ src/locale.cpp	(working copy)
@@ -791,7 +791,7 @@
 ctype<wchar_t>::do_scan_is(mask m, const char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-        if (isascii(*low) && (ctype<char>::classic_table()[*low] & m))
+        if (do_is(m, *low))
             break;
     return low;
 }
@@ -800,7 +800,7 @@
 ctype<wchar_t>::do_scan_not(mask m, const char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-        if (!(isascii(*low) && (ctype<char>::classic_table()[*low] & m)))
+        if (!do_is(m, *low))
             break;
     return low;
 }
@@ -808,13 +808,7 @@
 wchar_t
 ctype<wchar_t>::do_toupper(char_type c) const
 {
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-    return isascii(c) ? _DefaultRuneLocale.__mapupper[c] : c;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
-    return isascii(c) ? ctype<char>::__classic_upper_table()[c] : c;
-#else
-    return (isascii(c) && iswlower_l(c, __cloc())) ? c-L'a'+L'A' : c;
-#endif
+    return (c >= L'a' && c <= L'z') ? c - L'a' + L'A' : c;
 }
 
 const wchar_t*
@@ -821,14 +815,7 @@
 ctype<wchar_t>::do_toupper(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-        *low = isascii(*low) ? _DefaultRuneLocale.__mapupper[*low] : *low;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
-        *low = isascii(*low) ? ctype<char>::__classic_upper_table()[*low]
-                             : *low;
-#else
-        *low = (isascii(*low) && islower_l(*low, __cloc())) ? (*low-L'a'+L'A') : *low;
-#endif
+        *low = do_toupper(*low);
     return low;
 }
 
@@ -835,13 +822,7 @@
 wchar_t
 ctype<wchar_t>::do_tolower(char_type c) const
 {
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-    return isascii(c) ? _DefaultRuneLocale.__maplower[c] : c;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
-    return isascii(c) ? ctype<char>::__classic_lower_table()[c] : c;
-#else
-    return (isascii(c) && isupper_l(c, __cloc())) ? c-L'A'+'a' : c;
-#endif
+    return (c >= L'A' && c <= L'Z') ? c - L'A' + L'a' : c;
 }
 
 const wchar_t*
@@ -848,14 +829,7 @@
 ctype<wchar_t>::do_tolower(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-        *low = isascii(*low) ? _DefaultRuneLocale.__maplower[*low] : *low;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
-        *low = isascii(*low) ? ctype<char>::__classic_lower_table()[*low]
-                             : *low;
-#else
-        *low = (isascii(*low) && isupper_l(*low, __cloc())) ? *low-L'A'+L'a' : *low;
-#endif
+        *low = do_tolower(*low);
     return low;
 }
 
@@ -869,7 +843,7 @@
 ctype<wchar_t>::do_widen(const char* low, const char* high, char_type* dest) const
 {
     for (; low != high; ++low, ++dest)
-        *dest = *low;
+        *dest = do_widen(*low);
     return low;
 }
 
@@ -885,10 +859,7 @@
 ctype<wchar_t>::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const
 {
     for (; low != high; ++low, ++dest)
-        if (isascii(*low))
-            *dest = static_cast<char>(*low);
-        else
-            *dest = dfault;
+        *dest = do_narrow(*low, dfault);
     return low;
 }
 
@@ -914,17 +885,7 @@
 char
 ctype<char>::do_toupper(char_type c) const
 {
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-    return isascii(c) ?
-      static_cast<char>(_DefaultRuneLocale.__mapupper[static_cast<ptrdiff_t>(c)]) : c;
-#elif defined(__NetBSD__)
-    return static_cast<char>(__classic_upper_table()[static_cast<unsigned char>(c)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__)
-    return isascii(c) ? 
-      static_cast<char>(__classic_upper_table()[static_cast<unsigned char>(c)]) : c;
-#else
-    return (isascii(c) && islower_l(c, __cloc())) ? c-'a'+'A' : c;
-#endif
+    return (c >= 'a' && c <= 'z') ? c - 'a' + 'A' : c;
 }
 
 const char*
@@ -931,17 +892,7 @@
 ctype<char>::do_toupper(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-        *low = isascii(*low) ?
-          static_cast<char>(_DefaultRuneLocale.__mapupper[static_cast<ptrdiff_t>(*low)]) : *low;
-#elif defined(__NetBSD__)
-        *low = static_cast<char>(__classic_upper_table()[static_cast<unsigned char>(*low)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__)
-        *low = isascii(*low) ?
-          static_cast<char>(__classic_upper_table()[static_cast<size_t>(*low)]) : *low;
-#else
-        *low = (isascii(*low) && islower_l(*low, __cloc())) ? *low-'a'+'A' : *low;
-#endif
+        *low = do_toupper(*low);
     return low;
 }
 
@@ -948,17 +899,7 @@
 char
 ctype<char>::do_tolower(char_type c) const
 {
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-    return isascii(c) ?
-      static_cast<char>(_DefaultRuneLocale.__maplower[static_cast<ptrdiff_t>(c)]) : c;
-#elif defined(__NetBSD__)
-    return static_cast<char>(__classic_lower_table()[static_cast<unsigned char>(c)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
-    return isascii(c) ?
-      static_cast<char>(__classic_lower_table()[static_cast<size_t>(c)]) : c;
-#else
-    return (isascii(c) && isupper_l(c, __cloc())) ? c-'A'+'a' : c;
-#endif
+    return (c >= 'A' && c <= 'Z') ? c - 'A' + 'a' : c;
 }
 
 const char*
@@ -965,15 +906,7 @@
 ctype<char>::do_tolower(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
-        *low = isascii(*low) ? static_cast<char>(_DefaultRuneLocale.__maplower[static_cast<ptrdiff_t>(*low)]) : *low;
-#elif defined(__NetBSD__)
-        *low = static_cast<char>(__classic_lower_table()[static_cast<unsigned char>(*low)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__)
-        *low = isascii(*low) ? static_cast<char>(__classic_lower_table()[static_cast<size_t>(*low)]) : *low;
-#else
-        *low = (isascii(*low) && isupper_l(*low, __cloc())) ? *low-'A'+'a' : *low;
-#endif
+        *low = do_tolower(*low);
     return low;
 }
 
@@ -987,7 +920,7 @@
 ctype<char>::do_widen(const char* low, const char* high, char_type* dest) const
 {
     for (; low != high; ++low, ++dest)
-        *dest = *low;
+        *dest = do_widen(*low);
     return low;
 }
 
@@ -1003,10 +936,7 @@
 ctype<char>::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const
 {
     for (; low != high; ++low, ++dest)
-        if (isascii(*low))
-            *dest = *low;
-        else
-            *dest = dfault;
+        *dest = do_narrow(*low, dfault);
     return low;
 }
 
@@ -1019,73 +949,76 @@
 const ctype<char>::mask*
 ctype<char>::classic_table()  _NOEXCEPT
 {
-#if defined(__APPLE__) || defined(__FreeBSD__)
-    return _DefaultRuneLocale.__runetype;
-#elif defined(__NetBSD__)
-    return _C_ctype_tab_ + 1;
-#elif defined(__GLIBC__)
-    return __cloc()->__ctype_b;
-#elif __sun__
-    return __ctype_mask;
-#elif defined(_LIBCPP_MSVCRT) || defined(__MINGW32__)
-    return _ctype+1; // internal ctype mask table defined in msvcrt.dll
-// This is assumed to be safe, which is a nonsense assumption because we're
-// going to end up dereferencing it later...
-#elif defined(__EMSCRIPTEN__)
-    return *__ctype_b_loc();
-#elif defined(_AIX)
-    return (const unsigned int *)__lc_ctype_ptr->obj->mask;
-#elif defined(__ANDROID__)
-    return reinterpret_cast<const unsigned char*>(_ctype_) + 1;
-#else
-    // Platform not supported: abort so the person doing the port knows what to
-    // fix
-# warning  ctype<char>::classic_table() is not implemented
-    printf("ctype<char>::classic_table() is not implemented\n");
-    abort();
-    return NULL;
-#endif
+    // Character type information for US-ASCII.
+    static const ctype<char>::mask builtin_table[table_size] = {
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl | space | blank,
+        cntrl | space,                  cntrl | space,
+        cntrl | space,                  cntrl | space,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        cntrl,                          cntrl,
+        space | blank | print,          punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        digit | print | xdigit,         digit | print | xdigit,
+        digit | print | xdigit,         digit | print | xdigit,
+        digit | print | xdigit,         digit | print | xdigit,
+        digit | print | xdigit,         digit | print | xdigit,
+        digit | print | xdigit,         digit | print | xdigit,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  upper | xdigit | print | alpha,
+        upper | xdigit | print | alpha, upper | xdigit | print | alpha,
+        upper | xdigit | print | alpha, upper | xdigit | print | alpha,
+        upper | xdigit | print | alpha, upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          upper | print | alpha,
+        upper | print | alpha,          punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  lower | xdigit | print | alpha,
+        lower | xdigit | print | alpha, lower | xdigit | print | alpha,
+        lower | xdigit | print | alpha, lower | xdigit | print | alpha,
+        lower | xdigit | print | alpha, lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          lower | print | alpha,
+        lower | print | alpha,          punct | print,
+        punct | print,                  punct | print,
+        punct | print,                  cntrl,
+    };
+    return builtin_table;
 }
 
-#if defined(__GLIBC__)
-const int*
-ctype<char>::__classic_lower_table() _NOEXCEPT
-{
-    return __cloc()->__ctype_tolower;
-}
-
-const int*
-ctype<char>::__classic_upper_table() _NOEXCEPT
-{
-    return __cloc()->__ctype_toupper;
-}
-#elif __NetBSD__
-const short*
-ctype<char>::__classic_lower_table() _NOEXCEPT
-{
-    return _C_tolower_tab_ + 1;
-}
-
-const short*
-ctype<char>::__classic_upper_table() _NOEXCEPT
-{
-    return _C_toupper_tab_ + 1;
-}
-
-#elif defined(__EMSCRIPTEN__)
-const int*
-ctype<char>::__classic_lower_table() _NOEXCEPT
-{
-    return *__ctype_tolower_loc();
-}
-
-const int*
-ctype<char>::__classic_upper_table() _NOEXCEPT
-{
-    return *__ctype_toupper_loc();
-}
-#endif // __GLIBC__ || __EMSCRIPTEN__ || __NETBSD__
-
 // template <> class ctype_byname<char>
 
 ctype_byname<char>::ctype_byname(const char* name, size_t refs)
@@ -1125,7 +1058,7 @@
 ctype_byname<char>::do_toupper(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-        *low = static_cast<char>(toupper_l(static_cast<unsigned char>(*low), __l));
+        *low = do_toupper(*low);
     return low;
 }
 
@@ -1139,7 +1072,7 @@
 ctype_byname<char>::do_tolower(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-        *low = static_cast<char>(tolower_l(static_cast<unsigned char>(*low), __l));
+        *low = do_tolower(*low);
     return low;
 }
 
@@ -1232,24 +1165,8 @@
 ctype_byname<wchar_t>::do_scan_is(mask m, const char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-    {
-#ifdef _LIBCPP_WCTYPE_IS_MASK
-        if (iswctype_l(*low, m, __l))
+        if (do_is(m, *low))
             break;
-#else
-        wint_t ch = static_cast<wint_t>(*low);
-        if (m & space && iswspace_l(ch, __l)) break;
-        if (m & print && iswprint_l(ch, __l)) break;
-        if (m & cntrl && iswcntrl_l(ch, __l)) break;
-        if (m & upper && iswupper_l(ch, __l)) break;
-        if (m & lower && iswlower_l(ch, __l)) break;
-        if (m & alpha && iswalpha_l(ch, __l)) break;
-        if (m & digit && iswdigit_l(ch, __l)) break;
-        if (m & punct && iswpunct_l(ch, __l)) break;
-        if (m & xdigit && iswxdigit_l(ch, __l)) break;
-        if (m & blank && iswblank_l(ch, __l)) break;
-#endif
-    }
     return low;
 }
 
@@ -1257,25 +1174,8 @@
 ctype_byname<wchar_t>::do_scan_not(mask m, const char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-    {
-#ifdef _LIBCPP_WCTYPE_IS_MASK
-        if (!iswctype_l(*low, m, __l))
+        if (!do_is(m, *low))
             break;
-#else
-        wint_t ch = static_cast<wint_t>(*low);
-        if (m & space && iswspace_l(ch, __l)) continue;
-        if (m & print && iswprint_l(ch, __l)) continue;
-        if (m & cntrl && iswcntrl_l(ch, __l)) continue;
-        if (m & upper && iswupper_l(ch, __l)) continue;
-        if (m & lower && iswlower_l(ch, __l)) continue;
-        if (m & alpha && iswalpha_l(ch, __l)) continue;
-        if (m & digit && iswdigit_l(ch, __l)) continue;
-        if (m & punct && iswpunct_l(ch, __l)) continue;
-        if (m & xdigit && iswxdigit_l(ch, __l)) continue;
-        if (m & blank && iswblank_l(ch, __l)) continue;
-        break;
-#endif
-    }
     return low;
 }
 
@@ -1289,7 +1189,7 @@
 ctype_byname<wchar_t>::do_toupper(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-        *low = towupper_l(*low, __l);
+        *low = do_toupper(*low);
     return low;
 }
 
@@ -1303,7 +1203,7 @@
 ctype_byname<wchar_t>::do_tolower(char_type* low, const char_type* high) const
 {
     for (; low != high; ++low)
-        *low = towlower_l(*low, __l);
+        *low = do_tolower(*low);
     return low;
 }
 
@@ -1321,11 +1221,7 @@
 ctype_byname<wchar_t>::do_widen(const char* low, const char* high, char_type* dest) const
 {
     for (; low != high; ++low, ++dest)
-#ifdef _LIBCPP_LOCALE__L_EXTENSIONS
-        *dest = btowc_l(*low, __l);
-#else
-        *dest = __btowc_l(*low, __l);
-#endif
+        *dest = do_widen(*low);
     return low;
 }
 
@@ -1344,14 +1240,7 @@
 ctype_byname<wchar_t>::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const
 {
     for (; low != high; ++low, ++dest)
-    {
-#ifdef _LIBCPP_LOCALE__L_EXTENSIONS
-        int r = wctob_l(*low, __l);
-#else
-        int r = __wctob_l(*low, __l);
-#endif
-        *dest = r != static_cast<int>(WEOF) ? static_cast<char>(r) : dfault;
-    }
+        *dest = do_narrow(*low, dfault);
     return low;
 }
 


More information about the cfe-commits mailing list