[Patch] libc++: Add a default rune table
Ed Schouten
ed at 80386.nl
Thu Dec 11 02:21:50 PST 2014
2014-12-10 19:56 GMT+01:00 Dan Albert <danalbert at google.com>:
> Yep, this is what I had in mind, just kept forgetting to do it.
Great!
So I decided to spend some time to take a look at the code more
closely and polish it up a bit. Essentially, I did the following:
- I noticed there is quite a bit of duplication: we have functions that
test a single character, but also functions that scan over ranges.
The range versions can be implemented on top of the single-character
ones. That way the platform-dependent bits become smaller.
- I checked n3797.pdf and it seems that table_size has to be at least
256, meaning that it is completely safe for us to index the tables
directly, as long as the character is first converted to unsigned char.
I've patched up include/__locale to no longer call isascii(). I've also
changed classic_table() to return a table of size table_size.
- For tolower() and toupper(): these functions seem to be
locale-agnostic. They only support ASCII. Instead of requiring a
per-platform snippet of code, we can just replace it with a simple
manual conversion. That way we can also eliminate the
__classic_{upper,lower}_table() code.
URL: http://80386.nl/pub/20141211-libcxx-locale-cleanup.txt
--
Ed Schouten <ed at 80386.nl>
-------------- next part --------------
Index: include/__locale
===================================================================
--- include/__locale (revision 224007)
+++ include/__locale (working copy)
@@ -528,7 +528,7 @@
_LIBCPP_ALWAYS_INLINE
bool is(mask __m, char_type __c) const
{
- return isascii(__c) ? (__tab_[static_cast<int>(__c)] & __m) !=0 : false;
+ return (__tab_[static_cast<unsigned char>(__c)] & __m) != 0;
}
_LIBCPP_ALWAYS_INLINE
@@ -535,15 +535,15 @@
const char_type* is(const char_type* __low, const char_type* __high, mask* __vec) const
{
for (; __low != __high; ++__low, ++__vec)
- *__vec = isascii(*__low) ? __tab_[static_cast<int>(*__low)] : 0;
+ *__vec = __tab_[static_cast<unsigned char>(*__low)];
return __low;
}
_LIBCPP_ALWAYS_INLINE
- const char_type* scan_is (mask __m, const char_type* __low, const char_type* __high) const
+ const char_type* scan_is(mask __m, const char_type* __low, const char_type* __high) const
{
for (; __low != __high; ++__low)
- if (isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m))
+ if (is(__m, *__low))
break;
return __low;
}
@@ -552,7 +552,7 @@
const char_type* scan_not(mask __m, const char_type* __low, const char_type* __high) const
{
for (; __low != __high; ++__low)
- if (!(isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m)))
+ if (!is(__m, *__low))
break;
return __low;
}
@@ -614,14 +614,6 @@
#endif
_LIBCPP_ALWAYS_INLINE const mask* table() const _NOEXCEPT {return __tab_;}
static const mask* classic_table() _NOEXCEPT;
-#if defined(__GLIBC__) || defined(__EMSCRIPTEN__)
- static const int* __classic_upper_table() _NOEXCEPT;
- static const int* __classic_lower_table() _NOEXCEPT;
-#endif
-#if defined(__NetBSD__)
- static const short* __classic_upper_table() _NOEXCEPT;
- static const short* __classic_lower_table() _NOEXCEPT;
-#endif
protected:
~ctype();
Index: src/locale.cpp
===================================================================
--- src/locale.cpp (revision 224007)
+++ src/locale.cpp (working copy)
@@ -791,7 +791,7 @@
ctype<wchar_t>::do_scan_is(mask m, const char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- if (isascii(*low) && (ctype<char>::classic_table()[*low] & m))
+ if (do_is(m, *low))
break;
return low;
}
@@ -800,7 +800,7 @@
ctype<wchar_t>::do_scan_not(mask m, const char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- if (!(isascii(*low) && (ctype<char>::classic_table()[*low] & m)))
+ if (!do_is(m, *low))
break;
return low;
}
@@ -808,13 +808,7 @@
wchar_t
ctype<wchar_t>::do_toupper(char_type c) const
{
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- return isascii(c) ? _DefaultRuneLocale.__mapupper[c] : c;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
- return isascii(c) ? ctype<char>::__classic_upper_table()[c] : c;
-#else
- return (isascii(c) && iswlower_l(c, __cloc())) ? c-L'a'+L'A' : c;
-#endif
+ return (c >= L'a' && c <= L'z') ? c - L'a' + L'A' : c;
}
const wchar_t*
@@ -821,14 +815,7 @@
ctype<wchar_t>::do_toupper(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- *low = isascii(*low) ? _DefaultRuneLocale.__mapupper[*low] : *low;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
- *low = isascii(*low) ? ctype<char>::__classic_upper_table()[*low]
- : *low;
-#else
- *low = (isascii(*low) && islower_l(*low, __cloc())) ? (*low-L'a'+L'A') : *low;
-#endif
+ *low = do_toupper(*low);
return low;
}
@@ -835,13 +822,7 @@
wchar_t
ctype<wchar_t>::do_tolower(char_type c) const
{
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- return isascii(c) ? _DefaultRuneLocale.__maplower[c] : c;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
- return isascii(c) ? ctype<char>::__classic_lower_table()[c] : c;
-#else
- return (isascii(c) && isupper_l(c, __cloc())) ? c-L'A'+'a' : c;
-#endif
+ return (c >= L'A' && c <= L'Z') ? c - L'A' + L'a' : c;
}
const wchar_t*
@@ -848,14 +829,7 @@
ctype<wchar_t>::do_tolower(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- *low = isascii(*low) ? _DefaultRuneLocale.__maplower[*low] : *low;
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
- *low = isascii(*low) ? ctype<char>::__classic_lower_table()[*low]
- : *low;
-#else
- *low = (isascii(*low) && isupper_l(*low, __cloc())) ? *low-L'A'+L'a' : *low;
-#endif
+ *low = do_tolower(*low);
return low;
}
@@ -869,7 +843,7 @@
ctype<wchar_t>::do_widen(const char* low, const char* high, char_type* dest) const
{
for (; low != high; ++low, ++dest)
- *dest = *low;
+ *dest = do_widen(*low);
return low;
}
@@ -885,10 +859,7 @@
ctype<wchar_t>::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const
{
for (; low != high; ++low, ++dest)
- if (isascii(*low))
- *dest = static_cast<char>(*low);
- else
- *dest = dfault;
+ *dest = do_narrow(*low, dfault);
return low;
}
@@ -914,17 +885,7 @@
char
ctype<char>::do_toupper(char_type c) const
{
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- return isascii(c) ?
- static_cast<char>(_DefaultRuneLocale.__mapupper[static_cast<ptrdiff_t>(c)]) : c;
-#elif defined(__NetBSD__)
- return static_cast<char>(__classic_upper_table()[static_cast<unsigned char>(c)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__)
- return isascii(c) ?
- static_cast<char>(__classic_upper_table()[static_cast<unsigned char>(c)]) : c;
-#else
- return (isascii(c) && islower_l(c, __cloc())) ? c-'a'+'A' : c;
-#endif
+ return (c >= 'a' && c <= 'z') ? c - 'a' + 'A' : c;
}
const char*
@@ -931,17 +892,7 @@
ctype<char>::do_toupper(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- *low = isascii(*low) ?
- static_cast<char>(_DefaultRuneLocale.__mapupper[static_cast<ptrdiff_t>(*low)]) : *low;
-#elif defined(__NetBSD__)
- *low = static_cast<char>(__classic_upper_table()[static_cast<unsigned char>(*low)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__)
- *low = isascii(*low) ?
- static_cast<char>(__classic_upper_table()[static_cast<size_t>(*low)]) : *low;
-#else
- *low = (isascii(*low) && islower_l(*low, __cloc())) ? *low-'a'+'A' : *low;
-#endif
+ *low = do_toupper(*low);
return low;
}
@@ -948,17 +899,7 @@
char
ctype<char>::do_tolower(char_type c) const
{
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- return isascii(c) ?
- static_cast<char>(_DefaultRuneLocale.__maplower[static_cast<ptrdiff_t>(c)]) : c;
-#elif defined(__NetBSD__)
- return static_cast<char>(__classic_lower_table()[static_cast<unsigned char>(c)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
- return isascii(c) ?
- static_cast<char>(__classic_lower_table()[static_cast<size_t>(c)]) : c;
-#else
- return (isascii(c) && isupper_l(c, __cloc())) ? c-'A'+'a' : c;
-#endif
+ return (c >= 'A' && c <= 'Z') ? c - 'A' + 'a' : c;
}
const char*
@@ -965,15 +906,7 @@
ctype<char>::do_tolower(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
-#ifdef _LIBCPP_HAS_DEFAULTRUNELOCALE
- *low = isascii(*low) ? static_cast<char>(_DefaultRuneLocale.__maplower[static_cast<ptrdiff_t>(*low)]) : *low;
-#elif defined(__NetBSD__)
- *low = static_cast<char>(__classic_lower_table()[static_cast<unsigned char>(*low)]);
-#elif defined(__GLIBC__) || defined(__EMSCRIPTEN__)
- *low = isascii(*low) ? static_cast<char>(__classic_lower_table()[static_cast<size_t>(*low)]) : *low;
-#else
- *low = (isascii(*low) && isupper_l(*low, __cloc())) ? *low-'A'+'a' : *low;
-#endif
+ *low = do_tolower(*low);
return low;
}
@@ -987,7 +920,7 @@
ctype<char>::do_widen(const char* low, const char* high, char_type* dest) const
{
for (; low != high; ++low, ++dest)
- *dest = *low;
+ *dest = do_widen(*low);
return low;
}
@@ -1003,10 +936,7 @@
ctype<char>::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const
{
for (; low != high; ++low, ++dest)
- if (isascii(*low))
- *dest = *low;
- else
- *dest = dfault;
+ *dest = do_narrow(*low, dfault);
return low;
}
@@ -1019,73 +949,76 @@
const ctype<char>::mask*
ctype<char>::classic_table() _NOEXCEPT
{
-#if defined(__APPLE__) || defined(__FreeBSD__)
- return _DefaultRuneLocale.__runetype;
-#elif defined(__NetBSD__)
- return _C_ctype_tab_ + 1;
-#elif defined(__GLIBC__)
- return __cloc()->__ctype_b;
-#elif __sun__
- return __ctype_mask;
-#elif defined(_LIBCPP_MSVCRT) || defined(__MINGW32__)
- return _ctype+1; // internal ctype mask table defined in msvcrt.dll
-// This is assumed to be safe, which is a nonsense assumption because we're
-// going to end up dereferencing it later...
-#elif defined(__EMSCRIPTEN__)
- return *__ctype_b_loc();
-#elif defined(_AIX)
- return (const unsigned int *)__lc_ctype_ptr->obj->mask;
-#elif defined(__ANDROID__)
- return reinterpret_cast<const unsigned char*>(_ctype_) + 1;
-#else
- // Platform not supported: abort so the person doing the port knows what to
- // fix
-# warning ctype<char>::classic_table() is not implemented
- printf("ctype<char>::classic_table() is not implemented\n");
- abort();
- return NULL;
-#endif
+ // Character type information for US-ASCII.
+ static const ctype<char>::mask builtin_table[table_size] = {
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl | space | blank,
+ cntrl | space, cntrl | space,
+ cntrl | space, cntrl | space,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ cntrl, cntrl,
+ space | blank | print, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ digit | print | xdigit, digit | print | xdigit,
+ digit | print | xdigit, digit | print | xdigit,
+ digit | print | xdigit, digit | print | xdigit,
+ digit | print | xdigit, digit | print | xdigit,
+ digit | print | xdigit, digit | print | xdigit,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, upper | xdigit | print | alpha,
+ upper | xdigit | print | alpha, upper | xdigit | print | alpha,
+ upper | xdigit | print | alpha, upper | xdigit | print | alpha,
+ upper | xdigit | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, upper | print | alpha,
+ upper | print | alpha, punct | print,
+ punct | print, punct | print,
+ punct | print, punct | print,
+ punct | print, lower | xdigit | print | alpha,
+ lower | xdigit | print | alpha, lower | xdigit | print | alpha,
+ lower | xdigit | print | alpha, lower | xdigit | print | alpha,
+ lower | xdigit | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, lower | print | alpha,
+ lower | print | alpha, punct | print,
+ punct | print, punct | print,
+ punct | print, cntrl,
+ };
+ return builtin_table;
}
-#if defined(__GLIBC__)
-const int*
-ctype<char>::__classic_lower_table() _NOEXCEPT
-{
- return __cloc()->__ctype_tolower;
-}
-
-const int*
-ctype<char>::__classic_upper_table() _NOEXCEPT
-{
- return __cloc()->__ctype_toupper;
-}
-#elif __NetBSD__
-const short*
-ctype<char>::__classic_lower_table() _NOEXCEPT
-{
- return _C_tolower_tab_ + 1;
-}
-
-const short*
-ctype<char>::__classic_upper_table() _NOEXCEPT
-{
- return _C_toupper_tab_ + 1;
-}
-
-#elif defined(__EMSCRIPTEN__)
-const int*
-ctype<char>::__classic_lower_table() _NOEXCEPT
-{
- return *__ctype_tolower_loc();
-}
-
-const int*
-ctype<char>::__classic_upper_table() _NOEXCEPT
-{
- return *__ctype_toupper_loc();
-}
-#endif // __GLIBC__ || __EMSCRIPTEN__ || __NETBSD__
-
// template <> class ctype_byname<char>
ctype_byname<char>::ctype_byname(const char* name, size_t refs)
@@ -1125,7 +1058,7 @@
ctype_byname<char>::do_toupper(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- *low = static_cast<char>(toupper_l(static_cast<unsigned char>(*low), __l));
+ *low = do_toupper(*low);
return low;
}
@@ -1139,7 +1072,7 @@
ctype_byname<char>::do_tolower(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- *low = static_cast<char>(tolower_l(static_cast<unsigned char>(*low), __l));
+ *low = do_tolower(*low);
return low;
}
@@ -1232,24 +1165,8 @@
ctype_byname<wchar_t>::do_scan_is(mask m, const char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- {
-#ifdef _LIBCPP_WCTYPE_IS_MASK
- if (iswctype_l(*low, m, __l))
+ if (do_is(m, *low))
break;
-#else
- wint_t ch = static_cast<wint_t>(*low);
- if (m & space && iswspace_l(ch, __l)) break;
- if (m & print && iswprint_l(ch, __l)) break;
- if (m & cntrl && iswcntrl_l(ch, __l)) break;
- if (m & upper && iswupper_l(ch, __l)) break;
- if (m & lower && iswlower_l(ch, __l)) break;
- if (m & alpha && iswalpha_l(ch, __l)) break;
- if (m & digit && iswdigit_l(ch, __l)) break;
- if (m & punct && iswpunct_l(ch, __l)) break;
- if (m & xdigit && iswxdigit_l(ch, __l)) break;
- if (m & blank && iswblank_l(ch, __l)) break;
-#endif
- }
return low;
}
@@ -1257,25 +1174,8 @@
ctype_byname<wchar_t>::do_scan_not(mask m, const char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- {
-#ifdef _LIBCPP_WCTYPE_IS_MASK
- if (!iswctype_l(*low, m, __l))
+ if (!do_is(m, *low))
break;
-#else
- wint_t ch = static_cast<wint_t>(*low);
- if (m & space && iswspace_l(ch, __l)) continue;
- if (m & print && iswprint_l(ch, __l)) continue;
- if (m & cntrl && iswcntrl_l(ch, __l)) continue;
- if (m & upper && iswupper_l(ch, __l)) continue;
- if (m & lower && iswlower_l(ch, __l)) continue;
- if (m & alpha && iswalpha_l(ch, __l)) continue;
- if (m & digit && iswdigit_l(ch, __l)) continue;
- if (m & punct && iswpunct_l(ch, __l)) continue;
- if (m & xdigit && iswxdigit_l(ch, __l)) continue;
- if (m & blank && iswblank_l(ch, __l)) continue;
- break;
-#endif
- }
return low;
}
@@ -1289,7 +1189,7 @@
ctype_byname<wchar_t>::do_toupper(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- *low = towupper_l(*low, __l);
+ *low = do_toupper(*low);
return low;
}
@@ -1303,7 +1203,7 @@
ctype_byname<wchar_t>::do_tolower(char_type* low, const char_type* high) const
{
for (; low != high; ++low)
- *low = towlower_l(*low, __l);
+ *low = do_tolower(*low);
return low;
}
@@ -1321,11 +1221,7 @@
ctype_byname<wchar_t>::do_widen(const char* low, const char* high, char_type* dest) const
{
for (; low != high; ++low, ++dest)
-#ifdef _LIBCPP_LOCALE__L_EXTENSIONS
- *dest = btowc_l(*low, __l);
-#else
- *dest = __btowc_l(*low, __l);
-#endif
+ *dest = do_widen(*low);
return low;
}
@@ -1344,14 +1240,7 @@
ctype_byname<wchar_t>::do_narrow(const char_type* low, const char_type* high, char dfault, char* dest) const
{
for (; low != high; ++low, ++dest)
- {
-#ifdef _LIBCPP_LOCALE__L_EXTENSIONS
- int r = wctob_l(*low, __l);
-#else
- int r = __wctob_l(*low, __l);
-#endif
- *dest = r != static_cast<int>(WEOF) ? static_cast<char>(r) : dfault;
- }
+ *dest = do_narrow(*low, dfault);
return low;
}
More information about the cfe-commits
mailing list