[libc-commits] [libc] [libc] Change ctype to be encoding independent (PR #110574)
Michael Jones via libc-commits
libc-commits at lists.llvm.org
Mon Sep 30 14:25:53 PDT 2024
https://github.com/michaelrj-google created https://github.com/llvm/llvm-project/pull/110574
The previous implementation of the ctype functions assumed ASCII.
This patch replaces it with a switch/case implementation that looks odd, but
is actually easier for the compiler to understand and optimize.
From ebc78302650de87632906aee8e062c05a6bd9d0b Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj at google.com>
Date: Wed, 25 Sep 2024 14:52:14 -0700
Subject: [PATCH] [libc] Change ctype to be encoding independent
The previous implementation of the ctype functions assumed ASCII.
This patch changes to a switch/case implementation that looks odd, but
actually is easier for the compiler to understand and optimize.
---
libc/src/__support/ctype_utils.h | 298 ++++++++++++++++++++++++++++---
1 file changed, 275 insertions(+), 23 deletions(-)
diff --git a/libc/src/__support/ctype_utils.h b/libc/src/__support/ctype_utils.h
index 91f6ce8cabd8d0..2e9b23d3bc37f3 100644
--- a/libc/src/__support/ctype_utils.h
+++ b/libc/src/__support/ctype_utils.h
@@ -15,44 +15,296 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-// ------------------------------------------------------
-// Rationale: Since these classification functions are
-// called in other functions, we will avoid the overhead
-// of a function call by inlining them.
-// ------------------------------------------------------
+// -----------------------------------------------------------------------------
+// ****************** WARNING ******************
+// ****************** DO NOT TRY TO OPTIMIZE THESE FUNCTIONS! ******************
+// -----------------------------------------------------------------------------
+// This switch/case form is easier for the compiler to understand, and is
+// optimized into a form that is almost always the same as or better than
+// versions written by hand (see https://godbolt.org/z/qvrebqvvr). Also this
+// form makes these functions encoding independent. If you want to rewrite these
+// functions, make sure you have benchmarks to show your new solution is faster,
+// as well as a way to support non-ASCII character encodings.
-LIBC_INLINE static constexpr bool isalpha(unsigned ch) {
- return (ch | 32) - 'a' < 26;
+LIBC_INLINE static constexpr bool islower(int ch) { // true only when ch equals one of the literals 'a'..'z'
+ switch (ch) { // literals are listed one by one instead of using an arithmetic range check
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ case 'g':
+ case 'h':
+ case 'i':
+ case 'j':
+ case 'k':
+ case 'l':
+ case 'm':
+ case 'n':
+ case 'o':
+ case 'p':
+ case 'q':
+ case 'r':
+ case 's':
+ case 't':
+ case 'u':
+ case 'v':
+ case 'w':
+ case 'x':
+ case 'y':
+ case 'z':
+ return true;
+ default: // any value not listed above
+ return false;
+ }
}
-LIBC_INLINE static constexpr bool isdigit(unsigned ch) {
- return (ch - '0') < 10;
+LIBC_INLINE static constexpr bool isupper(int ch) { // true only when ch equals one of the literals 'A'..'Z'
+ switch (ch) { // literals are listed one by one instead of using an arithmetic range check
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ case 'G':
+ case 'H':
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ case 'Q':
+ case 'R':
+ case 'S':
+ case 'T':
+ case 'U':
+ case 'V':
+ case 'W':
+ case 'X':
+ case 'Y':
+ case 'Z':
+ return true;
+ default: // any value not listed above
+ return false;
+ }
}
-LIBC_INLINE static constexpr bool isalnum(unsigned ch) {
- return isalpha(ch) || isdigit(ch);
+LIBC_INLINE static constexpr bool isdigit(int ch) { // true only when ch equals one of the literals '0'..'9'
+ switch (ch) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ return true;
+ default: // any value not listed above
+ return false;
+ }
}
-LIBC_INLINE static constexpr bool isgraph(unsigned ch) {
- return 0x20 < ch && ch < 0x7f;
+LIBC_INLINE static constexpr int tolower(int ch) { // maps each literal 'A'..'Z' to its lowercase literal; other values pass through
+ switch (ch) { // explicit per-letter mapping rather than adding a fixed offset to ch
+ case 'A':
+ return 'a';
+ case 'B':
+ return 'b';
+ case 'C':
+ return 'c';
+ case 'D':
+ return 'd';
+ case 'E':
+ return 'e';
+ case 'F':
+ return 'f';
+ case 'G':
+ return 'g';
+ case 'H':
+ return 'h';
+ case 'I':
+ return 'i';
+ case 'J':
+ return 'j';
+ case 'K':
+ return 'k';
+ case 'L':
+ return 'l';
+ case 'M':
+ return 'm';
+ case 'N':
+ return 'n';
+ case 'O':
+ return 'o';
+ case 'P':
+ return 'p';
+ case 'Q':
+ return 'q';
+ case 'R':
+ return 'r';
+ case 'S':
+ return 's';
+ case 'T':
+ return 't';
+ case 'U':
+ return 'u';
+ case 'V':
+ return 'v';
+ case 'W':
+ return 'w';
+ case 'X':
+ return 'x';
+ case 'Y':
+ return 'y';
+ case 'Z':
+ return 'z';
+ default: // not an uppercase literal: returned unchanged
+ return ch;
+ }
+}
+
+LIBC_INLINE static constexpr bool isalpha(int ch) { // true when ch is a letter literal of either case
+ switch (tolower(ch)) { // fold 'A'..'Z' onto 'a'..'z' first so only lowercase labels are needed
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ case 'g':
+ case 'h':
+ case 'i':
+ case 'j':
+ case 'k':
+ case 'l':
+ case 'm':
+ case 'n':
+ case 'o':
+ case 'p':
+ case 'q':
+ case 'r':
+ case 's':
+ case 't':
+ case 'u':
+ case 'v':
+ case 'w':
+ case 'x':
+ case 'y':
+ case 'z':
+ return true;
+ default: // any value that does not fold to a lowercase letter literal
+ return false;
+ }
}
-LIBC_INLINE static constexpr bool islower(unsigned ch) {
- return (ch - 'a') < 26;
+LIBC_INLINE static constexpr bool isalnum(int ch) { // true when ch is a letter (isalpha) or a decimal digit (isdigit)
+ return isalpha(ch) || isdigit(ch);
}
-LIBC_INLINE static constexpr bool isupper(unsigned ch) {
- return (ch - 'A') < 26;
+LIBC_INLINE static constexpr int b36_char_to_int(int ch) { // maps '0'..'9' to 0..9 and letters of either case to 10..35
+ switch (tolower(ch)) { // fold uppercase letters so 'A' and 'a' both yield 10, and so on
+ case '0':
+ return 0;
+ case '1':
+ return 1;
+ case '2':
+ return 2;
+ case '3':
+ return 3;
+ case '4':
+ return 4;
+ case '5':
+ return 5;
+ case '6':
+ return 6;
+ case '7':
+ return 7;
+ case '8':
+ return 8;
+ case '9':
+ return 9;
+ case 'a':
+ return 10;
+ case 'b':
+ return 11;
+ case 'c':
+ return 12;
+ case 'd':
+ return 13;
+ case 'e':
+ return 14;
+ case 'f':
+ return 15;
+ case 'g':
+ return 16;
+ case 'h':
+ return 17;
+ case 'i':
+ return 18;
+ case 'j':
+ return 19;
+ case 'k':
+ return 20;
+ case 'l':
+ return 21;
+ case 'm':
+ return 22;
+ case 'n':
+ return 23;
+ case 'o':
+ return 24;
+ case 'p':
+ return 25;
+ case 'q':
+ return 26;
+ case 'r':
+ return 27;
+ case 's':
+ return 28;
+ case 't':
+ return 29;
+ case 'u':
+ return 30;
+ case 'v':
+ return 31;
+ case 'w':
+ return 32;
+ case 'x':
+ return 33;
+ case 'y':
+ return 34;
+ case 'z':
+ return 35;
+ default: // ch is neither a digit nor a letter literal
+ return 0; // NOTE(review): same result as '0', so the return value alone cannot signal an invalid digit; presumably callers validate first - confirm
+ }
}
-LIBC_INLINE static constexpr bool isspace(unsigned ch) {
- return ch == ' ' || (ch - '\t') < 5;
+LIBC_INLINE static constexpr bool isspace(int ch) { // true for ' ', '\t', '\n', '\v', '\f' and '\r' only
+ switch (ch) { // escape-sequence literals are used instead of numeric character codes
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\v':
+ case '\f':
+ case '\r':
+ return true;
+ default: // any value not listed above
+ return false;
+ }
}
-LIBC_INLINE static constexpr int tolower(int ch) {
- if (isupper(ch))
- return ch + ('a' - 'A');
- return ch;
+// Not yet encoding independent: still compares ch against the hard-coded numeric bounds 0x20 and 0x7f.
+LIBC_INLINE static constexpr bool isgraph(int ch) { // true when ch lies strictly between 0x20 and 0x7f
+ return 0x20 < ch && ch < 0x7f; // both bounds are exclusive
}
} // namespace internal
More information about the libc-commits
mailing list