r174765 - Pull Lexer's CharInfo table out for general use throughout Clang.
Jordan Rose
jordan_rose at apple.com
Fri Feb 8 14:30:22 PST 2013
Author: jrose
Date: Fri Feb 8 16:30:22 2013
New Revision: 174765
URL: http://llvm.org/viewvc/llvm-project?rev=174765&view=rev
Log:
Pull Lexer's CharInfo table out for general use throughout Clang.
Rewriting the same predicates over and over again is bad for code size and
code maintenance. Using the functions in <ctype.h> is generally unsafe
unless they are specified to be locale-independent (i.e. only isdigit and
isxdigit).
The next commit will try to clean up uses of <ctype.h> functions within Clang.
Added:
cfe/trunk/include/clang/Basic/CharInfo.h
cfe/trunk/lib/Basic/CharInfo.cpp
cfe/trunk/unittests/Basic/CharInfoTest.cpp
Modified:
cfe/trunk/lib/Basic/CMakeLists.txt
cfe/trunk/lib/Lex/Lexer.cpp
cfe/trunk/unittests/Basic/CMakeLists.txt
Added: cfe/trunk/include/clang/Basic/CharInfo.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/CharInfo.h?rev=174765&view=auto
==============================================================================
--- cfe/trunk/include/clang/Basic/CharInfo.h (added)
+++ cfe/trunk/include/clang/Basic/CharInfo.h Fri Feb 8 16:30:22 2013
@@ -0,0 +1,162 @@
+//===--- clang/Basic/CharInfo.h - Classifying ASCII Characters ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_BASIC_CHARINFO_H
+#define CLANG_BASIC_CHARINFO_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace clang {
+namespace charinfo {
+ /// Classification flags for every possible unsigned char value; indexed by
+ /// the character code. Each entry is a bitwise OR of the CHAR_* flags below.
+ extern const uint16_t InfoTable[256];
+
+ /// Primary character classes. Each flag occupies its own bit so that the
+ /// predicates can test several classes with a single mask.
+ enum {
+ CHAR_HORZ_WS = 0x0001, // '\t', '\f', '\v'. Note, no '\0' (and no ' ')
+ CHAR_VERT_WS = 0x0002, // '\r', '\n'
+ CHAR_SPACE = 0x0004, // ' '
+ CHAR_DIGIT = 0x0008, // 0-9
+ CHAR_XLETTER = 0x0010, // a-f,A-F
+ CHAR_UPPER = 0x0020, // A-Z
+ CHAR_LOWER = 0x0040, // a-z
+ CHAR_UNDER = 0x0080, // _
+ CHAR_PERIOD = 0x0100, // .
+ CHAR_RAWDEL = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"'
+ CHAR_PUNCT = 0x0400 // `$@() and backslash
+ };
+
+ /// Composite values stored in the table for letters that are also hex digits.
+ enum {
+ CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER, // A-F
+ CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER // a-f
+ };
+} // end namespace charinfo
+
+/// Returns true if \p c lies in the 7-bit ASCII range (codes 0 through 127).
+LLVM_READNONE static inline bool isASCII(char c) {
+  const unsigned char Code = static_cast<unsigned char>(c);
+  return Code < 128;
+}
+
+/// Returns true if \p c may start a C identifier, i.e. matches [a-zA-Z_].
+/// When \p AllowDollar is set, '$' is accepted as well.
+LLVM_READONLY static inline bool isIdentifierHead(unsigned char c,
+                                                  bool AllowDollar = false) {
+  using namespace charinfo;
+  const unsigned HeadMask = CHAR_UPPER | CHAR_LOWER | CHAR_UNDER;
+  return (InfoTable[c] & HeadMask) != 0 || (AllowDollar && c == '$');
+}
+
+/// Returns true if \p c may continue a C identifier, i.e. matches
+/// [a-zA-Z0-9_]. When \p AllowDollar is set, '$' is accepted as well.
+LLVM_READONLY static inline bool isIdentifierBody(unsigned char c,
+                                                  bool AllowDollar = false) {
+  using namespace charinfo;
+  const unsigned BodyMask = CHAR_UPPER | CHAR_LOWER | CHAR_DIGIT | CHAR_UNDER;
+  return (InfoTable[c] & BodyMask) != 0 || (AllowDollar && c == '$');
+}
+
+/// Returns true for horizontal ASCII whitespace, namely
+/// ' ', '\\t', '\\f' and '\\v'.
+///
+/// '\\0' is not treated as whitespace, so this returns false for it.
+LLVM_READONLY static inline bool isHorizontalWhitespace(unsigned char c) {
+  using namespace charinfo;
+  const unsigned Flags = InfoTable[c];
+  return (Flags & CHAR_HORZ_WS) != 0 || (Flags & CHAR_SPACE) != 0;
+}
+
+/// Returns true for vertical ASCII whitespace, namely '\\n' and '\\r'.
+///
+/// '\\0' is not treated as whitespace, so this returns false for it.
+LLVM_READONLY static inline bool isVerticalWhitespace(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_VERT_WS) ? true : false;
+}
+
+/// Return true for any horizontal or vertical ASCII whitespace character:
+/// ' ', '\\t', '\\f', '\\v', '\\n', '\\r'.
+///
+/// '\\0' is not treated as whitespace, so this returns false for it.
+LLVM_READONLY static inline bool isWhitespace(unsigned char c) {
+  using namespace charinfo;
+  const unsigned AnyWhitespace = CHAR_HORZ_WS | CHAR_VERT_WS | CHAR_SPACE;
+  return (InfoTable[c] & AnyWhitespace) != 0;
+}
+
+/// Return true if \p c is one of the ASCII decimal digits [0-9].
+LLVM_READONLY static inline bool isDigit(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_DIGIT) != 0;
+}
+
+/// Return true if \p c is a lowercase ASCII letter [a-z].
+LLVM_READONLY static inline bool isLowercase(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_LOWER) != 0;
+}
+
+/// Return true if \p c is an uppercase ASCII letter [A-Z].
+LLVM_READONLY static inline bool isUppercase(unsigned char c) {
+  const unsigned Flags = charinfo::InfoTable[c];
+  return (Flags & charinfo::CHAR_UPPER) != 0;
+}
+
+/// Return true if \p c is an ASCII letter of either case: [a-zA-Z]
+LLVM_READONLY static inline bool isLetter(unsigned char c) {
+  using namespace charinfo;
+  const unsigned LetterMask = CHAR_UPPER | CHAR_LOWER;
+  return (InfoTable[c] & LetterMask) != 0;
+}
+
+/// Return true if \p c is an ASCII letter or decimal digit: [a-zA-Z0-9]
+LLVM_READONLY static inline bool isAlphanumeric(unsigned char c) {
+  using namespace charinfo;
+  const unsigned AlnumMask = CHAR_DIGIT | CHAR_UPPER | CHAR_LOWER;
+  return (InfoTable[c] & AlnumMask) != 0;
+}
+
+/// Return true if this character is an ASCII hex digit: [0-9a-fA-F]
+LLVM_READONLY static inline bool isHexDigit(unsigned char c) {
+  using namespace charinfo;
+  // 0-9 carry CHAR_DIGIT and a-f/A-F carry CHAR_XLETTER in InfoTable.
+  return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0;
+}
+
+/// Return true if \p c is an ASCII punctuation character.
+///
+/// Be aware that '_' counts as punctuation here even though it is also an
+/// identifier character!
+LLVM_READONLY static inline bool isPunctuation(unsigned char c) {
+  using namespace charinfo;
+  const unsigned PunctMask =
+      CHAR_UNDER | CHAR_PERIOD | CHAR_RAWDEL | CHAR_PUNCT;
+  return (InfoTable[c] & PunctMask) != 0;
+}
+
+/// Return true if \p c is printable ASCII, meaning a character that should
+/// occupy exactly one column when shown in a fixed-width terminal.
+LLVM_READONLY static inline bool isPrintable(unsigned char c) {
+  using namespace charinfo;
+  const unsigned PrintableMask = CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD |
+                                 CHAR_PUNCT | CHAR_DIGIT | CHAR_UNDER |
+                                 CHAR_RAWDEL | CHAR_SPACE;
+  return (InfoTable[c] & PrintableMask) != 0;
+}
+
+/// Return true if \p c may appear in the body of a C preprocessing number,
+/// i.e. matches [a-zA-Z0-9_.].
+LLVM_READONLY static inline bool isPreprocessingNumberBody(unsigned char c) {
+  using namespace charinfo;
+  const unsigned PPNumberMask =
+      CHAR_UPPER | CHAR_LOWER | CHAR_DIGIT | CHAR_UNDER | CHAR_PERIOD;
+  return (InfoTable[c] & PPNumberMask) != 0;
+}
+
+/// Return true if \p c may appear in the delimiter of a C++ raw string.
+LLVM_READONLY static inline bool isRawStringDelimBody(unsigned char c) {
+  using namespace charinfo;
+  const unsigned DelimMask = CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD |
+                             CHAR_DIGIT | CHAR_UNDER | CHAR_RAWDEL;
+  return (InfoTable[c] & DelimMask) != 0;
+}
+
+} // end namespace clang
+
+#endif
Modified: cfe/trunk/lib/Basic/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/CMakeLists.txt?rev=174765&r1=174764&r2=174765&view=diff
==============================================================================
--- cfe/trunk/lib/Basic/CMakeLists.txt (original)
+++ cfe/trunk/lib/Basic/CMakeLists.txt Fri Feb 8 16:30:22 2013
@@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS mc)
add_clang_library(clangBasic
Builtins.cpp
+ CharInfo.cpp
Diagnostic.cpp
DiagnosticIDs.cpp
FileManager.cpp
Added: cfe/trunk/lib/Basic/CharInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/CharInfo.cpp?rev=174765&view=auto
==============================================================================
--- cfe/trunk/lib/Basic/CharInfo.cpp (added)
+++ cfe/trunk/lib/Basic/CharInfo.cpp Fri Feb 8 16:30:22 2013
@@ -0,0 +1,80 @@
+//===--- CharInfo.cpp - Static Data for Classifying ASCII Characters ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/CharInfo.h"
+
+// Statically initialize CharInfo table based on ASCII character set
+// Reference: FreeBSD 7.2 /usr/share/misc/ascii
+//
+// Each pair of comment lines names eight consecutive character codes; the two
+// value lines that follow give their flags in the same order. Only the first
+// 128 entries (the ASCII range) are spelled out; the remaining entries of the
+// 256-element array are zero-initialized, so bytes 128-255 belong to no class.
+const uint16_t clang::charinfo::InfoTable[256] =
+{
+ // 0 NUL 1 SOH 2 STX 3 ETX
+ // 4 EOT 5 ENQ 6 ACK 7 BEL
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+ // 8 BS 9 HT 10 NL 11 VT
+ //12 NP 13 CR 14 SO 15 SI
+ 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
+ CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
+ //16 DLE 17 DC1 18 DC2 19 DC3
+ //20 DC4 21 NAK 22 SYN 23 ETB
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+ //24 CAN 25 EM 26 SUB 27 ESC
+ //28 FS 29 GS 30 RS 31 US
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+ //32 SP 33 ! 34 " 35 #
+ //36 $ 37 % 38 & 39 '
+ CHAR_SPACE , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+ //40 ( 41 ) 42 * 43 +
+ //44 , 45 - 46 . 47 /
+ CHAR_PUNCT , CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
+ //48 0 49 1 50 2 51 3
+ //52 4 53 5 54 6 55 7
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT ,
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT ,
+ //56 8 57 9 58 : 59 ;
+ //60 < 61 = 62 > 63 ?
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+ //64 @ 65 A 66 B 67 C
+ //68 D 69 E 70 F 71 G
+ CHAR_PUNCT , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER ,
+ CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER ,
+ //72 H 73 I 74 J 75 K
+ //76 L 77 M 78 N 79 O
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ //80 P 81 Q 82 R 83 S
+ //84 T 85 U 86 V 87 W
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ //88 X 89 Y 90 Z 91 [
+ //92 \ 93 ] 94 ^ 95 _
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_RAWDEL ,
+ CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
+ //96 ` 97 a 98 b 99 c
+ //100 d 101 e 102 f 103 g
+ CHAR_PUNCT , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER ,
+ CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER ,
+ //104 h 105 i 106 j 107 k
+ //108 l 109 m 110 n 111 o
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ //112 p 113 q 114 r 115 s
+ //116 t 117 u 118 v 119 w
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ //120 x 121 y 122 z 123 {
+ //124 | 125 } 126 ~ 127 DEL
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
+};
Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=174765&r1=174764&r2=174765&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Fri Feb 8 16:30:22 2013
@@ -25,6 +25,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Lex/Lexer.h"
+#include "clang/Basic/CharInfo.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Lex/LexDiagnostic.h"
@@ -38,8 +39,6 @@
#include <cstring>
using namespace clang;
-static void InitCharacterInfo();
-
//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//
@@ -66,8 +65,6 @@ void Lexer::anchor() { }
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
const char *BufEnd) {
- InitCharacterInfo();
-
BufferStart = BufStart;
BufferPtr = BufPtr;
BufferEnd = BufEnd;
@@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token
}
-
-static bool isWhitespace(unsigned char c);
-
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
@@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(S
return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
-//===----------------------------------------------------------------------===//
-// Character information.
-//===----------------------------------------------------------------------===//
-
-enum {
- CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
- CHAR_VERT_WS = 0x02, // '\r', '\n'
- CHAR_LETTER = 0x04, // a-z,A-Z
- CHAR_NUMBER = 0x08, // 0-9
- CHAR_UNDER = 0x10, // _
- CHAR_PERIOD = 0x20, // .
- CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"'
-};
-
-// Statically initialize CharInfo table based on ASCII character set
-// Reference: FreeBSD 7.2 /usr/share/misc/ascii
-static const unsigned char CharInfo[256] =
-{
-// 0 NUL 1 SOH 2 STX 3 ETX
-// 4 EOT 5 ENQ 6 ACK 7 BEL
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-// 8 BS 9 HT 10 NL 11 VT
-//12 NP 13 CR 14 SO 15 SI
- 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
- CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
-//16 DLE 17 DC1 18 DC2 19 DC3
-//20 DC4 21 NAK 22 SYN 23 ETB
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-//24 CAN 25 EM 26 SUB 27 ESC
-//28 FS 29 GS 30 RS 31 US
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-//32 SP 33 ! 34 " 35 #
-//36 $ 37 % 38 & 39 '
- CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
- 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//40 ( 41 ) 42 * 43 +
-//44 , 45 - 46 . 47 /
- 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
-//48 0 49 1 50 2 51 3
-//52 4 53 5 54 6 55 7
- CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
- CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
-//56 8 57 9 58 : 59 ;
-//60 < 61 = 62 > 63 ?
- CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//64 @ 65 A 66 B 67 C
-//68 D 69 E 70 F 71 G
- 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//72 H 73 I 74 J 75 K
-//76 L 77 M 78 N 79 O
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//80 P 81 Q 82 R 83 S
-//84 T 85 U 86 V 87 W
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//88 X 89 Y 90 Z 91 [
-//92 \ 93 ] 94 ^ 95 _
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
- 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
-//96 ` 97 a 98 b 99 c
-//100 d 101 e 102 f 103 g
- 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//104 h 105 i 106 j 107 k
-//108 l 109 m 110 n 111 o
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//112 p 113 q 114 r 115 s
-//116 t 117 u 118 v 119 w
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//120 x 121 y 122 z 123 {
-//124 | 125 } 126 ~ 127 DEL
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
-};
-
-static void InitCharacterInfo() {
- static bool isInited = false;
- if (isInited) return;
- // check the statically-initialized CharInfo table
- assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
- assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
- assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
- assert(CHAR_UNDER == CharInfo[(int)'_']);
- assert(CHAR_PERIOD == CharInfo[(int)'.']);
- for (unsigned i = 'a'; i <= 'z'; ++i) {
- assert(CHAR_LETTER == CharInfo[i]);
- assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
- }
- for (unsigned i = '0'; i <= '9'; ++i)
- assert(CHAR_NUMBER == CharInfo[i]);
-
- isInited = true;
-}
-
-
-/// isIdentifierHead - Return true if this is the first character of an
-/// identifier, which is [a-zA-Z_].
-static inline bool isIdentifierHead(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
-}
-
-/// isIdentifierBody - Return true if this is the body character of an
-/// identifier, which is [a-zA-Z0-9_].
-static inline bool isIdentifierBody(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
-}
-
-/// isHorizontalWhitespace - Return true if this character is horizontal
-/// whitespace: ' ', '\\t', '\\f', '\\v'. Note that this returns false for
-/// '\\0'.
-static inline bool isHorizontalWhitespace(unsigned char c) {
- return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
-}
-
-/// isVerticalWhitespace - Return true if this character is vertical
-/// whitespace: '\\n', '\\r'. Note that this returns false for '\\0'.
-static inline bool isVerticalWhitespace(unsigned char c) {
- return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
-}
-
-/// isWhitespace - Return true if this character is horizontal or vertical
-/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. Note that this returns
-/// false for '\\0'.
-static inline bool isWhitespace(unsigned char c) {
- return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
-}
-
-/// isNumberBody - Return true if this is the body character of an
-/// preprocessing number, which is [a-zA-Z0-9_.].
-static inline bool isNumberBody(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
- true : false;
-}
-
-/// isRawStringDelimBody - Return true if this is the body character of a
-/// raw string delimiter.
-static inline bool isRawStringDelimBody(unsigned char c) {
- return (CharInfo[c] &
- (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
- true : false;
-}
-
-// Allow external clients to make use of CharInfo.
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
- return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents);
+ return isIdentifierBody(c, LangOpts.DollarIdents);
}
@@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uin
!(0xFE20 <= c && c <= 0xFE2F);
}
-static inline bool isASCII(char C) {
- return static_cast<signed char>(C) >= 0;
-}
-
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result,
// Fast path, no $,\,? in identifier found. '\' might be an escaped newline
// or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
//
- // TODO: Could merge these checks into a CharInfo flag to make the comparison
- // cheaper
+ // TODO: Could merge these checks into an InfoTable flag to make the
+ // comparison cheaper
if (isASCII(C) && C != '\\' && C != '?' &&
(C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
@@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Re
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
char PrevCh = 0;
- while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+ while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
CurPtr = ConsumeChar(CurPtr, Size, Result);
PrevCh = C;
C = getCharAndSize(CurPtr, Size);
Modified: cfe/trunk/unittests/Basic/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Basic/CMakeLists.txt?rev=174765&r1=174764&r2=174765&view=diff
==============================================================================
--- cfe/trunk/unittests/Basic/CMakeLists.txt (original)
+++ cfe/trunk/unittests/Basic/CMakeLists.txt Fri Feb 8 16:30:22 2013
@@ -1,4 +1,5 @@
add_clang_unittest(BasicTests
+ CharInfoTest.cpp
FileManagerTest.cpp
SourceManagerTest.cpp
)
Added: cfe/trunk/unittests/Basic/CharInfoTest.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Basic/CharInfoTest.cpp?rev=174765&view=auto
==============================================================================
--- cfe/trunk/unittests/Basic/CharInfoTest.cpp (added)
+++ cfe/trunk/unittests/Basic/CharInfoTest.cpp Fri Feb 8 16:30:22 2013
@@ -0,0 +1,377 @@
+//===- unittests/Basic/CharInfoTest.cpp -- ASCII classification tests -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/CharInfo.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace clang;
+
+// Check that the CharInfo table has been constructed reasonably.
+// Each expectation compares the raw table entry against the exact flag value,
+// so a character carrying any extra flag would fail here.
+TEST(CharInfoTest, validateInfoTable) {
+ using namespace charinfo;
+ EXPECT_EQ((unsigned)CHAR_SPACE, InfoTable[(unsigned)' ']);
+ EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\t']);
+ EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\f']); // form feed is classed as horizontal
+ EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\v']); // vertical tab is classed as horizontal
+ EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\n']);
+ EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\r']);
+ EXPECT_EQ((unsigned)CHAR_UNDER, InfoTable[(unsigned)'_']);
+ EXPECT_EQ((unsigned)CHAR_PERIOD, InfoTable[(unsigned)'.']);
+
+ // a-f / A-F are letters that double as hex digits.
+ for (unsigned i = 'a'; i <= 'f'; ++i) {
+ EXPECT_EQ((unsigned)CHAR_XLOWER, InfoTable[i]);
+ EXPECT_EQ((unsigned)CHAR_XUPPER, InfoTable[i+'A'-'a']);
+ }
+
+ // The remaining letters carry only their case flag.
+ for (unsigned i = 'g'; i <= 'z'; ++i) {
+ EXPECT_EQ((unsigned)CHAR_LOWER, InfoTable[i]);
+ EXPECT_EQ((unsigned)CHAR_UPPER, InfoTable[i+'A'-'a']);
+ }
+
+ for (unsigned i = '0'; i <= '9'; ++i)
+ EXPECT_EQ((unsigned)CHAR_DIGIT, InfoTable[i]);
+}
+
+// Check various predicates.
+TEST(CharInfoTest, isASCII) {
+ EXPECT_TRUE(isASCII('\0'));
+ EXPECT_TRUE(isASCII('\n'));
+ EXPECT_TRUE(isASCII(' '));
+ EXPECT_TRUE(isASCII('a'));
+ EXPECT_TRUE(isASCII('\x7f'));
+ EXPECT_FALSE(isASCII('\x80'));
+ EXPECT_FALSE(isASCII('\xc2'));
+ EXPECT_FALSE(isASCII('\xff'));
+}
+
+TEST(CharInfoTest, isIdentifierHead) {
+ EXPECT_TRUE(isIdentifierHead('a'));
+ EXPECT_TRUE(isIdentifierHead('A'));
+ EXPECT_TRUE(isIdentifierHead('z'));
+ EXPECT_TRUE(isIdentifierHead('Z'));
+ EXPECT_TRUE(isIdentifierHead('_'));
+
+ EXPECT_FALSE(isIdentifierHead('0'));
+ EXPECT_FALSE(isIdentifierHead('.'));
+ EXPECT_FALSE(isIdentifierHead('`'));
+ EXPECT_FALSE(isIdentifierHead('\0'));
+
+ EXPECT_FALSE(isIdentifierHead('$'));
+ EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true));
+
+ EXPECT_FALSE(isIdentifierHead('\x80'));
+ EXPECT_FALSE(isIdentifierHead('\xc2'));
+ EXPECT_FALSE(isIdentifierHead('\xff'));
+}
+
+TEST(CharInfoTest, isIdentifierBody) {
+ EXPECT_TRUE(isIdentifierBody('a'));
+ EXPECT_TRUE(isIdentifierBody('A'));
+ EXPECT_TRUE(isIdentifierBody('z'));
+ EXPECT_TRUE(isIdentifierBody('Z'));
+ EXPECT_TRUE(isIdentifierBody('_'));
+
+ EXPECT_TRUE(isIdentifierBody('0'));
+ EXPECT_FALSE(isIdentifierBody('.'));
+ EXPECT_FALSE(isIdentifierBody('`'));
+ EXPECT_FALSE(isIdentifierBody('\0'));
+
+ EXPECT_FALSE(isIdentifierBody('$'));
+ EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true));
+
+ EXPECT_FALSE(isIdentifierBody('\x80'));
+ EXPECT_FALSE(isIdentifierBody('\xc2'));
+ EXPECT_FALSE(isIdentifierBody('\xff'));
+}
+
+// Spot-check isHorizontalWhitespace: space, tab, form feed and vertical tab
+// are accepted; newlines, NUL, DEL and non-ASCII bytes are rejected.
+TEST(CharInfoTest, isHorizontalWhitespace) {
+ EXPECT_FALSE(isHorizontalWhitespace('a'));
+ EXPECT_FALSE(isHorizontalWhitespace('_'));
+ EXPECT_FALSE(isHorizontalWhitespace('0'));
+ EXPECT_FALSE(isHorizontalWhitespace('.'));
+ EXPECT_FALSE(isHorizontalWhitespace('`'));
+ EXPECT_FALSE(isHorizontalWhitespace('\0'));
+ EXPECT_FALSE(isHorizontalWhitespace('\x7f'));
+
+ EXPECT_TRUE(isHorizontalWhitespace(' '));
+ EXPECT_TRUE(isHorizontalWhitespace('\t'));
+ EXPECT_TRUE(isHorizontalWhitespace('\f')); // form feed is classed as horizontal
+ EXPECT_TRUE(isHorizontalWhitespace('\v')); // vertical tab is classed as horizontal
+
+ EXPECT_FALSE(isHorizontalWhitespace('\n'));
+ EXPECT_FALSE(isHorizontalWhitespace('\r'));
+
+ EXPECT_FALSE(isHorizontalWhitespace('\x80'));
+ EXPECT_FALSE(isHorizontalWhitespace('\xc2'));
+ EXPECT_FALSE(isHorizontalWhitespace('\xff'));
+}
+
+// Spot-check isVerticalWhitespace: only '\n' and '\r' are accepted.
+TEST(CharInfoTest, isVerticalWhitespace) {
+ EXPECT_FALSE(isVerticalWhitespace('a'));
+ EXPECT_FALSE(isVerticalWhitespace('_'));
+ EXPECT_FALSE(isVerticalWhitespace('0'));
+ EXPECT_FALSE(isVerticalWhitespace('.'));
+ EXPECT_FALSE(isVerticalWhitespace('`'));
+ EXPECT_FALSE(isVerticalWhitespace('\0'));
+ EXPECT_FALSE(isVerticalWhitespace('\x7f'));
+
+ EXPECT_FALSE(isVerticalWhitespace(' '));
+ EXPECT_FALSE(isVerticalWhitespace('\t'));
+ EXPECT_FALSE(isVerticalWhitespace('\f')); // form feed is classed as horizontal, not vertical
+ EXPECT_FALSE(isVerticalWhitespace('\v')); // vertical tab is classed as horizontal, not vertical
+
+ EXPECT_TRUE(isVerticalWhitespace('\n'));
+ EXPECT_TRUE(isVerticalWhitespace('\r'));
+
+ EXPECT_FALSE(isVerticalWhitespace('\x80'));
+ EXPECT_FALSE(isVerticalWhitespace('\xc2'));
+ EXPECT_FALSE(isVerticalWhitespace('\xff'));
+}
+
+TEST(CharInfoTest, isWhitespace) {
+ EXPECT_FALSE(isWhitespace('a'));
+ EXPECT_FALSE(isWhitespace('_'));
+ EXPECT_FALSE(isWhitespace('0'));
+ EXPECT_FALSE(isWhitespace('.'));
+ EXPECT_FALSE(isWhitespace('`'));
+ EXPECT_FALSE(isWhitespace('\0'));
+ EXPECT_FALSE(isWhitespace('\x7f'));
+
+ EXPECT_TRUE(isWhitespace(' '));
+ EXPECT_TRUE(isWhitespace('\t'));
+ EXPECT_TRUE(isWhitespace('\f'));
+ EXPECT_TRUE(isWhitespace('\v'));
+
+ EXPECT_TRUE(isWhitespace('\n'));
+ EXPECT_TRUE(isWhitespace('\r'));
+
+ EXPECT_FALSE(isWhitespace('\x80'));
+ EXPECT_FALSE(isWhitespace('\xc2'));
+ EXPECT_FALSE(isWhitespace('\xff'));
+}
+
+TEST(CharInfoTest, isDigit) {
+ EXPECT_TRUE(isDigit('0'));
+ EXPECT_TRUE(isDigit('9'));
+
+ EXPECT_FALSE(isDigit('a'));
+ EXPECT_FALSE(isDigit('A'));
+
+ EXPECT_FALSE(isDigit('z'));
+ EXPECT_FALSE(isDigit('Z'));
+
+ EXPECT_FALSE(isDigit('.'));
+ EXPECT_FALSE(isDigit('_'));
+
+ EXPECT_FALSE(isDigit('/'));
+ EXPECT_FALSE(isDigit('\0'));
+
+ EXPECT_FALSE(isDigit('\x80'));
+ EXPECT_FALSE(isDigit('\xc2'));
+ EXPECT_FALSE(isDigit('\xff'));
+}
+
+TEST(CharInfoTest, isHexDigit) {
+ EXPECT_TRUE(isHexDigit('0'));
+ EXPECT_TRUE(isHexDigit('9'));
+
+ EXPECT_TRUE(isHexDigit('a'));
+ EXPECT_TRUE(isHexDigit('A'));
+
+ EXPECT_FALSE(isHexDigit('z'));
+ EXPECT_FALSE(isHexDigit('Z'));
+
+ EXPECT_FALSE(isHexDigit('.'));
+ EXPECT_FALSE(isHexDigit('_'));
+
+ EXPECT_FALSE(isHexDigit('/'));
+ EXPECT_FALSE(isHexDigit('\0'));
+
+ EXPECT_FALSE(isHexDigit('\x80'));
+ EXPECT_FALSE(isHexDigit('\xc2'));
+ EXPECT_FALSE(isHexDigit('\xff'));
+}
+
+TEST(CharInfoTest, isLetter) {
+ EXPECT_FALSE(isLetter('0'));
+ EXPECT_FALSE(isLetter('9'));
+
+ EXPECT_TRUE(isLetter('a'));
+ EXPECT_TRUE(isLetter('A'));
+
+ EXPECT_TRUE(isLetter('z'));
+ EXPECT_TRUE(isLetter('Z'));
+
+ EXPECT_FALSE(isLetter('.'));
+ EXPECT_FALSE(isLetter('_'));
+
+ EXPECT_FALSE(isLetter('/'));
+ EXPECT_FALSE(isLetter('('));
+ EXPECT_FALSE(isLetter('\0'));
+
+ EXPECT_FALSE(isLetter('\x80'));
+ EXPECT_FALSE(isLetter('\xc2'));
+ EXPECT_FALSE(isLetter('\xff'));
+}
+
+TEST(CharInfoTest, isLowercase) {
+ EXPECT_FALSE(isLowercase('0'));
+ EXPECT_FALSE(isLowercase('9'));
+
+ EXPECT_TRUE(isLowercase('a'));
+ EXPECT_FALSE(isLowercase('A'));
+
+ EXPECT_TRUE(isLowercase('z'));
+ EXPECT_FALSE(isLowercase('Z'));
+
+ EXPECT_FALSE(isLowercase('.'));
+ EXPECT_FALSE(isLowercase('_'));
+
+ EXPECT_FALSE(isLowercase('/'));
+ EXPECT_FALSE(isLowercase('('));
+ EXPECT_FALSE(isLowercase('\0'));
+
+ EXPECT_FALSE(isLowercase('\x80'));
+ EXPECT_FALSE(isLowercase('\xc2'));
+ EXPECT_FALSE(isLowercase('\xff'));
+}
+
+TEST(CharInfoTest, isUppercase) {
+ EXPECT_FALSE(isUppercase('0'));
+ EXPECT_FALSE(isUppercase('9'));
+
+ EXPECT_FALSE(isUppercase('a'));
+ EXPECT_TRUE(isUppercase('A'));
+
+ EXPECT_FALSE(isUppercase('z'));
+ EXPECT_TRUE(isUppercase('Z'));
+
+ EXPECT_FALSE(isUppercase('.'));
+ EXPECT_FALSE(isUppercase('_'));
+
+ EXPECT_FALSE(isUppercase('/'));
+ EXPECT_FALSE(isUppercase('('));
+ EXPECT_FALSE(isUppercase('\0'));
+
+ EXPECT_FALSE(isUppercase('\x80'));
+ EXPECT_FALSE(isUppercase('\xc2'));
+ EXPECT_FALSE(isUppercase('\xff'));
+}
+
+TEST(CharInfoTest, isAlphanumeric) {
+ EXPECT_TRUE(isAlphanumeric('0'));
+ EXPECT_TRUE(isAlphanumeric('9'));
+
+ EXPECT_TRUE(isAlphanumeric('a'));
+ EXPECT_TRUE(isAlphanumeric('A'));
+
+ EXPECT_TRUE(isAlphanumeric('z'));
+ EXPECT_TRUE(isAlphanumeric('Z'));
+
+ EXPECT_FALSE(isAlphanumeric('.'));
+ EXPECT_FALSE(isAlphanumeric('_'));
+
+ EXPECT_FALSE(isAlphanumeric('/'));
+ EXPECT_FALSE(isAlphanumeric('('));
+ EXPECT_FALSE(isAlphanumeric('\0'));
+
+ EXPECT_FALSE(isAlphanumeric('\x80'));
+ EXPECT_FALSE(isAlphanumeric('\xc2'));
+ EXPECT_FALSE(isAlphanumeric('\xff'));
+}
+
+TEST(CharInfoTest, isPunctuation) {
+ EXPECT_FALSE(isPunctuation('0'));
+ EXPECT_FALSE(isPunctuation('9'));
+
+ EXPECT_FALSE(isPunctuation('a'));
+ EXPECT_FALSE(isPunctuation('A'));
+
+ EXPECT_FALSE(isPunctuation('z'));
+ EXPECT_FALSE(isPunctuation('Z'));
+
+ EXPECT_TRUE(isPunctuation('.'));
+ EXPECT_TRUE(isPunctuation('_'));
+
+ EXPECT_TRUE(isPunctuation('/'));
+ EXPECT_TRUE(isPunctuation('('));
+
+ EXPECT_FALSE(isPunctuation(' '));
+ EXPECT_FALSE(isPunctuation('\n'));
+ EXPECT_FALSE(isPunctuation('\0'));
+
+ EXPECT_FALSE(isPunctuation('\x80'));
+ EXPECT_FALSE(isPunctuation('\xc2'));
+ EXPECT_FALSE(isPunctuation('\xff'));
+}
+
+TEST(CharInfoTest, isPrintable) {
+ EXPECT_TRUE(isPrintable('0'));
+ EXPECT_TRUE(isPrintable('9'));
+
+ EXPECT_TRUE(isPrintable('a'));
+ EXPECT_TRUE(isPrintable('A'));
+
+ EXPECT_TRUE(isPrintable('z'));
+ EXPECT_TRUE(isPrintable('Z'));
+
+ EXPECT_TRUE(isPrintable('.'));
+ EXPECT_TRUE(isPrintable('_'));
+
+ EXPECT_TRUE(isPrintable('/'));
+ EXPECT_TRUE(isPrintable('('));
+
+ EXPECT_TRUE(isPrintable(' '));
+ EXPECT_FALSE(isPrintable('\t'));
+ EXPECT_FALSE(isPrintable('\n'));
+ EXPECT_FALSE(isPrintable('\0'));
+
+ EXPECT_FALSE(isPrintable('\x80'));
+ EXPECT_FALSE(isPrintable('\xc2'));
+ EXPECT_FALSE(isPrintable('\xff'));
+}
+
+TEST(CharInfoTest, isPreprocessingNumberBody) {
+ EXPECT_TRUE(isPreprocessingNumberBody('0'));
+ EXPECT_TRUE(isPreprocessingNumberBody('9'));
+
+ EXPECT_TRUE(isPreprocessingNumberBody('a'));
+ EXPECT_TRUE(isPreprocessingNumberBody('A'));
+
+ EXPECT_TRUE(isPreprocessingNumberBody('z'));
+ EXPECT_TRUE(isPreprocessingNumberBody('Z'));
+ EXPECT_TRUE(isPreprocessingNumberBody('.'));
+ EXPECT_TRUE(isPreprocessingNumberBody('_'));
+
+ EXPECT_FALSE(isPreprocessingNumberBody('/'));
+ EXPECT_FALSE(isPreprocessingNumberBody('('));
+ EXPECT_FALSE(isPreprocessingNumberBody('\0'));
+
+ EXPECT_FALSE(isPreprocessingNumberBody('\x80'));
+ EXPECT_FALSE(isPreprocessingNumberBody('\xc2'));
+ EXPECT_FALSE(isPreprocessingNumberBody('\xff'));
+}
+
+TEST(CharInfoTest, isRawStringDelimBody) {
+ EXPECT_TRUE(isRawStringDelimBody('0'));
+ EXPECT_TRUE(isRawStringDelimBody('9'));
+
+ EXPECT_TRUE(isRawStringDelimBody('a'));
+ EXPECT_TRUE(isRawStringDelimBody('A'));
+
+ EXPECT_TRUE(isRawStringDelimBody('z'));
+ EXPECT_TRUE(isRawStringDelimBody('Z'));
+ EXPECT_TRUE(isRawStringDelimBody('.'));
+ EXPECT_TRUE(isRawStringDelimBody('_'));
+
+ EXPECT_TRUE(isRawStringDelimBody('/'));
+ EXPECT_FALSE(isRawStringDelimBody('('));
+ EXPECT_FALSE(isRawStringDelimBody('\0'));
+}
More information about the cfe-commits
mailing list