[cfe-commits] r57401 - in /cfe/trunk: include/clang/Lex/Lexer.h lib/Lex/Lexer.cpp
Chris Lattner
sabre at nondot.org
Sat Oct 11 21:05:49 PDT 2008
Author: lattner
Date: Sat Oct 11 23:05:48 2008
New Revision: 57401
URL: http://llvm.org/viewvc/llvm-project?rev=57401&view=rev
Log:
Add a new mode to the lexer which enables it to return all characters,
even whitespace, as tokens from the file. This is enabled with
L->SetKeepWhitespaceMode(true) on a raw lexer. In this mode, you too
can use clang as a really complex version of 'cat' with code like this:
Lexer RawLex(SourceLocation::getFileLoc(SM.getMainFileID(), 0),
PP.getLangOptions(), File.first, File.second);
RawLex.SetKeepWhitespaceMode(true);
Token RawTok;
RawLex.LexFromRawLexer(RawTok);
while (RawTok.isNot(tok::eof)) {
std::cout << PP.getSpelling(RawTok);
RawLex.LexFromRawLexer(RawTok);
}
This will emit exactly the input file, with no canonicalization or other
translation. Realistic clients actually do something with the tokens of
course :)
Modified:
cfe/trunk/include/clang/Lex/Lexer.h
cfe/trunk/lib/Lex/Lexer.cpp
Modified: cfe/trunk/include/clang/Lex/Lexer.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=57401&r1=57400&r2=57401&view=diff
==============================================================================
--- cfe/trunk/include/clang/Lex/Lexer.h (original)
+++ cfe/trunk/include/clang/Lex/Lexer.h Sat Oct 11 23:05:48 2008
@@ -66,9 +66,14 @@
/// Note that in raw mode the PP pointer may be null.
bool LexingRawMode;
- /// KeepCommentMode - The lexer can optionally keep C & BCPL-style comments,
- /// and return them as tokens. This is used for -C and -CC modes.
- bool KeepCommentMode;
+ /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
+ /// and return them as tokens. This is used for -C and -CC modes, and
+ /// whitespace preservation can be useful for some clients that want to lex
+ /// the file in raw mode and get every character from the file.
+ ///
+ /// When this is set to 2 it returns comments and whitespace. When set to 1
+ /// it returns comments. When it is set to 0 it returns normal tokens only.
+ unsigned char ExtendedTokenMode;
//===--------------------------------------------------------------------===//
// Context that changes as the file is lexed.
@@ -150,18 +155,36 @@
// lexer when in raw mode.
return BufferPtr == BufferEnd;
}
+
+ /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
+ /// every character in the file, including whitespace and comments. This
+ /// should only be used in raw mode, as the preprocessor is not prepared to
+ /// deal with the excess tokens.
+ bool isKeepWhitespaceMode() const {
+ return ExtendedTokenMode > 1;
+ }
+
+ /// SetKeepWhitespaceMode - This method lets clients enable or disable
+ /// whitespace retention mode.
+ void SetKeepWhitespaceMode(bool Val) {
+ assert((!Val || LexingRawMode) &&
+ "Can only enable whitespace retention in raw mode");
+ ExtendedTokenMode = Val ? 2 : 0;
+ }
+
+ /// inKeepCommentMode - Return true if the lexer should return comments as
+ /// tokens.
+ bool inKeepCommentMode() const {
+ return ExtendedTokenMode > 0;
+ }
/// SetCommentRetentionState - Change the comment retention mode of the lexer
/// to the specified mode. This is really only useful when lexing in raw
/// mode, because otherwise the lexer needs to manage this.
void SetCommentRetentionState(bool Mode) {
- KeepCommentMode = Mode;
- }
-
- /// inKeepCommentMode - Return true if the lexer should return comments as
- /// tokens.
- bool inKeepCommentMode() const {
- return KeepCommentMode;
+ assert(!isKeepWhitespaceMode() &&
+ "Can't play with comment retention state when retaining whitespace");
+ ExtendedTokenMode = Mode ? 1 : 0;
}
@@ -370,7 +393,7 @@
void LexCharConstant (Token &Result, const char *CurPtr);
bool LexEndOfFile (Token &Result, const char *CurPtr);
- void SkipWhitespace (Token &Result, const char *CurPtr);
+ bool SkipWhitespace (Token &Result, const char *CurPtr);
bool SkipBCPLComment (Token &Result, const char *CurPtr);
bool SkipBlockComment (Token &Result, const char *CurPtr);
bool SaveBCPLComment (Token &Result, const char *CurPtr);
Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=57401&r1=57400&r2=57401&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Sat Oct 11 23:05:48 2008
@@ -99,8 +99,8 @@
// or otherwise skipping over tokens.
LexingRawMode = false;
- // Default to keeping comments if requested.
- KeepCommentMode = false;
+ // Default to keeping comments if the preprocessor wants them.
+ ExtendedTokenMode = 0;
SetCommentRetentionState(PP->getCommentRetentionState());
}
@@ -137,7 +137,7 @@
LexingRawMode = true;
// Default to not keeping comments in raw mode.
- KeepCommentMode = false;
+ ExtendedTokenMode = 0;
}
@@ -591,7 +591,7 @@
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L".
-void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide){
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
const char *NulCharacter = 0; // Does this string contain the \0 character?
char C = getAndAdvanceChar(CurPtr, Result);
@@ -704,7 +704,10 @@
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
-void Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
+///
+/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
+///
+bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
// Whitespace - Skip it, then return the token after the whitespace.
unsigned char Char = *CurPtr; // Skip consecutive spaces efficiently.
while (1) {
@@ -719,7 +722,7 @@
if (ParsingPreprocessorDirective) {
// End of preprocessor directive line, let LexTokenInternal handle this.
BufferPtr = CurPtr;
- return;
+ return false;
}
// ok, but handle newline.
@@ -735,7 +738,15 @@
if (PrevChar != '\n' && PrevChar != '\r')
Result.setFlag(Token::LeadingSpace);
+ // If the client wants us to return whitespace, return it now.
+ if (isKeepWhitespaceMode()) {
+ Result.setKind(tok::unknown);
+ FormTokenWithChars(Result, CurPtr);
+ return true;
+ }
+
BufferPtr = CurPtr;
+ return false;
}
// SkipBCPLComment - We have just read the // characters from input. Skip until
@@ -817,7 +828,9 @@
// Otherwise, eat the \n character. We don't care if this is a \n\r or
// \r\n sequence. This is an efficiency hack (because we know the \n can't
- // contribute to another token), it isn't needed for correctness.
+ // contribute to another token), it isn't needed for correctness. Note that
+ // this is ok even in KeepWhitespaceMode, because we would have returned the
+ // comment above in that mode.
++CurPtr;
// The next returned token is at the start of the line.
@@ -832,11 +845,16 @@
/// an appropriate way and return it.
bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
Result.setKind(tok::comment);
- FormTokenWithChars(Result, CurPtr);
- // If this BCPL-style comment is in a macro definition, transmogrify it into
- // a C-style block comment.
- if (ParsingPreprocessorDirective) {
+ if (!ParsingPreprocessorDirective) {
+ // If we're not in a preprocessor directive, just return the // comment
+ // directly.
+ FormTokenWithChars(Result, CurPtr);
+ } else {
+ // If this BCPL-style comment is in a macro definition, transmogrify it into
+ // a C-style block comment.
+ BufferPtr = CurPtr;
+
std::string Spelling = PP->getSpelling(Result);
assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
Spelling[1] = '*'; // Change prefix to "/*".
@@ -1024,7 +1042,8 @@
// It is common for the tokens immediately after a /**/ comment to be
// whitespace. Instead of going through the big switch, handle it
- // efficiently now.
+ // efficiently now. This is safe even in KeepWhitespaceMode because we would
+ // have already returned above with the comment as a token.
if (isHorizontalWhitespace(*CurPtr)) {
Result.setFlag(Token::LeadingSpace);
SkipWhitespace(Result, CurPtr+1);
@@ -1203,6 +1222,16 @@
++CurPtr;
while ((*CurPtr == ' ') || (*CurPtr == '\t'))
++CurPtr;
+
+ // If we are keeping whitespace and other tokens, just return what we just
+ // skipped. The next lexer invocation will return the token after the
+ // whitespace.
+ if (isKeepWhitespaceMode()) {
+ Result.setKind(tok::unknown);
+ FormTokenWithChars(Result, CurPtr);
+ return;
+ }
+
BufferPtr = CurPtr;
Result.setFlag(Token::LeadingSpace);
}
@@ -1226,7 +1255,9 @@
Diag(CurPtr-1, diag::null_in_file);
Result.setFlag(Token::LeadingSpace);
- SkipWhitespace(Result, CurPtr);
+ if (SkipWhitespace(Result, CurPtr))
+ return; // KeepWhitespaceMode
+
goto LexNextToken; // GCC isn't tail call eliminating.
case '\n':
case '\r':
@@ -1249,7 +1280,9 @@
Result.setFlag(Token::StartOfLine);
// No leading whitespace seen so far.
Result.clearFlag(Token::LeadingSpace);
- SkipWhitespace(Result, CurPtr);
+
+ if (SkipWhitespace(Result, CurPtr))
+ return; // KeepWhitespaceMode
goto LexNextToken; // GCC isn't tail call eliminating.
case ' ':
case '\t':
@@ -1257,7 +1290,8 @@
case '\v':
SkipHorizontalWhitespace:
Result.setFlag(Token::LeadingSpace);
- SkipWhitespace(Result, CurPtr);
+ if (SkipWhitespace(Result, CurPtr))
+ return; // KeepWhitespaceMode
SkipIgnoredUnits:
CurPtr = BufferPtr;
More information about the cfe-commits
mailing list