r310576 - [Lexer] Finding beginning of token with escaped new line
Alexander Kornienko via cfe-commits
cfe-commits at lists.llvm.org
Thu Aug 10 03:06:17 PDT 2017
Author: alexfh
Date: Thu Aug 10 03:06:16 2017
New Revision: 310576
URL: http://llvm.org/viewvc/llvm-project?rev=310576&view=rev
Log:
[Lexer] Finding beginning of token with escaped new line
Summary:
Lexer::GetBeginningOfToken produced invalid location when
backtracking across escaped new lines.
This fixes PR26228
Reviewers: akyrtzi, alexfh, rsmith, doug.gregor
Reviewed By: alexfh
Subscribers: alexfh, cfe-commits
Patch by Paweł Żukowski!
Differential Revision: https://reviews.llvm.org/D30748
Modified:
cfe/trunk/include/clang/Lex/Lexer.h
cfe/trunk/lib/Lex/Lexer.cpp
cfe/trunk/unittests/Lex/LexerTest.cpp
Modified: cfe/trunk/include/clang/Lex/Lexer.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=310576&r1=310575&r2=310576&view=diff
==============================================================================
--- cfe/trunk/include/clang/Lex/Lexer.h (original)
+++ cfe/trunk/include/clang/Lex/Lexer.h Thu Aug 10 03:06:16 2017
@@ -463,6 +463,10 @@ public:
/// \brief Returns true if the given character could appear in an identifier.
static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
+ /// \brief Checks whether new line pointed by Str is preceded by escape
+ /// sequence.
+ static bool isNewLineEscaped(const char *BufferStart, const char *Str);
+
/// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
/// emit a warning.
static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=310576&r1=310575&r2=310576&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Thu Aug 10 03:06:16 2017
@@ -463,19 +463,15 @@ static const char *findBeginningOfLine(S
const char *BufStart = Buffer.data();
if (Offset >= Buffer.size())
return nullptr;
- const char *StrData = BufStart + Offset;
- if (StrData[0] == '\n' || StrData[0] == '\r')
- return StrData;
-
- const char *LexStart = StrData;
- while (LexStart != BufStart) {
- if (LexStart[0] == '\n' || LexStart[0] == '\r') {
+ const char *LexStart = BufStart + Offset;
+ for (; LexStart != BufStart; --LexStart) {
+ if (isVerticalWhitespace(LexStart[0]) &&
+ !Lexer::isNewLineEscaped(BufStart, LexStart)) {
+ // LexStart should point at first character of logical line.
++LexStart;
break;
}
-
- --LexStart;
}
return LexStart;
}
@@ -487,7 +483,7 @@ static SourceLocation getBeginningOfFile
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
if (LocInfo.first.isInvalid())
return Loc;
-
+
bool Invalid = false;
StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
@@ -499,31 +495,31 @@ static SourceLocation getBeginningOfFile
const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
if (!LexStart || LexStart == StrData)
return Loc;
-
+
// Create a lexer starting at the beginning of this token.
SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
Buffer.end());
TheLexer.SetCommentRetentionState(true);
-
+
// Lex tokens until we find the token that contains the source location.
Token TheTok;
do {
TheLexer.LexFromRawLexer(TheTok);
-
+
if (TheLexer.getBufferLocation() > StrData) {
// Lexing this token has taken the lexer past the source location we're
// looking for. If the current token encompasses our source location,
// return the beginning of that token.
if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
return TheTok.getLocation();
-
+
// We ended up skipping over the source location entirely, which means
// that it points into whitespace. We're done here.
break;
}
} while (TheTok.getKind() != tok::eof);
-
+
// We've passed our source location; just return the original source location.
return Loc;
}
@@ -531,20 +527,20 @@ static SourceLocation getBeginningOfFile
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
const SourceManager &SM,
const LangOptions &LangOpts) {
- if (Loc.isFileID())
- return getBeginningOfFileToken(Loc, SM, LangOpts);
-
- if (!SM.isMacroArgExpansion(Loc))
- return Loc;
-
- SourceLocation FileLoc = SM.getSpellingLoc(Loc);
- SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
- std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
- std::pair<FileID, unsigned> BeginFileLocInfo
- = SM.getDecomposedLoc(BeginFileLoc);
- assert(FileLocInfo.first == BeginFileLocInfo.first &&
- FileLocInfo.second >= BeginFileLocInfo.second);
- return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
+ if (Loc.isFileID())
+ return getBeginningOfFileToken(Loc, SM, LangOpts);
+
+ if (!SM.isMacroArgExpansion(Loc))
+ return Loc;
+
+ SourceLocation FileLoc = SM.getSpellingLoc(Loc);
+ SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
+ std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
+ std::pair<FileID, unsigned> BeginFileLocInfo =
+ SM.getDecomposedLoc(BeginFileLoc);
+ assert(FileLocInfo.first == BeginFileLocInfo.first &&
+ FileLocInfo.second >= BeginFileLocInfo.second);
+ return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}
namespace {
@@ -1032,6 +1028,26 @@ bool Lexer::isIdentifierBodyChar(char c,
return isIdentifierBody(c, LangOpts.DollarIdents);
}
+bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
+ assert(isVerticalWhitespace(Str[0]));
+ if (Str - 1 < BufferStart)
+ return false;
+
+ if ((Str[0] == '\n' && Str[-1] == '\r') ||
+ (Str[0] == '\r' && Str[-1] == '\n')) {
+ if (Str - 2 < BufferStart)
+ return false;
+ --Str;
+ }
+ --Str;
+
+ // Rewind to first non-space character:
+ while (Str > BufferStart && isHorizontalWhitespace(*Str))
+ --Str;
+
+ return *Str == '\\';
+}
+
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
const SourceManager &SM) {
if (Loc.isInvalid() || Loc.isMacroID())
Modified: cfe/trunk/unittests/Lex/LexerTest.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Lex/LexerTest.cpp?rev=310576&r1=310575&r2=310576&view=diff
==============================================================================
--- cfe/trunk/unittests/Lex/LexerTest.cpp (original)
+++ cfe/trunk/unittests/Lex/LexerTest.cpp Thu Aug 10 03:06:16 2017
@@ -420,4 +420,57 @@ TEST_F(LexerTest, DontOverallocateString
#endif
}
+TEST_F(LexerTest, IsNewLineEscapedValid) {
+ auto hasNewLineEscaped = [](const char *S) {
+ return Lexer::isNewLineEscaped(S, S + strlen(S) - 1);
+ };
+
+ EXPECT_TRUE(hasNewLineEscaped("\\\r"));
+ EXPECT_TRUE(hasNewLineEscaped("\\\n"));
+ EXPECT_TRUE(hasNewLineEscaped("\\\r\n"));
+ EXPECT_TRUE(hasNewLineEscaped("\\\n\r"));
+ EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r"));
+ EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r\n"));
+
+ EXPECT_FALSE(hasNewLineEscaped("\\\r\r"));
+ EXPECT_FALSE(hasNewLineEscaped("\\\r\r\n"));
+ EXPECT_FALSE(hasNewLineEscaped("\\\n\n"));
+ EXPECT_FALSE(hasNewLineEscaped("\r"));
+ EXPECT_FALSE(hasNewLineEscaped("\n"));
+ EXPECT_FALSE(hasNewLineEscaped("\r\n"));
+ EXPECT_FALSE(hasNewLineEscaped("\n\r"));
+ EXPECT_FALSE(hasNewLineEscaped("\r\r"));
+ EXPECT_FALSE(hasNewLineEscaped("\n\n"));
+}
+
+TEST_F(LexerTest, GetBeginningOfTokenWithEscapedNewLine) {
+ // Each line should have the same length for
+ // further offset calculation to be more straightforward.
+ const unsigned IdentifierLength = 8;
+ std::string TextToLex = "rabarbar\n"
+ "foo\\\nbar\n"
+ "foo\\\rbar\n"
+ "fo\\\r\nbar\n"
+ "foo\\\n\rba\n";
+ std::vector<tok::TokenKind> ExpectedTokens{5, tok::identifier};
+ std::vector<Token> LexedTokens = CheckLex(TextToLex, ExpectedTokens);
+
+ for (const Token &Tok : LexedTokens) {
+ std::pair<FileID, unsigned> OriginalLocation =
+ SourceMgr.getDecomposedLoc(Tok.getLocation());
+ for (unsigned Offset = 0; Offset < IdentifierLength; ++Offset) {
+ SourceLocation LookupLocation =
+ Tok.getLocation().getLocWithOffset(Offset);
+
+ std::pair<FileID, unsigned> FoundLocation =
+ SourceMgr.getDecomposedExpansionLoc(
+ Lexer::GetBeginningOfToken(LookupLocation, SourceMgr, LangOpts));
+
+ // Check that location returned by the GetBeginningOfToken
+ // is the same as original token location reported by Lexer.
+ EXPECT_EQ(FoundLocation.second, OriginalLocation.second);
+ }
+ }
+}
+
} // anonymous namespace
More information about the cfe-commits
mailing list