[PATCH] D30748: [Lexer] Finding beginning of token with escaped new line

Wed Mar 8 10:29:15 PST 2017

idlecode created this revision.

Lexer::GetBeginningOfToken produced an invalid location when
backtracking across escaped new lines.

This fixes PR26228.


https://reviews.llvm.org/D30748

Files:
  lib/Lex/Lexer.cpp
  unittests/Lex/LexerTest.cpp


Index: unittests/Lex/LexerTest.cpp
===================================================================

--- unittests/Lex/LexerTest.cpp
+++ unittests/Lex/LexerTest.cpp
@@ -380,4 +380,36 @@
   EXPECT_EQ(SourceMgr.getFileIDSize(SourceMgr.getFileID(helper1ArgLoc)), 8U);
 }
 
+TEST_F(LexerTest, GetBeginningOfTokenWithEscapedNewLine) {
+  // Five identifiers, one per logical line. Each of them occupies exactly
+  // eight characters of the buffer (escaped new lines included), so a single
+  // offset range can be used to probe every token.
+  const auto IdentifierLength = 8;
+
+  std::string Source = "rabarbar\n"
+                       "foo\\\nbar\n"
+                       "foo\\\rbar\n"
+                       "fo\\\r\nbar\n"
+                       "foo\\\n\rba\n";
+  std::vector<tok::TokenKind> ExpectedTokens{5, tok::identifier};
+
+  auto Tokens = CheckLex(Source, ExpectedTokens);
+
+  for (const auto &Tok : Tokens) {
+    const SourceLocation TokBegin = Tok.getLocation();
+    const auto ExpectedOffset = SourceMgr.getDecomposedLoc(TokBegin).second;
+
+    // Whichever character of the token we start from, GetBeginningOfToken
+    // must come back to the location the Lexer reported for that token.
+    for (unsigned Delta = 0; Delta < IdentifierLength; ++Delta) {
+      SourceLocation Inside = TokBegin.getLocWithOffset(Delta);
+      SourceLocation Begin =
+          Lexer::GetBeginningOfToken(Inside, SourceMgr, LangOpts);
+      auto Found = SourceMgr.getDecomposedExpansionLoc(Begin);
+      EXPECT_EQ(Found.second, ExpectedOffset);
+    }
+  }
+}
+
+
 } // anonymous namespace
Index: lib/Lex/Lexer.cpp
===================================================================
--- lib/Lex/Lexer.cpp
+++ lib/Lex/Lexer.cpp
@@ -452,6 +452,13 @@
   return false;
 }
 
+/// \brief Check if the new line pointed to by Str is escaped.
+///
+/// A new line is escaped when it is preceded by a backslash, optionally
+/// separated from it by horizontal whitespace. "\r\n" and "\n\r" count as a
+/// single new line, so Str may point at either character of such a pair.
+///
+/// \param BufferStart first character of the buffer; nothing before it is
+///        ever read.
+/// \param Str position of the new line character inside the buffer.
+/// \returns true if a backslash escapes the new line at Str, false otherwise
+///          (including when Str does not point at a new line character).
+static bool isNewLineEscaped(const char *BufferStart, const char *Str) {
+  if (!isVerticalWhitespace(Str[0]) || Str == BufferStart)
+    return false;
+
+  // Str may be the second character of a two-character new line; step onto
+  // the first one. Two identical characters are two separate new lines.
+  if (isVerticalWhitespace(Str[-1]) && Str[-1] != Str[0]) {
+    --Str;
+    if (Str == BufferStart)
+      return false;
+  }
+  --Str;
+
+  // Skip horizontal whitespace only: walking past an earlier new line would
+  // report a blank line that follows a continuation as escaped as well.
+  while (Str > BufferStart && isHorizontalWhitespace(*Str))
+    --Str;
+
+  return *Str == '\\';
+}
+
 static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                               const SourceManager &SM,
                                               const LangOptions &LangOpts) {
@@ -467,22 +474,23 @@
 
   // Back up from the current location until we hit the beginning of a line
   // (or the buffer). We'll relex from that point.
-  const char *BufStart = Buffer.data();
   if (LocInfo.second >= Buffer.size())
     return Loc;
   
-  const char *StrData = BufStart+LocInfo.second;
-  if (StrData[0] == '\n' || StrData[0] == '\r')
-    return Loc;
+  const char *BufStart = Buffer.data();
+  const char *StrData = BufStart + LocInfo.second;
 
   const char *LexStart = StrData;
-  while (LexStart != BufStart) {
-    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
-      ++LexStart;
-      break;
-    }
+  for (; LexStart != BufStart; --LexStart) {
+    if (!isVerticalWhitespace(LexStart[0]))
+      continue;
 
-    --LexStart;
+    if (isNewLineEscaped(BufStart, LexStart))
+      continue;
+
+    // LexStart should point at first character of logical line.
+    ++LexStart;
+    break;
   }
   
   // Create a lexer starting at the beginning of this token.


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D30748.91047.patch
Type: text/x-patch
Size: 3034 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20170308/7758d74c/attachment.bin>