r310576 - [Lexer] Finding beginning of token with escaped new line

Thu Aug 10 14:00:18 PDT 2017

On 10 August 2017 at 03:06, Alexander Kornienko via cfe-commits <
cfe-commits at lists.llvm.org> wrote:

> Author: alexfh
> Date: Thu Aug 10 03:06:16 2017
> New Revision: 310576
>
> URL: http://llvm.org/viewvc/llvm-project?rev=310576&view=rev
> Log:
> [Lexer] Finding beginning of token with escaped new line
>
> Summary:
> Lexer::GetBeginningOfToken produced invalid location when
> backtracking across escaped new lines.
>
> This fixes PR26228
>
> Reviewers: akyrtzi, alexfh, rsmith, doug.gregor
>
> Reviewed By: alexfh
>
> Subscribers: alexfh, cfe-commits
>
> Patch by Paweł Żukowski!
>
> Differential Revision: https://reviews.llvm.org/D30748
>
> Modified:
>     cfe/trunk/include/clang/Lex/Lexer.h
>     cfe/trunk/lib/Lex/Lexer.cpp
>     cfe/trunk/unittests/Lex/LexerTest.cpp
>
> Modified: cfe/trunk/include/clang/Lex/Lexer.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=310576&r1=310575&r2=310576&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/Lexer.h (original)
> +++ cfe/trunk/include/clang/Lex/Lexer.h Thu Aug 10 03:06:16 2017
> @@ -463,6 +463,10 @@ public:
>    /// \brief Returns true if the given character could appear in an identifier.
>    static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
>
> +  /// \brief Checks whether new line pointed by Str is preceded by escape
> +  /// sequence.
> +  static bool isNewLineEscaped(const char *BufferStart, const char *Str);
> +
>    /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
>    /// emit a warning.
>    static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
>
> Modified: cfe/trunk/lib/Lex/Lexer.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=310576&r1=310575&r2=310576&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/Lexer.cpp (original)
> +++ cfe/trunk/lib/Lex/Lexer.cpp Thu Aug 10 03:06:16 2017
> @@ -463,19 +463,15 @@ static const char *findBeginningOfLine(S
>    const char *BufStart = Buffer.data();
>    if (Offset >= Buffer.size())
>      return nullptr;
> -  const char *StrData = BufStart + Offset;
>
> -  if (StrData[0] == '\n' || StrData[0] == '\r')
> -    return StrData;
> -
> -  const char *LexStart = StrData;
> -  while (LexStart != BufStart) {
> -    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
> +  const char *LexStart = BufStart + Offset;
> +  for (; LexStart != BufStart; --LexStart) {
> +    if (isVerticalWhitespace(LexStart[0]) &&
> +        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
> +      // LexStart should point at first character of logical line.
>        ++LexStart;
>        break;
>      }
> -
> -    --LexStart;
>    }
>    return LexStart;
>  }
> @@ -487,7 +483,7 @@ static SourceLocation getBeginningOfFile
>    std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
>    if (LocInfo.first.isInvalid())
>      return Loc;
> -
> +
>    bool Invalid = false;
>    StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
>    if (Invalid)
> @@ -499,31 +495,31 @@ static SourceLocation getBeginningOfFile
>    const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
>    if (!LexStart || LexStart == StrData)
>      return Loc;
> -
> +
>    // Create a lexer starting at the beginning of this token.
>    SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
>    Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
>                   Buffer.end());
>    TheLexer.SetCommentRetentionState(true);
> -
> +
>    // Lex tokens until we find the token that contains the source location.
>    Token TheTok;
>    do {
>      TheLexer.LexFromRawLexer(TheTok);
> -
> +
>      if (TheLexer.getBufferLocation() > StrData) {
>        // Lexing this token has taken the lexer past the source location we're
>        // looking for. If the current token encompasses our source location,
>        // return the beginning of that token.
>        if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
>          return TheTok.getLocation();
> -
> +
>        // We ended up skipping over the source location entirely, which means
>        // that it points into whitespace. We're done here.
>        break;
>      }
>    } while (TheTok.getKind() != tok::eof);
> -
> +
>    // We've passed our source location; just return the original source location.
>    return Loc;
>  }
> @@ -531,20 +527,20 @@ static SourceLocation getBeginningOfFile
>  SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
>                                            const SourceManager &SM,
>                                            const LangOptions &LangOpts) {
> - if (Loc.isFileID())
> -   return getBeginningOfFileToken(Loc, SM, LangOpts);
> -
> - if (!SM.isMacroArgExpansion(Loc))
> -   return Loc;
> -
> - SourceLocation FileLoc = SM.getSpellingLoc(Loc);
> - SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
> - std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
> - std::pair<FileID, unsigned> BeginFileLocInfo
> -   = SM.getDecomposedLoc(BeginFileLoc);
> - assert(FileLocInfo.first == BeginFileLocInfo.first &&
> -        FileLocInfo.second >= BeginFileLocInfo.second);
> - return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
> +  if (Loc.isFileID())
> +    return getBeginningOfFileToken(Loc, SM, LangOpts);
> +
> +  if (!SM.isMacroArgExpansion(Loc))
> +    return Loc;
> +
> +  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
> +  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
> +  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
> +  std::pair<FileID, unsigned> BeginFileLocInfo =
> +      SM.getDecomposedLoc(BeginFileLoc);
> +  assert(FileLocInfo.first == BeginFileLocInfo.first &&
> +         FileLocInfo.second >= BeginFileLocInfo.second);
> +  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
>  }
>
>  namespace {
> @@ -1032,6 +1028,26 @@ bool Lexer::isIdentifierBodyChar(char c,
>    return isIdentifierBody(c, LangOpts.DollarIdents);
>  }
>
> +bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
> +  assert(isVerticalWhitespace(Str[0]));
> +  if (Str - 1 < BufferStart)
> +    return false;
> +
> +  if ((Str[0] == '\n' && Str[-1] == '\r') ||
> +      (Str[0] == '\r' && Str[-1] == '\n')) {
> +    if (Str - 2 < BufferStart)
> +      return false;
> +    --Str;
> +  }
> +  --Str;
> +
> +  // Rewind to first non-space character:
> +  while (Str > BufferStart && isHorizontalWhitespace(*Str))
> +    --Str;
> +
> +  return *Str == '\\';
>

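When trigraphs are enabled, "??/" (the trigraph for a backslash) can also be used to escape a newline; the check above only looks for a literal '\\' and so does not handle that case.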

> +}
> +
>  StringRef Lexer::getIndentationForLine(SourceLocation Loc,
>                                         const SourceManager &SM) {
>    if (Loc.isInvalid() || Loc.isMacroID())
>
> Modified: cfe/trunk/unittests/Lex/LexerTest.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Lex/LexerTest.cpp?rev=310576&r1=310575&r2=310576&view=diff
> ==============================================================================
> --- cfe/trunk/unittests/Lex/LexerTest.cpp (original)
> +++ cfe/trunk/unittests/Lex/LexerTest.cpp Thu Aug 10 03:06:16 2017
> @@ -420,4 +420,57 @@ TEST_F(LexerTest, DontOverallocateString
>  #endif
>  }
>
> +TEST_F(LexerTest, IsNewLineEscapedValid) {
> +  auto hasNewLineEscaped = [](const char *S) {
> +    return Lexer::isNewLineEscaped(S, S + strlen(S) - 1);
> +  };
> +
> +  EXPECT_TRUE(hasNewLineEscaped("\\\r"));
> +  EXPECT_TRUE(hasNewLineEscaped("\\\n"));
> +  EXPECT_TRUE(hasNewLineEscaped("\\\r\n"));
> +  EXPECT_TRUE(hasNewLineEscaped("\\\n\r"));
> +  EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r"));
> +  EXPECT_TRUE(hasNewLineEscaped("\\ \t\v\f\r\n"));
> +
> +  EXPECT_FALSE(hasNewLineEscaped("\\\r\r"));
> +  EXPECT_FALSE(hasNewLineEscaped("\\\r\r\n"));
> +  EXPECT_FALSE(hasNewLineEscaped("\\\n\n"));
> +  EXPECT_FALSE(hasNewLineEscaped("\r"));
> +  EXPECT_FALSE(hasNewLineEscaped("\n"));
> +  EXPECT_FALSE(hasNewLineEscaped("\r\n"));
> +  EXPECT_FALSE(hasNewLineEscaped("\n\r"));
> +  EXPECT_FALSE(hasNewLineEscaped("\r\r"));
> +  EXPECT_FALSE(hasNewLineEscaped("\n\n"));
> +}
> +
> +TEST_F(LexerTest, GetBeginningOfTokenWithEscapedNewLine) {
> +  // Each line should have the same length for
> +  // further offset calculation to be more straightforward.
> +  const unsigned IdentifierLength = 8;
> +  std::string TextToLex = "rabarbar\n"
> +                          "foo\\\nbar\n"
> +                          "foo\\\rbar\n"
> +                          "fo\\\r\nbar\n"
> +                          "foo\\\n\rba\n";
> +  std::vector<tok::TokenKind> ExpectedTokens{5, tok::identifier};
> +  std::vector<Token> LexedTokens = CheckLex(TextToLex, ExpectedTokens);
> +
> +  for (const Token &Tok : LexedTokens) {
> +    std::pair<FileID, unsigned> OriginalLocation =
> +        SourceMgr.getDecomposedLoc(Tok.getLocation());
> +    for (unsigned Offset = 0; Offset < IdentifierLength; ++Offset) {
> +      SourceLocation LookupLocation =
> +          Tok.getLocation().getLocWithOffset(Offset);
> +
> +      std::pair<FileID, unsigned> FoundLocation =
> +          SourceMgr.getDecomposedExpansionLoc(
> +              Lexer::GetBeginningOfToken(LookupLocation, SourceMgr, LangOpts));
> +
> +      // Check that location returned by the GetBeginningOfToken
> +      // is the same as original token location reported by Lexer.
> +      EXPECT_EQ(FoundLocation.second, OriginalLocation.second);
> +    }
> +  }
> +}
> +
>  } // anonymous namespace
>
>
> _______________________________________________
> cfe-commits mailing list
> cfe-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20170810/3f301b39/attachment-0001.html>