[clang] f7e8be7 - Skip escaped newlines before checking for whitespace in Lexer::getRawToken. (#117548)

Thu Dec 5 06:37:50 PST 2024

Author: Samira Bazuzi
Date: 2024-12-05T09:37:46-05:00
New Revision: f7e8be7c66b53a126c8cba9ac81b5b77d873aa1e

URL: https://github.com/llvm/llvm-project/commit/f7e8be7c66b53a126c8cba9ac81b5b77d873aa1e
DIFF: https://github.com/llvm/llvm-project/commit/f7e8be7c66b53a126c8cba9ac81b5b77d873aa1e.diff

LOG: Skip escaped newlines before checking for whitespace in Lexer::getRawToken. (#117548)

The Lexer used in getRawToken is not told to keep whitespace, so when it
skips over escaped newlines, it also ignores whitespace, regardless of
getRawToken's IgnoreWhiteSpace parameter. 

Instead of letting this case fall through to lexing, check
for whitespace after skipping over any escaped newlines.

Added: 
    

Modified: 
    clang/lib/Lex/Lexer.cpp
    clang/unittests/Lex/LexerTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index e58c8bc72ae5b3..72364500a48f9f 100644

--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -527,7 +527,7 @@ bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
 
   const char *StrData = Buffer.data()+LocInfo.second;
 
-  if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
+  if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
     return true;
 
   // Create a lexer starting at the beginning of this token.

diff  --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index 47aa2c131a304d..aead7fb899d0a8 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -652,6 +652,38 @@ TEST_F(LexerTest, RawAndNormalLexSameForLineComments) {
   EXPECT_TRUE(ToksView.empty());
 }
 
+TEST_F(LexerTest, GetRawTokenOnEscapedNewLineChecksWhitespace) {
+  const llvm::StringLiteral Source = R"cc(
+  #define ONE \
+  1
+
+  int i = ONE;
+  )cc";
+  std::vector<Token> Toks =
+      CheckLex(Source, {tok::kw_int, tok::identifier, tok::equal,
+                        tok::numeric_constant, tok::semi});
+
+  // Set up by getting the raw token for the `1` in the macro definition.
+  const Token &OneExpanded = Toks[3];
+  Token Tok;
+  ASSERT_FALSE(
+      Lexer::getRawToken(OneExpanded.getLocation(), Tok, SourceMgr, LangOpts));
+  // The `ONE`.
+  ASSERT_EQ(Tok.getKind(), tok::raw_identifier);
+  ASSERT_FALSE(
+      Lexer::getRawToken(SourceMgr.getSpellingLoc(OneExpanded.getLocation()),
+                         Tok, SourceMgr, LangOpts));
+  // The `1` in the macro definition.
+  ASSERT_EQ(Tok.getKind(), tok::numeric_constant);
+
+  // Go back 4 characters: two spaces, one newline, and the backslash.
+  SourceLocation EscapedNewLineLoc = Tok.getLocation().getLocWithOffset(-4);
+  // Expect true (=failure) because the whitespace immediately after the
+  // escaped newline is not ignored.
+  EXPECT_TRUE(Lexer::getRawToken(EscapedNewLineLoc, Tok, SourceMgr, LangOpts,
+                                 /*IgnoreWhiteSpace=*/false));
+}
+
 TEST(LexerPreambleTest, PreambleBounds) {
   std::vector<std::string> Cases = {
       R"cc([[