[clang] [clang][scandeps] Improve handling of rawstrings. (PR #139504)

Tobias Hieta via cfe-commits cfe-commits at lists.llvm.org
Mon Jun 23 02:13:29 PDT 2025


https://github.com/tru updated https://github.com/llvm/llvm-project/pull/139504

>From 859723743a51bc75d855b4ee924ea5cf86e26183 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 5 May 2025 11:40:17 +0200
Subject: [PATCH 1/4] [clang][scandeps] Improve handling of rawstrings.

The current parser just checks one step back for a R before each string
to know that it's a rawstring. But the preprocessor is much more
advanced here and can have constructs like:

R\
"str"

And much more. This patch also adds more test coverage for Rawstrings in
the dependencydirectivesscanner.

This was co-authored by Sylvain Audi <sylvain.audi at ubisoft.com>

Fixes #137648
---
 clang/lib/Lex/DependencyDirectivesScanner.cpp | 43 ++++++++++++---
 clang/test/ClangScanDeps/raw-strings.cpp      | 55 +++++++++++++++++++
 2 files changed, 91 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/ClangScanDeps/raw-strings.cpp

diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 1b6b16c561141..bc936ac749a78 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -206,6 +206,24 @@ static void skipOverSpaces(const char *&First, const char *const End) {
     ++First;
 }
 
+// Move back by one character, skipping escaped newlines (backslash + \n)
+static char previousChar(const char *First, const char *&Current) {
+  assert(Current > First);
+  --Current;
+  while (Current > First + 1 && isVerticalWhitespace(*Current)) {
+    const char PrevChar = *(Current - 1);
+    if (PrevChar == '\\') {
+      Current -= 2; // backslash + (\n or \r)
+    } else if (Current > First + 2 && isVerticalWhitespace(PrevChar) &&
+               PrevChar != *Current && *(Current - 2) == '\\') {
+      Current -= 3; // backslash + (\n\r or \r\n)
+    } else {
+      break;
+    }
+  }
+  return *Current;
+}
+
 [[nodiscard]] static bool isRawStringLiteral(const char *First,
                                              const char *Current) {
   assert(First <= Current);
@@ -215,25 +233,28 @@ static void skipOverSpaces(const char *&First, const char *const End) {
     return false;
 
   // Check for an "R".
-  --Current;
-  if (*Current != 'R')
+  if (previousChar(First, Current) != 'R')
     return false;
-  if (First == Current || !isAsciiIdentifierContinue(*--Current))
+  if (First == Current ||
+      !isAsciiIdentifierContinue(previousChar(First, Current)))
     return true;
 
   // Check for a prefix of "u", "U", or "L".
   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
-    return First == Current || !isAsciiIdentifierContinue(*--Current);
+    return First == Current ||
+           !isAsciiIdentifierContinue(previousChar(First, Current));
 
   // Check for a prefix of "u8".
-  if (*Current != '8' || First == Current || *Current-- != 'u')
+  if (*Current != '8' || First == Current ||
+      previousChar(First, Current) != 'u')
     return false;
-  return First == Current || !isAsciiIdentifierContinue(*--Current);
+  return First == Current ||
+         !isAsciiIdentifierContinue(previousChar(First, Current));
 }
 
 static void skipRawString(const char *&First, const char *const End) {
   assert(First[0] == '"');
-  assert(First[-1] == 'R');
+  //assert(First[-1] == 'R');
 
   const char *Last = ++First;
   while (Last != End && *Last != '(')
@@ -416,6 +437,14 @@ void Scanner::skipLine(const char *&First, const char *const End) {
         continue;
       }
 
+      // Continue on the same line if an EOL is preceded with backslash
+      if (First + 1 < End && *First == '\\') {
+        if (unsigned Len = isEOL(First + 1, End)) {
+          First += 1 + Len;
+          continue;
+        }
+      }
+
       // Iterate over comments correctly.
       if (*First != '/' || End - First < 2) {
         LastTokenPtr = First;
diff --git a/clang/test/ClangScanDeps/raw-strings.cpp b/clang/test/ClangScanDeps/raw-strings.cpp
new file mode 100644
index 0000000000000..5fda4a559c9e3
--- /dev/null
+++ b/clang/test/ClangScanDeps/raw-strings.cpp
@@ -0,0 +1,55 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json
+
+//--- cdb.json.in
+[{
+    "directory": "DIR",
+    "command": "clang -c DIR/tu.c -o DIR/tu.o -IDIR/include",
+    "file": "DIR/tu.c"
+}]
+//--- include/header.h
+//--- include/header2.h
+//--- include/header3.h
+//--- include/header4.h
+//--- tu.c
+#if 0
+R"x()x"
+#endif
+
+#include "header.h"
+
+#if 0
+R"y(";
+#endif
+#include "header2.h"
+
+#if 0
+//")y"
+#endif
+
+#if 0
+R"y(";
+R"z()y";
+#endif
+#include "header3.h"
+#if 0
+//")z"
+#endif
+
+#if 0
+R\
+"y(";
+R"z()y";
+#endif
+#include "header4.h"
+#if 0
+//")z"
+#endif
+
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess | FileCheck %s
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess-dependency-directives | FileCheck %s
+// CHECK: tu.c
+// CHECK-NEXT: header.h
+// CHECK-NEXT: header3.h
+// CHECK-NEXT: header4.h

>From 7baebfba89ab37bfe94ed2743ef6e4af9a932250 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 23 Jun 2025 11:09:08 +0200
Subject: [PATCH 2/4] [clang][lexer] Make Lexer::getEscapedNewLineSize public

This is so that we can use it in the dependency scanner without duplicating the logic there.
---
 clang/include/clang/Lex/Lexer.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index ca812ba1583fb..0ab192c437040 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -585,6 +585,11 @@ class Lexer : public PreprocessorLexer {
   /// sequence.
   static bool isNewLineEscaped(const char *BufferStart, const char *Str);
 
+  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
+  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
+  /// to this function.
+  static unsigned getEscapedNewLineSize(const char *P);
+
   /// Diagnose use of a delimited or named escape sequence.
   static void DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc,
                                                      bool Named,
@@ -725,11 +730,6 @@ class Lexer : public PreprocessorLexer {
   /// method.
   SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);
 
-  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
-  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
-  /// to this function.
-  static unsigned getEscapedNewLineSize(const char *P);
-
   /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
   /// them), skip over them and return the first non-escaped-newline found,
   /// otherwise return P.

>From 63ca11677ab786f77f42aa7dd78871d798f9c4cd Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 23 Jun 2025 11:10:31 +0200
Subject: [PATCH 3/4] Improvements to raw-string parsing and expanding tests

As suggested in the review, use Lexer::getEscapedNewLineSize() instead
of implementing our own logic for it.
---
 clang/lib/Lex/DependencyDirectivesScanner.cpp | 18 ++--
 clang/test/ClangScanDeps/raw-strings.cpp      | 93 +++++++++++++++++++
 2 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index bc936ac749a78..eb10cfe4cbf45 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -210,13 +210,17 @@ static void skipOverSpaces(const char *&First, const char *const End) {
 static char previousChar(const char *First, const char *&Current) {
   assert(Current > First);
   --Current;
-  while (Current > First + 1 && isVerticalWhitespace(*Current)) {
-    const char PrevChar = *(Current - 1);
-    if (PrevChar == '\\') {
-      Current -= 2; // backslash + (\n or \r)
-    } else if (Current > First + 2 && isVerticalWhitespace(PrevChar) &&
-               PrevChar != *Current && *(Current - 2) == '\\') {
-      Current -= 3; // backslash + (\n\r or \r\n)
+  while (Current > First && isVerticalWhitespace(*Current)) {
+    // Check if the previous character is a backslash
+    if (Current > First && *(Current - 1) == '\\') {
+      // Use Lexer's getEscapedNewLineSize to get the size of the escaped newline
+      unsigned EscapeSize = Lexer::getEscapedNewLineSize(Current);
+      if (EscapeSize > 0) {
+        // Skip back over the entire escaped newline sequence (backslash + newline)
+        Current -= (1 + EscapeSize);
+      } else {
+        break;
+      }
     } else {
       break;
     }
diff --git a/clang/test/ClangScanDeps/raw-strings.cpp b/clang/test/ClangScanDeps/raw-strings.cpp
index 5fda4a559c9e3..ff8a986643993 100644
--- a/clang/test/ClangScanDeps/raw-strings.cpp
+++ b/clang/test/ClangScanDeps/raw-strings.cpp
@@ -12,6 +12,16 @@
 //--- include/header2.h
 //--- include/header3.h
 //--- include/header4.h
+//--- include/header5.h
+//--- include/header6.h
+//--- include/header7.h
+//--- include/header8.h
+//--- include/header9.h
+//--- include/header10.h
+//--- include/header11.h
+//--- include/header12.h
+//--- include/header13.h
+//--- include/header14.h
 //--- tu.c
 #if 0
 R"x()x"
@@ -47,9 +57,92 @@ R"z()y";
 //")z"
 #endif
 
+// Test u8 prefix with escaped newline
+#if 0
+u8R\
+"prefix(test)prefix"
+#endif
+#include "header5.h"
+
+// Test u prefix with multiple escaped newlines
+#if 0
+uR\
+\
+"multi(test)multi"
+#endif
+#include "header6.h"
+
+// Test U prefix with escaped newline
+#if 0
+UR\
+"upper(test)upper"
+#endif
+#include "header7.h"
+
+// Test L prefix with escaped newline
+#if 0
+LR\
+"wide(test)wide"
+#endif
+#include "header8.h"
+
+// Test escaped newline with \r\n style
+#if 0
+R\
+"crlf(test)crlf"
+#endif
+#include "header9.h"
+
+// Test multiple escaped newlines in different positions
+#if 0
+u\
+8\
+R\
+"complex(test)complex"
+#endif
+#include "header10.h"
+
+// Test raw string that should NOT be treated as raw (no R prefix due to identifier continuation)
+#if 0
+identifierR"notraw(test)notraw"
+#endif
+#include "header11.h"
+
+// Test raw string with whitespace before escaped newline
+#if 0
+R \
+"whitespace(test)whitespace"
+#endif
+#include "header12.h"
+
+// Test nested raw strings in disabled code
+#if 0
+R"outer(
+    R"inner(content)inner"
+)outer"
+#endif
+#include "header13.h"
+
+// Test raw string with empty delimiter
+#if 0
+R\
+"(empty delimiter)";
+#endif
+#include "header14.h"
+
 // RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess | FileCheck %s
 // RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess-dependency-directives | FileCheck %s
 // CHECK: tu.c
 // CHECK-NEXT: header.h
 // CHECK-NEXT: header3.h
 // CHECK-NEXT: header4.h
+// CHECK-NEXT: header5.h
+// CHECK-NEXT: header6.h
+// CHECK-NEXT: header7.h
+// CHECK-NEXT: header8.h
+// CHECK-NEXT: header9.h
+// CHECK-NEXT: header10.h
+// CHECK-NEXT: header11.h
+// CHECK-NEXT: header12.h
+// CHECK-NEXT: header13.h
+// CHECK-NEXT: header14.h

>From c3561b0aa34f1b693fa86f2b69b09ea97afedb06 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 23 Jun 2025 11:13:07 +0200
Subject: [PATCH 4/4] Remove assertion that's no longer valid here

---
 clang/lib/Lex/DependencyDirectivesScanner.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index eb10cfe4cbf45..56dbb3fbcad6e 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -258,7 +258,6 @@ static char previousChar(const char *First, const char *&Current) {
 
 static void skipRawString(const char *&First, const char *const End) {
   assert(First[0] == '"');
-  //assert(First[-1] == 'R');
 
   const char *Last = ++First;
   while (Last != End && *Last != '(')



More information about the cfe-commits mailing list