[clang] [clang][scandeps] Improve handling of rawstrings. (PR #139504)
Tobias Hieta via cfe-commits
cfe-commits at lists.llvm.org
Mon Jun 23 02:20:19 PDT 2025
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/139504
>From 859723743a51bc75d855b4ee924ea5cf86e26183 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 5 May 2025 11:40:17 +0200
Subject: [PATCH 1/5] [clang][scandeps] Improve handling of rawstrings.
The current parser just checks one step back for a R before each string
to know that it's a rawstring. But the preprocessor is much more
advanced here and can have constructs like:
R\
"str"
And much more. This patch also adds more test coverage for Rawstrings in
the dependencydirectivesscanner.
This was co-authored by Sylvain Audi <sylvain.audi at ubisoft.com>
Fixes #137648
---
clang/lib/Lex/DependencyDirectivesScanner.cpp | 43 ++++++++++++---
clang/test/ClangScanDeps/raw-strings.cpp | 55 +++++++++++++++++++
2 files changed, 91 insertions(+), 7 deletions(-)
create mode 100644 clang/test/ClangScanDeps/raw-strings.cpp
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 1b6b16c561141..bc936ac749a78 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -206,6 +206,24 @@ static void skipOverSpaces(const char *&First, const char *const End) {
++First;
}
+// Move back by one character, skipping escaped newlines (backslash + \n)
+static char previousChar(const char *First, const char *&Current) {
+ assert(Current > First);
+ --Current;
+ while (Current > First + 1 && isVerticalWhitespace(*Current)) {
+ const char PrevChar = *(Current - 1);
+ if (PrevChar == '\\') {
+ Current -= 2; // backslash + (\n or \r)
+ } else if (Current > First + 2 && isVerticalWhitespace(PrevChar) &&
+ PrevChar != *Current && *(Current - 2) == '\\') {
+ Current -= 3; // backslash + (\n\r or \r\n)
+ } else {
+ break;
+ }
+ }
+ return *Current;
+}
+
[[nodiscard]] static bool isRawStringLiteral(const char *First,
const char *Current) {
assert(First <= Current);
@@ -215,25 +233,28 @@ static void skipOverSpaces(const char *&First, const char *const End) {
return false;
// Check for an "R".
- --Current;
- if (*Current != 'R')
+ if (previousChar(First, Current) != 'R')
return false;
- if (First == Current || !isAsciiIdentifierContinue(*--Current))
+ if (First == Current ||
+ !isAsciiIdentifierContinue(previousChar(First, Current)))
return true;
// Check for a prefix of "u", "U", or "L".
if (*Current == 'u' || *Current == 'U' || *Current == 'L')
- return First == Current || !isAsciiIdentifierContinue(*--Current);
+ return First == Current ||
+ !isAsciiIdentifierContinue(previousChar(First, Current));
// Check for a prefix of "u8".
- if (*Current != '8' || First == Current || *Current-- != 'u')
+ if (*Current != '8' || First == Current ||
+ previousChar(First, Current) != 'u')
return false;
- return First == Current || !isAsciiIdentifierContinue(*--Current);
+ return First == Current ||
+ !isAsciiIdentifierContinue(previousChar(First, Current));
}
static void skipRawString(const char *&First, const char *const End) {
assert(First[0] == '"');
- assert(First[-1] == 'R');
+ //assert(First[-1] == 'R');
const char *Last = ++First;
while (Last != End && *Last != '(')
@@ -416,6 +437,14 @@ void Scanner::skipLine(const char *&First, const char *const End) {
continue;
}
+ // Continue on the same line if an EOL is preceded with backslash
+ if (First + 1 < End && *First == '\\') {
+ if (unsigned Len = isEOL(First + 1, End)) {
+ First += 1 + Len;
+ continue;
+ }
+ }
+
// Iterate over comments correctly.
if (*First != '/' || End - First < 2) {
LastTokenPtr = First;
diff --git a/clang/test/ClangScanDeps/raw-strings.cpp b/clang/test/ClangScanDeps/raw-strings.cpp
new file mode 100644
index 0000000000000..5fda4a559c9e3
--- /dev/null
+++ b/clang/test/ClangScanDeps/raw-strings.cpp
@@ -0,0 +1,55 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json
+
+//--- cdb.json.in
+[{
+ "directory": "DIR",
+ "command": "clang -c DIR/tu.c -o DIR/tu.o -IDIR/include",
+ "file": "DIR/tu.c"
+}]
+//--- include/header.h
+//--- include/header2.h
+//--- include/header3.h
+//--- include/header4.h
+//--- tu.c
+#if 0
+R"x()x"
+#endif
+
+#include "header.h"
+
+#if 0
+R"y(";
+#endif
+#include "header2.h"
+
+#if 0
+//")y"
+#endif
+
+#if 0
+R"y(";
+R"z()y";
+#endif
+#include "header3.h"
+#if 0
+//")z"
+#endif
+
+#if 0
+R\
+"y(";
+R"z()y";
+#endif
+#include "header4.h"
+#if 0
+//")z"
+#endif
+
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess | FileCheck %s
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess-dependency-directives | FileCheck %s
+// CHECK: tu.c
+// CHECK-NEXT: header.h
+// CHECK-NEXT: header3.h
+// CHECK-NEXT: header4.h
>From 7baebfba89ab37bfe94ed2743ef6e4af9a932250 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 23 Jun 2025 11:09:08 +0200
Subject: [PATCH 2/5] [clang][lexer] Make Lexer::getEscapedNewLineSize public
This is so that we can use it in the dependency scanner without duplicating the logic there.
---
clang/include/clang/Lex/Lexer.h | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index ca812ba1583fb..0ab192c437040 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -585,6 +585,11 @@ class Lexer : public PreprocessorLexer {
/// sequence.
static bool isNewLineEscaped(const char *BufferStart, const char *Str);
+ /// getEscapedNewLineSize - Return the size of the specified escaped newline,
+ /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
+ /// to this function.
+ static unsigned getEscapedNewLineSize(const char *P);
+
/// Diagnose use of a delimited or named escape sequence.
static void DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc,
bool Named,
@@ -725,11 +730,6 @@ class Lexer : public PreprocessorLexer {
/// method.
SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);
- /// getEscapedNewLineSize - Return the size of the specified escaped newline,
- /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
- /// to this function.
- static unsigned getEscapedNewLineSize(const char *P);
-
/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
>From 63ca11677ab786f77f42aa7dd78871d798f9c4cd Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 23 Jun 2025 11:10:31 +0200
Subject: [PATCH 3/5] Improvements to raw-string parsing and expanding tests
As suggested in the review, use Lexer::getEscapedNewLineSize() instead
of implementing our own logic for it.
---
clang/lib/Lex/DependencyDirectivesScanner.cpp | 18 ++--
clang/test/ClangScanDeps/raw-strings.cpp | 93 +++++++++++++++++++
2 files changed, 104 insertions(+), 7 deletions(-)
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index bc936ac749a78..eb10cfe4cbf45 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -210,13 +210,17 @@ static void skipOverSpaces(const char *&First, const char *const End) {
static char previousChar(const char *First, const char *&Current) {
assert(Current > First);
--Current;
- while (Current > First + 1 && isVerticalWhitespace(*Current)) {
- const char PrevChar = *(Current - 1);
- if (PrevChar == '\\') {
- Current -= 2; // backslash + (\n or \r)
- } else if (Current > First + 2 && isVerticalWhitespace(PrevChar) &&
- PrevChar != *Current && *(Current - 2) == '\\') {
- Current -= 3; // backslash + (\n\r or \r\n)
+ while (Current > First && isVerticalWhitespace(*Current)) {
+ // Check if the previous character is a backslash
+ if (Current > First && *(Current - 1) == '\\') {
+ // Use Lexer's getEscapedNewLineSize to get the size of the escaped newline
+ unsigned EscapeSize = Lexer::getEscapedNewLineSize(Current);
+ if (EscapeSize > 0) {
+ // Skip back over the entire escaped newline sequence (backslash + newline)
+ Current -= (1 + EscapeSize);
+ } else {
+ break;
+ }
} else {
break;
}
diff --git a/clang/test/ClangScanDeps/raw-strings.cpp b/clang/test/ClangScanDeps/raw-strings.cpp
index 5fda4a559c9e3..ff8a986643993 100644
--- a/clang/test/ClangScanDeps/raw-strings.cpp
+++ b/clang/test/ClangScanDeps/raw-strings.cpp
@@ -12,6 +12,16 @@
//--- include/header2.h
//--- include/header3.h
//--- include/header4.h
+//--- include/header5.h
+//--- include/header6.h
+//--- include/header7.h
+//--- include/header8.h
+//--- include/header9.h
+//--- include/header10.h
+//--- include/header11.h
+//--- include/header12.h
+//--- include/header13.h
+//--- include/header14.h
//--- tu.c
#if 0
R"x()x"
@@ -47,9 +57,92 @@ R"z()y";
//")z"
#endif
+// Test u8 prefix with escaped newline
+#if 0
+u8R\
+"prefix(test)prefix"
+#endif
+#include "header5.h"
+
+// Test u prefix with multiple escaped newlines
+#if 0
+uR\
+\
+"multi(test)multi"
+#endif
+#include "header6.h"
+
+// Test U prefix with escaped newline
+#if 0
+UR\
+"upper(test)upper"
+#endif
+#include "header7.h"
+
+// Test L prefix with escaped newline
+#if 0
+LR\
+"wide(test)wide"
+#endif
+#include "header8.h"
+
+// Test escaped newline with \r\n style
+#if 0
+R\
+"crlf(test)crlf"
+#endif
+#include "header9.h"
+
+// Test multiple escaped newlines in different positions
+#if 0
+u\
+8\
+R\
+"complex(test)complex"
+#endif
+#include "header10.h"
+
+// Test raw string that should NOT be treated as raw (no R prefix due to identifier continuation)
+#if 0
+identifierR"notraw(test)notraw"
+#endif
+#include "header11.h"
+
+// Test raw string with whitespace before escaped newline
+#if 0
+R \
+"whitespace(test)whitespace"
+#endif
+#include "header12.h"
+
+// Test nested raw strings in disabled code
+#if 0
+R"outer(
+ R"inner(content)inner"
+)outer"
+#endif
+#include "header13.h"
+
+// Test raw string with empty delimiter
+#if 0
+R\
+"(empty delimiter)";
+#endif
+#include "header14.h"
+
// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess | FileCheck %s
// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess-dependency-directives | FileCheck %s
// CHECK: tu.c
// CHECK-NEXT: header.h
// CHECK-NEXT: header3.h
// CHECK-NEXT: header4.h
+// CHECK-NEXT: header5.h
+// CHECK-NEXT: header6.h
+// CHECK-NEXT: header7.h
+// CHECK-NEXT: header8.h
+// CHECK-NEXT: header9.h
+// CHECK-NEXT: header10.h
+// CHECK-NEXT: header11.h
+// CHECK-NEXT: header12.h
+// CHECK-NEXT: header13.h
+// CHECK-NEXT: header14.h
>From c3561b0aa34f1b693fa86f2b69b09ea97afedb06 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 23 Jun 2025 11:13:07 +0200
Subject: [PATCH 4/5] Remove assertion that's no longer valid here
---
clang/lib/Lex/DependencyDirectivesScanner.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index eb10cfe4cbf45..56dbb3fbcad6e 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -258,7 +258,6 @@ static char previousChar(const char *First, const char *&Current) {
static void skipRawString(const char *&First, const char *const End) {
assert(First[0] == '"');
- //assert(First[-1] == 'R');
const char *Last = ++First;
while (Last != End && *Last != '(')
>From bacc9c20f2bec72cfc0a80bccd8fb7414a816659 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias.hieta at ubisoft.com>
Date: Mon, 23 Jun 2025 11:20:01 +0200
Subject: [PATCH 5/5] clang-format
---
clang/lib/Lex/DependencyDirectivesScanner.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 56dbb3fbcad6e..a862abcc44b38 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -213,10 +213,12 @@ static char previousChar(const char *First, const char *&Current) {
while (Current > First && isVerticalWhitespace(*Current)) {
// Check if the previous character is a backslash
if (Current > First && *(Current - 1) == '\\') {
- // Use Lexer's getEscapedNewLineSize to get the size of the escaped newline
+ // Use Lexer's getEscapedNewLineSize to get the size of the escaped
+ // newline
unsigned EscapeSize = Lexer::getEscapedNewLineSize(Current);
if (EscapeSize > 0) {
- // Skip back over the entire escaped newline sequence (backslash + newline)
+ // Skip back over the entire escaped newline sequence (backslash +
+ // newline)
Current -= (1 + EscapeSize);
} else {
break;
More information about the cfe-commits
mailing list