[clang] clang-format: Add splitting for strings with user-defined suffixes (PR #167150)
via cfe-commits
cfe-commits at lists.llvm.org
Sat Nov 8 07:59:57 PST 2025
https://github.com/shivrm created https://github.com/llvm/llvm-project/pull/167150
String literals with user-defined suffixes can now be split between lines.
- Uses regex to identify user-defined suffixes
- The suffix must be placed only on the last line, so I added `ContinuationPrefix` and `ContinuationPostfix` members to `BreakableStringLiteral`; these allow the last line to use a different postfix from all the other lines.
- `ContinuationPrefix` currently has no visible effect: the call sites in this patch set it to the same value as `Prefix`, so prefixes are still repeated on every line when splitting. I've kept it for completeness.
- Adds a new unit test for splitting strings with user-defined suffixes.
Fixes #165617
>From d211fd1030494d0db230ccd608f935edc5af8406 Mon Sep 17 00:00:00 2001
From: shivrm <shivrm at proton.me>
Date: Fri, 7 Nov 2025 17:02:47 +0530
Subject: [PATCH 1/4] Add splitting for user-defined suffixes
---
clang/lib/Format/BreakableToken.cpp | 21 ++++++++---
clang/lib/Format/BreakableToken.h | 12 +++++--
clang/lib/Format/ContinuationIndenter.cpp | 44 +++++++++++++++++------
3 files changed, 61 insertions(+), 16 deletions(-)
diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp
index 994a427517ffc..ff9f2f10ffac0 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -253,10 +253,13 @@ unsigned BreakableStringLiteral::getContentStartColumn(unsigned LineIndex,
BreakableStringLiteral::BreakableStringLiteral(
const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
- StringRef Postfix, unsigned UnbreakableTailLength, bool InPPDirective,
- encoding::Encoding Encoding, const FormatStyle &Style)
+ StringRef Postfix, StringRef ContinuationPrefix,
+ StringRef ContinuationPostfix, unsigned UnbreakableTailLength,
+ bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
: BreakableToken(Tok, InPPDirective, Encoding, Style),
StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix),
+ ContinuationPrefix(ContinuationPrefix),
+ ContinuationPostfix(ContinuationPostfix),
UnbreakableTailLength(UnbreakableTailLength) {
assert(Tok.TokenText.starts_with(Prefix) && Tok.TokenText.ends_with(Postfix));
Line = Tok.TokenText.substr(
@@ -274,9 +277,15 @@ void BreakableStringLiteral::insertBreak(unsigned LineIndex,
unsigned TailOffset, Split Split,
unsigned ContentIndent,
WhitespaceManager &Whitespaces) const {
+
+ const unsigned SplitEnd = TailOffset + Split.first + Split.second;
+ const bool IsLastFragment = SplitEnd >= Line.size() - UnbreakableTailLength;
+
+ StringRef LocalPostfix = (IsLastFragment) ? Postfix : ContinuationPostfix;
+
Whitespaces.replaceWhitespaceInToken(
- Tok, Prefix.size() + TailOffset + Split.first, Split.second, Postfix,
- Prefix, InPPDirective, 1, StartColumn);
+ Tok, ContinuationPrefix.size() + TailOffset + Split.first, Split.second,
+ LocalPostfix, ContinuationPrefix, InPPDirective, 1, StartColumn);
}
BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators(
@@ -288,6 +297,10 @@ BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators(
: QuoteStyle == AtDoubleQuotes ? "@\""
: "\"",
/*Postfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
+ /*ContinuationPrefix=*/QuoteStyle == SingleQuotes ? "'"
+ : QuoteStyle == AtDoubleQuotes ? "@\""
+ : "\"",
+ /*ContinuationPostfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
UnbreakableTailLength, InPPDirective, Encoding, Style),
BracesNeeded(Tok.isNot(TT_StringInConcatenation)),
QuoteStyle(QuoteStyle) {
diff --git a/clang/lib/Format/BreakableToken.h b/clang/lib/Format/BreakableToken.h
index 45c00b35fd01e..2ee37d3e0e059 100644
--- a/clang/lib/Format/BreakableToken.h
+++ b/clang/lib/Format/BreakableToken.h
@@ -252,6 +252,8 @@ class BreakableStringLiteral : public BreakableToken {
/// after formatting.
BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
StringRef Prefix, StringRef Postfix,
+ StringRef ContinuationPrefix,
+ StringRef ContinuationPostfix,
unsigned UnbreakableTailLength, bool InPPDirective,
encoding::Encoding Encoding, const FormatStyle &Style);
@@ -274,15 +276,21 @@ class BreakableStringLiteral : public BreakableToken {
protected:
// The column in which the token starts.
unsigned StartColumn;
- // The prefix a line needs after a break in the token.
+ // The prefix a line needs at the start
StringRef Prefix;
- // The postfix a line needs before introducing a break.
+ // The postfix a line needs at the end
StringRef Postfix;
+ // The prefix every line except the first line needs
+ StringRef ContinuationPrefix;
+ // The postfix every line except the last line needs
+ StringRef ContinuationPostfix;
// The token text excluding the prefix and postfix.
StringRef Line;
// Length of the sequence of tokens after this string literal that cannot
// contain line breaks.
unsigned UnbreakableTailLength;
+ // Whether the string prefix and postfix should be repeated on each line
+ // when breaking the string.
};
class BreakableStringLiteralUsingOperators : public BreakableStringLiteral {
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 9ab024a03fbd7..6cfb7a505200e 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -2540,22 +2540,46 @@ ContinuationIndenter::createBreakableToken(const FormatToken &Current,
StringRef Prefix;
StringRef Postfix;
+
// FIXME: Handle whitespace between '_T', '(', '"..."', and ')'.
// FIXME: Store Prefix and Suffix (or PrefixLength and SuffixLength to
// reduce the overhead) for each FormatToken, which is a string, so that we
// don't run multiple checks here on the hot path.
- if ((Text.ends_with(Postfix = "\"") &&
- (Text.starts_with(Prefix = "@\"") || Text.starts_with(Prefix = "\"") ||
- Text.starts_with(Prefix = "u\"") ||
- Text.starts_with(Prefix = "U\"") ||
- Text.starts_with(Prefix = "u8\"") ||
- Text.starts_with(Prefix = "L\""))) ||
- (Text.starts_with(Prefix = "_T(\"") &&
- Text.ends_with(Postfix = "\")"))) {
+ if (Text.starts_with(Prefix = "_T(\"") && Text.ends_with(Postfix = "\")")) {
+ // We need to put `_T("` and `")` on each line because it is a macro
+ llvm::StringRef ContinuationPrefix = Prefix;
+ llvm::StringRef ContinuationPostfix = Postfix;
+
return std::make_unique<BreakableStringLiteral>(
- Current, StartColumn, Prefix, Postfix, UnbreakableTailLength,
- State.Line->InPPDirective, Encoding, Style);
+ Current, StartColumn, Prefix, Postfix, ContinuationPrefix,
+ ContinuationPostfix, UnbreakableTailLength, State.Line->InPPDirective,
+ Encoding, Style);
+ }
+
+ static const auto PostfixRegex =
+ llvm::Regex(R"("(_[a-zA-Z_][a-zA-Z0-9_]*)?$)");
+ llvm::SmallVector<llvm::StringRef, 1> Matches;
+
+ if (PostfixRegex.match(Text, &Matches)) {
+ Postfix = Matches.front();
+
+ if ((Text.starts_with(Prefix = "@\"") ||
+ Text.starts_with(Prefix = "\"") ||
+ Text.starts_with(Prefix = "u\"") ||
+ Text.starts_with(Prefix = "U\"") ||
+ Text.starts_with(Prefix = "u8\"") ||
+ Text.starts_with(Prefix = "L\""))) {
+
+ // Use quotes when breaking the string
+ llvm::StringRef ContinuationPrefix = "\"";
+ llvm::StringRef ContinuationPostfix = "\"";
+ return std::make_unique<BreakableStringLiteral>(
+ Current, StartColumn, Prefix, Postfix, ContinuationPrefix,
+ ContinuationPostfix, UnbreakableTailLength,
+ State.Line->InPPDirective, Encoding, Style);
+ }
}
+
} else if (Current.is(TT_BlockComment)) {
if (Style.ReflowComments == FormatStyle::RCS_Never ||
// If a comment token switches formatting, like
>From 93060fdd0a3b03ed6a9c38a06a5e6819f67c13e4 Mon Sep 17 00:00:00 2001
From: shivrm <shivrm at proton.me>
Date: Fri, 7 Nov 2025 22:06:12 +0530
Subject: [PATCH 2/4] Modify string splitting to repeat prefix
---
clang/lib/Format/BreakableToken.cpp | 2 +-
clang/lib/Format/ContinuationIndenter.cpp | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp
index ff9f2f10ffac0..872660535eb35 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -298,7 +298,7 @@ BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators(
: "\"",
/*Postfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
/*ContinuationPrefix=*/QuoteStyle == SingleQuotes ? "'"
- : QuoteStyle == AtDoubleQuotes ? "@\""
+ : QuoteStyle == AtDoubleQuotes ? "@\""
: "\"",
/*ContinuationPostfix=*/QuoteStyle == SingleQuotes ? "'" : "\"",
UnbreakableTailLength, InPPDirective, Encoding, Style),
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 6cfb7a505200e..5badd6edf4a7b 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -2570,8 +2570,8 @@ ContinuationIndenter::createBreakableToken(const FormatToken &Current,
Text.starts_with(Prefix = "u8\"") ||
Text.starts_with(Prefix = "L\""))) {
- // Use quotes when breaking the string
- llvm::StringRef ContinuationPrefix = "\"";
+ // Repeat the prefix on every line but don't repeat the suffix
+ llvm::StringRef ContinuationPrefix = Prefix;
llvm::StringRef ContinuationPostfix = "\"";
return std::make_unique<BreakableStringLiteral>(
Current, StartColumn, Prefix, Postfix, ContinuationPrefix,
>From 91c9b81e83f82af2103f258b699fa5202fc2af89 Mon Sep 17 00:00:00 2001
From: shivrm <shivrm at proton.me>
Date: Fri, 7 Nov 2025 22:25:39 +0530
Subject: [PATCH 3/4] Fix bug causing repetition of suffixes
---
clang/lib/Format/BreakableToken.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp
index 872660535eb35..dd9d4ecb2f3c7 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -279,8 +279,7 @@ void BreakableStringLiteral::insertBreak(unsigned LineIndex,
WhitespaceManager &Whitespaces) const {
const unsigned SplitEnd = TailOffset + Split.first + Split.second;
- const bool IsLastFragment = SplitEnd >= Line.size() - UnbreakableTailLength;
-
+ const bool IsLastFragment = SplitEnd > Line.size() - UnbreakableTailLength;
StringRef LocalPostfix = (IsLastFragment) ? Postfix : ContinuationPostfix;
Whitespaces.replaceWhitespaceInToken(
>From faa1996fa3238d95386d8377da96689714551a25 Mon Sep 17 00:00:00 2001
From: shivrm <shivrm at proton.me>
Date: Sat, 8 Nov 2025 21:17:21 +0530
Subject: [PATCH 4/4] Add unit tests
---
clang/unittests/Format/FormatTest.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 24235b966399d..4c7593b88202f 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -15838,6 +15838,10 @@ TEST_F(FormatTest, BreaksWideAndNSStringLiterals) {
"@\"NSString literal\";", getGoogleStyleWithColumns(19));
verifyFormat(R"(NSString *s = @"那那那那";)", getLLVMStyleWithColumns(26));
+ EXPECT_EQ("L\"suffixed \"\n"
+ "L\"string\"_s;",
+ format("L\"suffixed string\"_s;", getLLVMStyleWithColumns(19)));
+
// This input makes clang-format try to split the incomplete unicode escape
// sequence, which used to lead to a crasher.
verifyNoCrash(
More information about the cfe-commits
mailing list