[clang] [clang-format] Handle Trailing Whitespace After Line Continuation (P2223R2) (PR #145243)

Naveen Seth Hanig via cfe-commits cfe-commits at lists.llvm.org
Tue Jun 24 07:18:44 PDT 2025


https://github.com/naveen-seth updated https://github.com/llvm/llvm-project/pull/145243

>From 8cee6b33c54aca0ea69b3ad291287464523a3d7a Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig at outlook.com>
Date: Sun, 22 Jun 2025 18:06:07 +0200
Subject: [PATCH 1/4] [clang-format] Handle Trailing Whitespace After Line
 Continuation (P2223R2)

Fixes #145226.

Implement P2223R2 in clang-format to correctly handle cases where a
backslash '\' is followed by trailing whitespace before the newline.
Previously, clang-format failed to properly detect and
handle such cases, leading to misformatted code.

With this, clang-format matches the behavior already implemented in
Clang's lexer and DependencyDirectivesScanner.cpp, which allow
trailing whitespace after a line continuation in any C++ standard.
---
 clang/docs/ReleaseNotes.rst                   |  3 ++
 clang/lib/Format/FormatTokenLexer.cpp         | 31 ++++++++++++++-----
 .../splice-trailing-whitespace-p2223r2.cpp    | 14 +++++++++
 3 files changed, 40 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/Format/splice-trailing-whitespace-p2223r2.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 96477ef6ddc9a..7c8e231655e86 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1025,6 +1025,9 @@ clang-format
   ``enum`` enumerator lists.
 - Add ``OneLineFormatOffRegex`` option for turning formatting off for one line.
 - Add ``SpaceAfterOperatorKeyword`` option.
+- Support trailing whitespace in line splicing.
+  (P2223R2 <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p2223r2.pdf>_, #GH145226)
+
 
 clang-refactor
 --------------
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 4cc4f5f22db0d..bcc8e6ffd91ab 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -14,6 +14,7 @@
 
 #include "FormatTokenLexer.h"
 #include "FormatToken.h"
+#include "clang/Basic/CharInfo.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
@@ -1205,14 +1206,23 @@ static size_t countLeadingWhitespace(StringRef Text) {
   while (Cur < End) {
     if (isspace(Cur[0])) {
       ++Cur;
-    } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
-      // A '\' followed by a newline always escapes the newline, regardless
-      // of whether there is another '\' before it.
+    } else if (Cur[0] == '\\') {
+      // A '\' followed by a optional horizontal whitespace (P22232R2) and then
+      // newline  always escapes the newline, regardless  of whether there is
+      // another '\' before it.
       // The source has a null byte at the end. So the end of the entire input
       // isn't reached yet. Also the lexer doesn't break apart an escaped
       // newline.
-      assert(End - Cur >= 2);
-      Cur += 2;
+      const unsigned char *Lookahead = Cur + 1;
+      while (isHorizontalWhitespace(*Lookahead))
+        ++Lookahead;
+      if (*Lookahead == '\n' || *Lookahead == '\r') {
+        // Splice found, consume it.
+        Cur = Lookahead + 1;
+        continue;
+      }
+      // No line splice found; the '\' is a token.
+      break;
     } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
                (Cur[3] == '\n' || Cur[3] == '\r')) {
       // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
@@ -1295,13 +1305,18 @@ FormatToken *FormatTokenLexer::getNextToken() {
       case '/':
         // The text was entirely whitespace when this loop was entered. Thus
         // this has to be an escape sequence.
-        assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
-               Text.substr(i, 4) == "\?\?/\r" ||
+        assert(Text.substr(i, 4) == "\?\?/\r" ||
                Text.substr(i, 4) == "\?\?/\n" ||
                (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
                            Text.substr(i - 1, 4) == "\?\?/\n")) ||
                (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
-                           Text.substr(i - 2, 4) == "\?\?/\n")));
+                           Text.substr(i - 2, 4) == "\?\?/\n")) ||
+               (Text[i] == '\\' && [&]() -> bool {
+                 size_t j = i + 1;
+                 while (j < Text.size() && isHorizontalWhitespace(Text[j]))
+                   ++j;
+                 return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
+               }()));
         InEscape = true;
         break;
       default:
diff --git a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp
new file mode 100644
index 0000000000000..3cc08cc631965
--- /dev/null
+++ b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp
@@ -0,0 +1,14 @@
+// RUN: grep -Ev "// *[A-Z-]+:" %s \
+// RUN:   | clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' \
+// RUN:   | FileCheck -strict-whitespace %s
+
+// CHECK: {{^#define TAG\(\.\.\.\) \\}}
+// CHECK: {{^  struct a \{\};}}
+// There is whitespace following  v  this backslash!
+#define TAG(...)  struct a {     \     
+  };
+
+// CHECK: {{^int   i;}}
+// The comment below eats its following line because of the line splice.
+// I also have trailing whitespace. Nom nom nom \     
+int   i;

>From 26d525e3ff75885390f5d46ccb5c044fdbd8ddde Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig at outlook.com>
Date: Sun, 22 Jun 2025 20:23:51 +0200
Subject: [PATCH 2/4] Remove deprecated usage of grep from regression test

---
 clang/test/Format/splice-trailing-whitespace-p2223r2.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp
index 3cc08cc631965..4e9e2612af78a 100644
--- a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp
+++ b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp
@@ -1,5 +1,4 @@
-// RUN: grep -Ev "// *[A-Z-]+:" %s \
-// RUN:   | clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' \
+// RUN: clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' %s \
 // RUN:   | FileCheck -strict-whitespace %s
 
 // CHECK: {{^#define TAG\(\.\.\.\) \\}}

>From 1f54d834d94267a1e23896ef0f4addfb86f8a143 Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig at outlook.com>
Date: Mon, 23 Jun 2025 02:26:26 +0200
Subject: [PATCH 3/4] Use unit testing instead of regression testing

---
 .../splice-trailing-whitespace-p2223r2.cpp    | 13 -----------
 clang/unittests/Format/FormatTest.cpp         | 23 +++++++++++++++++++
 2 files changed, 23 insertions(+), 13 deletions(-)
 delete mode 100644 clang/test/Format/splice-trailing-whitespace-p2223r2.cpp

diff --git a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp
deleted file mode 100644
index 4e9e2612af78a..0000000000000
--- a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' %s \
-// RUN:   | FileCheck -strict-whitespace %s
-
-// CHECK: {{^#define TAG\(\.\.\.\) \\}}
-// CHECK: {{^  struct a \{\};}}
-// There is whitespace following  v  this backslash!
-#define TAG(...)  struct a {     \     
-  };
-
-// CHECK: {{^int   i;}}
-// The comment below eats its following line because of the line splice.
-// I also have trailing whitespace. Nom nom nom \     
-int   i;
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index c0633ba3c29b3..23d2bc7b0c442 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -25768,6 +25768,29 @@ TEST_F(FormatTest, OperatorPassedAsAFunctionPtr) {
   verifyFormat("foo(operator, , -42);", Style);
 }
 
+TEST_F(FormatTest, LineSpliceWithTrailingWhitespace) {
+  // Test that each sequence of a backslash (\) immediately followed by zero or
+  // more horizontal whitespace characters and then a new-line character is
+  // treated as a single logical line while formatting (as per P2223R2).
+  FormatStyle Style = getLLVMStyle();
+  Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign;
+  Style.UseTab = FormatStyle::UT_Never;
+
+  verifyFormat("int i;",
+               "  \\  \n"
+               "  int i;",
+               Style);
+  verifyFormat("#define FOO(args) \\\n  struct a {};\n",
+               "#define FOO( args )   \\   \n"
+               "struct a{\\\t\t\t\n"
+               "  };\n",
+               Style);
+  verifyFormat("comment here",
+               "comment \\ \n"
+               "here",
+               Style);
+}
+
 TEST_F(FormatTest, WhitespaceSensitiveMacros) {
   FormatStyle Style = getLLVMStyle();
   Style.WhitespaceSensitiveMacros.push_back("FOO");

>From ed80aca9eb4c1d02d4ef48ea08995159f3832444 Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig at outlook.com>
Date: Tue, 24 Jun 2025 16:11:29 +0200
Subject: [PATCH 4/4] Address review feedback

---
 clang/docs/ReleaseNotes.rst           |  3 ---
 clang/lib/Format/FormatTokenLexer.cpp | 21 +++++++++------------
 clang/unittests/Format/FormatTest.cpp | 20 ++++++--------------
 3 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7c8e231655e86..96477ef6ddc9a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1025,9 +1025,6 @@ clang-format
   ``enum`` enumerator lists.
 - Add ``OneLineFormatOffRegex`` option for turning formatting off for one line.
 - Add ``SpaceAfterOperatorKeyword`` option.
-- Support trailing whitespace in line splicing.
-  (P2223R2 <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p2223r2.pdf>_, #GH145226)
-
 
 clang-refactor
 --------------
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index bcc8e6ffd91ab..06f68ec8b0fc1 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -1204,25 +1204,22 @@ static size_t countLeadingWhitespace(StringRef Text) {
   const unsigned char *const End = Text.bytes_end();
   const unsigned char *Cur = Begin;
   while (Cur < End) {
-    if (isspace(Cur[0])) {
+    if (isWhitespace(Cur[0])) {
       ++Cur;
     } else if (Cur[0] == '\\') {
-      // A '\' followed by a optional horizontal whitespace (P22232R2) and then
-      // newline  always escapes the newline, regardless  of whether there is
-      // another '\' before it.
+      // A backslash followed by optional horizontal whitespaces (P22232R2) and
+      // then a newline always escapes the newline.
       // The source has a null byte at the end. So the end of the entire input
       // isn't reached yet. Also the lexer doesn't break apart an escaped
       // newline.
-      const unsigned char *Lookahead = Cur + 1;
+      const auto *Lookahead = Cur + 1;
       while (isHorizontalWhitespace(*Lookahead))
         ++Lookahead;
-      if (*Lookahead == '\n' || *Lookahead == '\r') {
-        // Splice found, consume it.
-        Cur = Lookahead + 1;
-        continue;
-      }
-      // No line splice found; the '\' is a token.
-      break;
+      // No line splice found; the backslash is a token.
+      if (!isVerticalWhitespace(*Lookahead))
+        break;
+      // Splice found, consume it.
+      Cur = Lookahead + 1;
     } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
                (Cur[3] == '\n' || Cur[3] == '\r')) {
       // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 23d2bc7b0c442..a05bf8305716b 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -25769,25 +25769,17 @@ TEST_F(FormatTest, OperatorPassedAsAFunctionPtr) {
 }
 
 TEST_F(FormatTest, LineSpliceWithTrailingWhitespace) {
-  // Test that each sequence of a backslash (\) immediately followed by zero or
-  // more horizontal whitespace characters and then a new-line character is
-  // treated as a single logical line while formatting (as per P2223R2).
-  FormatStyle Style = getLLVMStyle();
+  auto Style = getLLVMStyle();
   Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign;
   Style.UseTab = FormatStyle::UT_Never;
 
-  verifyFormat("int i;",
-               "  \\  \n"
-               "  int i;",
-               Style);
-  verifyFormat("#define FOO(args) \\\n  struct a {};\n",
+  verifyFormat("int i;", "  \\  \n"
+                         "  int i;");
+  verifyFormat("#define FOO(args) \\\n"
+               "  struct a {};",
                "#define FOO( args )   \\   \n"
                "struct a{\\\t\t\t\n"
-               "  };\n",
-               Style);
-  verifyFormat("comment here",
-               "comment \\ \n"
-               "here",
+               "  };",
                Style);
 }
 



More information about the cfe-commits mailing list