[clang] [clang-format] Handle Java text blocks (PR #141334)
Owen Pan via cfe-commits
cfe-commits at lists.llvm.org
Sat May 24 12:03:30 PDT 2025
https://github.com/owenca updated https://github.com/llvm/llvm-project/pull/141334
From 470eca4b4d963bf5c1ba87fb2f22620eb717c848 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano at gmail.com>
Date: Fri, 23 May 2025 23:21:12 -0700
Subject: [PATCH 1/2] [clang-format] Handle Java text blocks
Fix #61954
---
clang/lib/Format/FormatTokenLexer.cpp | 45 ++++++++++++++++++++
clang/lib/Format/FormatTokenLexer.h | 2 +
clang/unittests/Format/FormatTestJava.cpp | 52 +++++++++++++++++++++++
3 files changed, 99 insertions(+)
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 864486a9b878d..31c3613c8b083 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -694,6 +694,49 @@ bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
return true;
}
+void FormatTokenLexer::tryParseJavaTextBlock() {
+ if (FormatTok->TokenText != "\"\"")
+ return;
+
+ const auto *Str = Lex->getBufferLocation();
+ const auto *End = Lex->getBuffer().end();
+
+ if (Str == End || *Str != '\"')
+ return;
+
+ // Skip the `"""` that begins a text block.
+ const auto *S = Str + 1;
+
+ // From docs.oracle.com/en/java/javase/15/text-blocks/#text-block-syntax:
+ // A text block begins with three double-quote characters followed by a line
+ // terminator.
+ while (S < End && *S != '\n') {
+ if (!isblank(*S))
+ return;
+ ++S;
+ }
+
+ // Find the `"""` that ends the text block.
+ for (int Count = 0; Count < 3; ++S) {
+ if (S == End)
+ return;
+
+ switch (*S) {
+ case '\\':
+ Count = -1;
+ break;
+ case '\"':
+ ++Count;
+ break;
+ default:
+ Count = 0;
+ }
+ }
+
+ // Skip the text block.
+ resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
+}
+
// Tries to parse a JavaScript Regex literal starting at the current token,
// if that begins with a slash and is in a location where JavaScript allows
// regex literals. Changes the current token to a regex literal and updates
@@ -1374,6 +1417,8 @@ FormatToken *FormatTokenLexer::getNextToken() {
FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
++Column;
StateStack.push(LexerState::TOKEN_STASHED);
+ } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
+ tryParseJavaTextBlock();
}
if (Style.isVerilog() && Tokens.size() > 0 &&
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 105847b126e20..026383db1fe6c 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -72,6 +72,8 @@ class FormatTokenLexer {
bool canPrecedeRegexLiteral(FormatToken *Prev);
+ void tryParseJavaTextBlock();
+
// Tries to parse a JavaScript Regex literal starting at the current token,
// if that begins with a slash and is in a location where JavaScript allows
// regex literals. Changes the current token to a regex literal and updates
diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
index e01c1d6d7e684..35ee257d015d3 100644
--- a/clang/unittests/Format/FormatTestJava.cpp
+++ b/clang/unittests/Format/FormatTestJava.cpp
@@ -791,6 +791,58 @@ TEST_F(FormatTestJava, AlignCaseArrows) {
Style);
}
+TEST_F(FormatTestJava, TextBlock) {
+ verifyNoChange("String myStr = \"\"\"\n"
+ "hello\n"
+ "there\n"
+ "\"\"\";");
+
+ verifyNoChange("String tb = \"\"\"\n"
+ " the new\"\"\";");
+
+ verifyNoChange("System.out.println(\"\"\"\n"
+ " This is the first line\n"
+ " This is the second line\n"
+ " \"\"\");");
+
+ verifyNoChange("void writeHTML() {\n"
+ " String html = \"\"\" \n"
+ " <html>\n"
+ " <p>Hello World.</p>\n"
+ " </html>\n"
+ "\"\"\";\n"
+ " writeOutput(html);\n"
+ "}");
+
+ verifyNoChange("String colors = \"\"\"\t\n"
+ " red\n"
+ " green\n"
+ " blue\"\"\".indent(4);");
+
+ verifyNoChange("String code = \"\"\"\n"
+ " String source = \\\"\"\"\n"
+ " String message = \"Hello, World!\";\n"
+ " System.out.println(message);\n"
+ " \\\"\"\";\n"
+ " \"\"\";");
+
+ verifyNoChange(
+ "class Outer {\n"
+ " void printPoetry() {\n"
+ " String lilacs = \"\"\"\n"
+ "Passing the apple-tree blows of white and pink in the orchards\n"
+ "\"\"\";\n"
+ " System.out.println(lilacs);\n"
+ " }\n"
+ "}");
+
+ verifyNoChange("String name = \"\"\"\n"
+ " red\n"
+ " green\n"
+ " blue\\\n"
+ " \"\"\";");
+}
+
} // namespace
} // namespace test
} // namespace format
From e3462be23effcf2eb016efaafaa47b6a0de09b2d Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano at gmail.com>
Date: Sat, 24 May 2025 12:01:25 -0700
Subject: [PATCH 2/2] Handle Windows line endings
---
clang/lib/Format/FormatTokenLexer.cpp | 8 ++++++--
clang/unittests/Format/FormatTestJava.cpp | 2 +-
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 31c3613c8b083..ababfb743bb4d 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -710,8 +710,12 @@ void FormatTokenLexer::tryParseJavaTextBlock() {
// From docs.oracle.com/en/java/javase/15/text-blocks/#text-block-syntax:
// A text block begins with three double-quote characters followed by a line
// terminator.
- while (S < End && *S != '\n') {
- if (!isblank(*S))
+ while (S < End) {
+ if (*S == '\n') {
+ ++S;
+ break;
+ }
+ if (!isspace(*S))
return;
++S;
}
diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
index 35ee257d015d3..134ecf052c844 100644
--- a/clang/unittests/Format/FormatTestJava.cpp
+++ b/clang/unittests/Format/FormatTestJava.cpp
@@ -836,7 +836,7 @@ TEST_F(FormatTestJava, TextBlock) {
" }\n"
"}");
- verifyNoChange("String name = \"\"\"\n"
+ verifyNoChange("String name = \"\"\"\r\n"
" red\n"
" green\n"
" blue\\\n"
More information about the cfe-commits mailing list