[Lldb-commits] [lldb] [lldb] Implement ANSI & Unicode aware string stripping & padding (PR #130878)
Jonas Devlieghere via lldb-commits
lldb-commits at lists.llvm.org
Tue Mar 11 21:49:07 PDT 2025
https://github.com/JDevlieghere updated https://github.com/llvm/llvm-project/pull/130878
>From e50c6aec3cdb5d30fa12b19709329b49036ae924 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Tue, 11 Mar 2025 18:59:50 -0700
Subject: [PATCH] [lldb] Implement ANSI & Unicode aware string stripping &
padding
This PR implements a Unicode- and ANSI-escape-code-aware function to trim
and pad strings. This change is broken out of #121860.
---
lldb/include/lldb/Utility/AnsiTerminal.h | 102 +++++++++++++++++---
lldb/unittests/Utility/AnsiTerminalTest.cpp | 49 ++++++++++
2 files changed, 137 insertions(+), 14 deletions(-)
diff --git a/lldb/include/lldb/Utility/AnsiTerminal.h b/lldb/include/lldb/Utility/AnsiTerminal.h
index 1939c49c7b859..a49f6db26f7b7 100644
--- a/lldb/include/lldb/Utility/AnsiTerminal.h
+++ b/lldb/include/lldb/Utility/AnsiTerminal.h
@@ -70,9 +70,12 @@
#define ANSI_1_CTRL(ctrl1) "\033["##ctrl1 ANSI_ESC_END
#define ANSI_2_CTRL(ctrl1, ctrl2) "\033["##ctrl1 ";"##ctrl2 ANSI_ESC_END
+#define ANSI_ESC_START_LEN 2
+
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Locale.h"
#include <string>
@@ -172,28 +175,99 @@ inline std::string FormatAnsiTerminalCodes(llvm::StringRef format,
return fmt;
}
+inline std::tuple<llvm::StringRef, llvm::StringRef, llvm::StringRef>
+FindNextAnsiSequence(llvm::StringRef str) {
+ llvm::StringRef left;
+ llvm::StringRef right = str;
+
+ while (!right.empty()) {
+ const size_t start = right.find(ANSI_ESC_START);
+
+ // ANSI_ESC_START not found.
+ if (start == llvm::StringRef::npos)
+ return {str, {}, {}};
+
+ // Split the string around the current ANSI_ESC_START.
+ left = str.take_front(left.size() + start);
+ llvm::StringRef escape = right.substr(start);
+ right = right.substr(start + ANSI_ESC_START_LEN + 1);
+
+ const size_t end = right.find_first_not_of("0123456789;");
+
+ // ANSI_ESC_END found.
+ if (end < right.size() && (right[end] == 'm' || right[end] == 'G'))
+ return {left, escape.take_front(ANSI_ESC_START_LEN + 1 + end + 1),
+ right.substr(end + 1)};
+
+ // Maintain the invariant that str == left + right at the start of the loop.
+ left = str.take_front(left.size() + ANSI_ESC_START_LEN + 1);
+ }
+
+ return {str, {}, {}};
+}
+
inline std::string StripAnsiTerminalCodes(llvm::StringRef str) {
std::string stripped;
while (!str.empty()) {
- llvm::StringRef left, right;
-
- std::tie(left, right) = str.split(ANSI_ESC_START);
+ auto [left, escape, right] = FindNextAnsiSequence(str);
stripped += left;
+ str = right;
+ }
+ return stripped;
+}
- // ANSI_ESC_START not found.
- if (left == str && right.empty())
- break;
+inline std::string TrimAndPad(llvm::StringRef str, size_t visible_length,
+ char padding = ' ') {
+ std::string result;
+ result.reserve(visible_length);
+ size_t result_visibile_length = 0;
+
+ // Trim the string to the given visible length.
+ while (!str.empty()) {
+ auto [left, escape, right] = FindNextAnsiSequence(str);
+ str = right;
- size_t end = right.find_first_not_of("0123456789;");
- if (end < right.size() && (right[end] == 'm' || right[end] == 'G')) {
- str = right.substr(end + 1);
- } else {
- // ANSI_ESC_END not found.
- stripped += ANSI_ESC_START;
- str = right;
+ // Compute the length of the string without escape codes. If it fits, append
+ // it together with the invisible escape code.
+ size_t column_width = llvm::sys::locale::columnWidth(left);
+ if (result_visibile_length + column_width <= visible_length) {
+ result.append(left).append(escape);
+ result_visibile_length += column_width;
+ continue;
+ }
+
+ // The string doesn't fit but doesn't contain unicode.
+ // Append the substring that fits.
+ if (column_width == left.size()) {
+ llvm::StringRef trimmed =
+ left.take_front(visible_length - result_visibile_length);
+ result.append(trimmed);
+ result_visibile_length += visible_length - result_visibile_length;
+ continue;
+ }
+
+ // The string doesn't fit but contains unicode. Repeatedly trim the string
+ // until it fits.
+ llvm::StringRef trimmed = left;
+ while (!trimmed.empty()) {
+ // This relies on columnWidth returning -2 for invalid/partial unicode
+ // characters, which after conversion to size_t will be larger than the
+ // visible width.
+ column_width = llvm::sys::locale::columnWidth(trimmed);
+ if (result_visibile_length + column_width <= visible_length) {
+ result.append(trimmed);
+ result_visibile_length += column_width;
+ break;
+ }
+ trimmed = trimmed.drop_back();
}
}
- return stripped;
+
+ // Pad the string.
+ if (result_visibile_length < visible_length)
+ result.append(visible_length - result_visibile_length, padding);
+
+ return result;
}
} // namespace ansi
diff --git a/lldb/unittests/Utility/AnsiTerminalTest.cpp b/lldb/unittests/Utility/AnsiTerminalTest.cpp
index 1ba9565c3f6af..cef73ffaf9136 100644
--- a/lldb/unittests/Utility/AnsiTerminalTest.cpp
+++ b/lldb/unittests/Utility/AnsiTerminalTest.cpp
@@ -67,3 +67,52 @@ TEST(AnsiTerminal, InvalidEscapeCode) {
EXPECT_EQ("abc\x1B[31kabcabc",
ansi::StripAnsiTerminalCodes("abc\x1B[31kabc\x1B[0mabc"));
}
+
+TEST(AnsiTerminal, FindNextAnsiSequenceBasic) {
+ auto [left, escape, right] = ansi::FindNextAnsiSequence("foo\x1B[31mbar");
+ EXPECT_EQ("foo", left);
+ EXPECT_EQ("\x1B[31m", escape);
+ EXPECT_EQ("bar", right);
+}
+
+TEST(AnsiTerminal, FindNextAnsiSequenceIncompleteStart) {
+ auto [left, escape, right] =
+ ansi::FindNextAnsiSequence("foo\x1B[bar\x1B[31mbaz");
+ EXPECT_EQ("foo\x1B[bar", left);
+ EXPECT_EQ("\x1B[31m", escape);
+ EXPECT_EQ("baz", right);
+}
+
+TEST(AnsiTerminal, FindNextAnsiSequenceEscapeStart) {
+ auto [left, escape, right] = ansi::FindNextAnsiSequence("\x1B[31mfoo");
+ EXPECT_EQ("", left);
+ EXPECT_EQ("\x1B[31m", escape);
+ EXPECT_EQ("foo", right);
+}
+
+TEST(AnsiTerminal, TrimAndPad) {
+ // Test basic ASCII.
+ EXPECT_EQ(" ", ansi::TrimAndPad("", 5));
+ EXPECT_EQ("foo ", ansi::TrimAndPad("foo", 5));
+ EXPECT_EQ("fooba", ansi::TrimAndPad("fooba", 5));
+ EXPECT_EQ("fooba", ansi::TrimAndPad("foobar", 5));
+
+ // Simple test that ANSI escape codes don't contribute to the visible width.
+ EXPECT_EQ("\x1B[30m ", ansi::TrimAndPad("\x1B[30m", 5));
+ EXPECT_EQ("\x1B[30mfoo ", ansi::TrimAndPad("\x1B[30mfoo", 5));
+ EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfooba", 5));
+ EXPECT_EQ("\x1B[30mfooba", ansi::TrimAndPad("\x1B[30mfoobar", 5));
+
+ // Test that we include as many escape codes as we can.
+ EXPECT_EQ("fooba\x1B[30m", ansi::TrimAndPad("fooba\x1B[30m", 5));
+ EXPECT_EQ("fooba\x1B[30m\x1B[34m",
+ ansi::TrimAndPad("fooba\x1B[30m\x1B[34m", 5));
+ EXPECT_EQ("fooba\x1B[30m\x1B[34m",
+ ansi::TrimAndPad("fooba\x1B[30m\x1B[34mr", 5));
+
+ // Test Unicode.
+ EXPECT_EQ("❤️ ", ansi::TrimAndPad("❤️", 5));
+ EXPECT_EQ(" ❤️", ansi::TrimAndPad(" ❤️", 5));
+ EXPECT_EQ("12❤️4❤️", ansi::TrimAndPad("12❤️4❤️", 5));
+ EXPECT_EQ("12❤️45", ansi::TrimAndPad("12❤️45❤️", 5));
+}
More information about the lldb-commits
mailing list