[Lldb-commits] [lldb] [lldb] Implement ANSI & Unicode aware string stripping & padding (PR #130878)

Wed Mar 12 08:44:24 PDT 2025

================
@@ -172,28 +175,99 @@ inline std::string FormatAnsiTerminalCodes(llvm::StringRef format,
   return fmt;
 }
 
+inline std::tuple<llvm::StringRef, llvm::StringRef, llvm::StringRef>
+FindNextAnsiSequence(llvm::StringRef str) {
+  llvm::StringRef left;
+  llvm::StringRef right = str;
+
+  while (!right.empty()) {
+    const size_t start = right.find(ANSI_ESC_START);
+
+    // ANSI_ESC_START not found.
+    if (start == llvm::StringRef::npos)
+      return {str, {}, {}};
+
+    // Split the string around the current ANSI_ESC_START.
+    left = str.take_front(left.size() + start);
+    llvm::StringRef escape = right.substr(start);
+    right = right.substr(start + ANSI_ESC_START_LEN + 1);
+
+    const size_t end = right.find_first_not_of("0123456789;");
+
+    // ANSI_ESC_END found.
+    if (end < right.size() && (right[end] == 'm' || right[end] == 'G'))
+      return {left, escape.take_front(ANSI_ESC_START_LEN + 1 + end + 1),
+              right.substr(end + 1)};
+
+    // Maintain the invariant that str == left + right at the start of the loop.
+    left = str.take_front(left.size() + ANSI_ESC_START_LEN + 1);
+  }
+
+  return {str, {}, {}};
+}
+
 inline std::string StripAnsiTerminalCodes(llvm::StringRef str) {
   std::string stripped;
   while (!str.empty()) {
-    llvm::StringRef left, right;
-
-    std::tie(left, right) = str.split(ANSI_ESC_START);
+    auto [left, escape, right] = FindNextAnsiSequence(str);
     stripped += left;
+    str = right;
+  }
+  return stripped;
+}
 
-    // ANSI_ESC_START not found.
-    if (left == str && right.empty())
-      break;
+inline std::string TrimAndPad(llvm::StringRef str, size_t visible_length,
+                              char padding = ' ') {
+  std::string result;
+  result.reserve(visible_length);
+  size_t result_visibile_length = 0;
+
+  // Trim the string to the given visible length.
+  while (!str.empty()) {
+    auto [left, escape, right] = FindNextAnsiSequence(str);
+    str = right;
 
-    size_t end = right.find_first_not_of("0123456789;");
-    if (end < right.size() && (right[end] == 'm' || right[end] == 'G')) {
-      str = right.substr(end + 1);
-    } else {
-      // ANSI_ESC_END not found.
-      stripped += ANSI_ESC_START;
-      str = right;
+    // Compute the length of the string without escape codes. If it fits, append
+    // it together with the invisible escape code.
+    size_t column_width = llvm::sys::locale::columnWidth(left);
+    if (result_visibile_length + column_width <= visible_length) {
+      result.append(left).append(escape);
+      result_visibile_length += column_width;
+      continue;
+    }
+
+    // The string doesn't fit but doesn't contain unicode.
+    // Append the substring that fits.
+    if (column_width == left.size()) {
+      llvm::StringRef trimmed =
+          left.take_front(visible_length - result_visibile_length);
+      result.append(trimmed);
+      result_visibile_length += visible_length - result_visibile_length;
+      continue;
+    }
----------------
JDevlieghere wrote:

Good point, I didn't consider that. 

https://github.com/llvm/llvm-project/pull/130878