[libcxx-commits] [libcxx] [llvm] [LIT] remove `to_unicode`, `to_string`, and `to_bytes` helpers (PR #165950)

Tomohiro Kashiwada via libcxx-commits libcxx-commits at lists.llvm.org
Wed Nov 5 01:46:05 PST 2025


https://github.com/kikairoya updated https://github.com/llvm/llvm-project/pull/165950

>From 6252ecd8ac0e448d55b39d8c2b70bb033eefc753 Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Fri, 17 Oct 2025 20:41:22 +0900
Subject: [PATCH 01/10] [LIT] replace `lit.util.mkdir` with
 `pathlib.Path.mkdir`

---
 llvm/utils/lit/lit/TestRunner.py              | 15 ++++---
 llvm/utils/lit/lit/util.py                    | 40 +++----------------
 .../shtest-glob/example_dir1.input/empty      |  0
 .../shtest-glob/example_dir2.input/empty      |  0
 .../tests/Inputs/shtest-glob/glob-mkdir.txt   |  5 +++
 llvm/utils/lit/tests/shtest-glob.py           |  9 +++--
 6 files changed, 22 insertions(+), 47 deletions(-)
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index a7e2705f609af..f1346c257ec75 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -462,16 +462,15 @@ def executeBuiltinMkdir(cmd, cmd_shenv):
     stderr = StringIO()
     exitCode = 0
     for dir in args:
-        cwd = cmd_shenv.cwd
-        dir = to_unicode(dir) if kIsWindows else to_bytes(dir)
-        cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd)
-        if not os.path.isabs(dir):
-            dir = lit.util.abs_path_preserve_drive(os.path.join(cwd, dir))
+        dir = pathlib.Path(dir)
+        cwd = pathlib.Path(to_unicode(cmd_shenv.cwd))
+        if not dir.is_absolute():
+            dir = lit.util.abs_path_preserve_drive(cwd / dir)
         if parent:
-            lit.util.mkdir_p(dir)
+            dir.mkdir(parents=True, exist_ok=True)
         else:
             try:
-                lit.util.mkdir(dir)
+                dir.mkdir(exist_ok=True)
             except OSError as err:
                 stderr.write("Error: 'mkdir' command failed, %s\n" % str(err))
                 exitCode = 1
@@ -2395,7 +2394,7 @@ def runOnce(
         return out, err, exitCode, timeoutInfo, status
 
     # Create the output directory if it does not already exist.
-    lit.util.mkdir_p(os.path.dirname(tmpBase))
+    pathlib.Path(tmpBase).parent.mkdir(parents=True, exist_ok=True)
 
     # Re-run failed tests up to test.allowed_retries times.
     execdir = os.path.dirname(test.getExecPath())
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index ce4c3c2df3436..e4e031b3e0898 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -5,6 +5,7 @@
 import math
 import numbers
 import os
+import pathlib
 import platform
 import re
 import signal
@@ -131,48 +132,17 @@ def abs_path_preserve_drive(path):
         # Since Python 3.8, os.path.realpath resolves sustitute drives,
         # so we should not use it. In Python 3.7, os.path.realpath
         # was implemented as os.path.abspath.
+        if isinstance(path, pathlib.Path):
+            return path.absolute()
         return os.path.abspath(path)
     else:
         # On UNIX, the current directory always has symbolic links resolved,
         # so any program accepting relative paths cannot preserve symbolic
         # links in paths and we should always use os.path.realpath.
+        if isinstance(path, pathlib.Path):
+            return path.resolve()
         return os.path.realpath(path)
 
-def mkdir(path):
-    try:
-        if platform.system() == "Windows":
-            from ctypes import windll
-            from ctypes import GetLastError, WinError
-
-            path = os.path.abspath(path)
-            # Make sure that the path uses backslashes here, in case
-            # python would have happened to use forward slashes, as the
-            # NT path format only supports backslashes.
-            path = path.replace("/", "\\")
-            NTPath = to_unicode(r"\\?\%s" % path)
-            if not windll.kernel32.CreateDirectoryW(NTPath, None):
-                raise WinError(GetLastError())
-        else:
-            os.mkdir(path)
-    except OSError:
-        e = sys.exc_info()[1]
-        # ignore EEXIST, which may occur during a race condition
-        if e.errno != errno.EEXIST:
-            raise
-
-
-def mkdir_p(path):
-    """mkdir_p(path) - Make the "path" directory, if it does not exist; this
-    will also make directories for any missing parent directories."""
-    if not path or os.path.exists(path):
-        return
-
-    parent = os.path.dirname(path)
-    if parent != path:
-        mkdir_p(parent)
-
-    mkdir(path)
-
 
 def listdir_files(dirname, suffixes=None, exclude_filenames=None, prefixes=None):
     """Yields files in a directory.
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
index d1329f5dbfaae..71972411bc4cf 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
+++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt
@@ -1,2 +1,7 @@
 ## Tests glob pattern handling in the mkdir command.
+
+## This mkdir should fail because the `example_file*.input`s are regular files.
 # RUN: not mkdir %S/example_file*.input
+
+## This mkdir should succeed (so RUN should fail) because the `example_dir*.input`s that already exist are directories.
+# RUN: not mkdir %S/example_dir*.input
diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py
index ae90f31907d49..aa4705b634a7d 100644
--- a/llvm/utils/lit/tests/shtest-glob.py
+++ b/llvm/utils/lit/tests/shtest-glob.py
@@ -1,12 +1,13 @@
 ## Tests glob pattern handling in echo command.
 
 # RUN: not %{lit} -a -v %{inputs}/shtest-glob \
-# RUN: | FileCheck -dump-input=fail -match-full-lines %s
-#
+# RUN: | FileCheck -dump-input=fail -match-full-lines --implicit-check-not=Error: %s
 # END.
 
 # CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}})
 # CHECK: TypeError: string argument expected, got 'GlobItem'
 
-# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}})
-# CHECK: # error: command failed with exit status: 1
+# CHECK:      FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}})
+# CHECK:      # | Error: 'mkdir' command failed, {{.*}}example_file1.input'
+# CHECK-NEXT: # | Error: 'mkdir' command failed, {{.*}}example_file2.input'
+# CHECK:      # error: command failed with exit status: 1

>From 4e82cd7f92c02b05324f4b2e10603b993ae5484e Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Sat, 1 Nov 2025 09:05:48 +0900
Subject: [PATCH 02/10] remove to_unicode

---
 llvm/utils/lit/lit/TestRunner.py |  9 ++-------
 llvm/utils/lit/lit/util.py       | 14 --------------
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index f1346c257ec75..89a71add93458 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -21,7 +21,7 @@
 import lit.ShUtil as ShUtil
 import lit.Test as Test
 import lit.util
-from lit.util import to_bytes, to_string, to_unicode
+from lit.util import to_bytes, to_string
 from lit.BooleanExpression import BooleanExpression
 
 
@@ -463,7 +463,7 @@ def executeBuiltinMkdir(cmd, cmd_shenv):
     exitCode = 0
     for dir in args:
         dir = pathlib.Path(dir)
-        cwd = pathlib.Path(to_unicode(cmd_shenv.cwd))
+        cwd = pathlib.Path(cmd_shenv.cwd)
         if not dir.is_absolute():
             dir = lit.util.abs_path_preserve_drive(cwd / dir)
         if parent:
@@ -508,8 +508,6 @@ def on_rm_error(func, path, exc_info):
     exitCode = 0
     for path in args:
         cwd = cmd_shenv.cwd
-        path = to_unicode(path) if kIsWindows else to_bytes(path)
-        cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd)
         if not os.path.isabs(path):
             path = lit.util.abs_path_preserve_drive(os.path.join(cwd, path))
         if force and not os.path.exists(path):
@@ -703,9 +701,6 @@ def processRedirects(cmd, stdin_source, cmd_shenv, opened_files):
         else:
             # Make sure relative paths are relative to the cwd.
             redir_filename = os.path.join(cmd_shenv.cwd, name)
-            redir_filename = (
-                to_unicode(redir_filename) if kIsWindows else to_bytes(redir_filename)
-            )
             fd = open(redir_filename, mode)
         # Workaround a Win32 and/or subprocess bug when appending.
         #
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index e4e031b3e0898..f85cd7fda0317 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -89,20 +89,6 @@ def to_string(b):
         raise TypeError("not sure how to convert %s to %s" % (type(b), str))
 
 
-def to_unicode(s):
-    """Return the parameter as type which supports unicode, possibly decoding
-    it.
-
-    In Python2, this is the unicode type. In Python3 it's the str type.
-
-    """
-    if isinstance(s, bytes):
-        # In Python2, this branch is taken for both 'str' and 'bytes'.
-        # In Python3, this branch is taken only for 'bytes'.
-        return s.decode("utf-8")
-    return s
-
-
 def usable_core_count():
     """Return the number of cores the current process can use, if supported.
     Otherwise, return the total number of cores (like `os.cpu_count()`).

>From fa490a657369d203b5d77b92e0934ba098acbb19 Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Sat, 1 Nov 2025 09:39:25 +0900
Subject: [PATCH 03/10] remove to_string from the runner

---
 llvm/utils/lit/lit/TestRunner.py | 35 ++++++++++++++------------------
 llvm/utils/lit/lit/util.py       |  6 +++---
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 89a71add93458..2cbceae141f89 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -21,7 +21,7 @@
 import lit.ShUtil as ShUtil
 import lit.Test as Test
 import lit.util
-from lit.util import to_bytes, to_string
+from lit.util import to_bytes
 from lit.BooleanExpression import BooleanExpression
 
 
@@ -391,18 +391,14 @@ def executeBuiltinEcho(cmd, shenv):
     # Some tests have un-redirected echo commands to help debug test failures.
     # Buffer our output and return it to the caller.
     is_redirected = True
-    encode = lambda x: x
     if stdout == subprocess.PIPE:
         is_redirected = False
         stdout = StringIO()
     elif kIsWindows:
-        # Reopen stdout in binary mode to avoid CRLF translation. The versions
-        # of echo we are replacing on Windows all emit plain LF, and the LLVM
-        # tests now depend on this.
-        # When we open as binary, however, this also means that we have to write
-        # 'bytes' objects to stdout instead of 'str' objects.
-        encode = lit.util.to_bytes
-        stdout = open(stdout.name, stdout.mode + "b")
+        # Reopen stdout with specifying `newline` to avoid CRLF translation.
+        # The versions of echo we are replacing on Windows all emit plain LF,
+        # and the LLVM tests now depend on this.
+        stdout = open(stdout.name, stdout.mode, newline='')
         opened_files.append((None, None, stdout, None))
 
     # Implement echo flags. We only support -e and -n, and not yet in
@@ -423,16 +419,15 @@ def maybeUnescape(arg):
         if not interpret_escapes:
             return arg
 
-        arg = lit.util.to_bytes(arg)
-        return arg.decode("unicode_escape")
+        return arg.encode("utf-8").decode("unicode_escape")
 
     if args:
         for arg in args[:-1]:
-            stdout.write(encode(maybeUnescape(arg)))
-            stdout.write(encode(" "))
-        stdout.write(encode(maybeUnescape(args[-1])))
+            stdout.write(maybeUnescape(arg))
+            stdout.write(" ")
+        stdout.write(maybeUnescape(args[-1]))
     if write_newline:
-        stdout.write(encode("\n"))
+        stdout.write("\n")
 
     for (name, mode, f, path) in opened_files:
         f.close()
@@ -1062,14 +1057,14 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
             if out is None:
                 out = ""
             else:
-                out = to_string(out.decode("utf-8", errors="replace"))
+                out = out.decode("utf-8", errors="replace")
         except:
             out = str(out)
         try:
             if err is None:
                 err = ""
             else:
-                err = to_string(err.decode("utf-8", errors="replace"))
+                err = err.decode("utf-8", errors="replace")
         except:
             err = str(err)
 
@@ -1261,7 +1256,7 @@ def executeScriptInternal(
 
         # Add the command output, if redirected.
         for (name, path, data) in result.outputFiles:
-            data = to_string(data.decode("utf-8", errors="replace"))
+            data = data.decode("utf-8", errors="replace")
             out += formatOutput(f"redirected output from '{name}'", data, limit=1024)
         if result.stdout.strip():
             out += formatOutput("command stdout", result.stdout)
@@ -1471,8 +1466,8 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
             keyword, ln = match.groups()
             yield (
                 line_number,
-                to_string(keyword.decode("utf-8")),
-                to_string(ln.decode("utf-8").rstrip("\r")),
+                keyword.decode("utf-8"),
+                ln.decode("utf-8").rstrip("\r"),
             )
     finally:
         f.close()
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index f85cd7fda0317..ee5823ec0aa27 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -327,7 +327,7 @@ def executeCommand(
 
     """
     if input is not None:
-        input = to_bytes(input)
+        input = input.encode("utf-8")
     err_out = subprocess.STDOUT if redirect_stderr else subprocess.PIPE
     p = subprocess.Popen(
         command,
@@ -363,8 +363,8 @@ def killProcess():
             timerObject.cancel()
 
     # Ensure the resulting output is always of string type.
-    out = to_string(out)
-    err = "" if redirect_stderr else to_string(err)
+    out = out.decode("utf-8", errors="replace")
+    err = "" if redirect_stderr else err.decode("utf-8", errors="replace")
 
     if hitTimeOut[0]:
         raise ExecuteCommandTimeoutException(

>From 14ca5b6fea26f1d816bcb1c9199fc116077b9cf3 Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Sat, 1 Nov 2025 09:46:44 +0900
Subject: [PATCH 04/10] remove all to_string

---
 llvm/utils/lit/lit/builtin_commands/diff.py |  6 ++--
 llvm/utils/lit/lit/formats/googletest.py    |  2 +-
 llvm/utils/lit/lit/llvm/config.py           |  6 ++--
 llvm/utils/lit/lit/util.py                  | 39 ---------------------
 4 files changed, 6 insertions(+), 47 deletions(-)

diff --git a/llvm/utils/lit/lit/builtin_commands/diff.py b/llvm/utils/lit/lit/builtin_commands/diff.py
index f2b5869b35889..a32a31d50ada8 100644
--- a/llvm/utils/lit/lit/builtin_commands/diff.py
+++ b/llvm/utils/lit/lit/builtin_commands/diff.py
@@ -8,7 +8,6 @@
 import sys
 
 import util
-from util import to_string
 
 
 class DiffFlags:
@@ -67,10 +66,9 @@ def compareTwoBinaryFiles(flags, filepaths, filelines):
         filepaths[1].encode(),
         n=flags.num_context_lines,
     )
-    diffs = [diff.decode(errors="backslashreplace") for diff in diffs]
 
     for diff in diffs:
-        sys.stdout.write(to_string(diff))
+        sys.stdout.write(diff.decode(errors="backslashreplace"))
         exitCode = 1
     return exitCode
 
@@ -117,7 +115,7 @@ def compose2(f, g):
         filepaths[1],
         n=flags.num_context_lines,
     ):
-        sys.stdout.write(to_string(diff))
+        sys.stdout.write(diff)
         exitCode = 1
     return exitCode
 
diff --git a/llvm/utils/lit/lit/formats/googletest.py b/llvm/utils/lit/lit/formats/googletest.py
index 172cd0beee4a1..d7fb25c983a65 100644
--- a/llvm/utils/lit/lit/formats/googletest.py
+++ b/llvm/utils/lit/lit/formats/googletest.py
@@ -43,7 +43,7 @@ def get_num_tests(self, path, litConfig, localConfig):
             return None
         return sum(
             map(
-                lambda line: lit.util.to_string(line).startswith("  "),
+                lambda line: line.decode("utf-8", errors="replace").startswith("  "),
                 out.splitlines(False),
             )
         )
diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py
index 913ba69d63328..497009848b563 100644
--- a/llvm/utils/lit/lit/llvm/config.py
+++ b/llvm/utils/lit/lit/llvm/config.py
@@ -223,7 +223,7 @@ def _find_git_windows_unix_tools(self, tools_needed):
                         continue
 
                     # We found it, stop enumerating.
-                    return lit.util.to_string(candidate_path)
+                    return candidate_path
             except:
                 continue
 
@@ -284,8 +284,8 @@ def get_process_output(self, command):
                 env=self.config.environment,
             )
             stdout, stderr = cmd.communicate()
-            stdout = lit.util.to_string(stdout)
-            stderr = lit.util.to_string(stderr)
+            stdout = stdout.decode("utf-8", errors="replace")
+            stderr = stderr.decode("utf-8", errors="replace")
             return (stdout, stderr)
         except OSError:
             self.lit_config.fatal("Could not run process %s" % command)
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index ee5823ec0aa27..341f7268f7ea5 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -50,45 +50,6 @@ def to_bytes(s):
     return s.encode("utf-8")
 
 
-def to_string(b):
-    """Return the parameter as type 'str', possibly encoding it.
-
-    In Python2, the 'str' type is the same as 'bytes'. In Python3, the
-    'str' type is (essentially) Python2's 'unicode' type, and 'bytes' is
-    distinct.
-
-    """
-    if isinstance(b, str):
-        # In Python2, this branch is taken for types 'str' and 'bytes'.
-        # In Python3, this branch is taken only for 'str'.
-        return b
-    if isinstance(b, bytes):
-        # In Python2, this branch is never taken ('bytes' is handled as 'str').
-        # In Python3, this is true only for 'bytes'.
-        try:
-            return b.decode("utf-8")
-        except UnicodeDecodeError:
-            # If the value is not valid Unicode, return the default
-            # repr-line encoding.
-            return str(b)
-
-    # By this point, here's what we *don't* have:
-    #
-    #  - In Python2:
-    #    - 'str' or 'bytes' (1st branch above)
-    #  - In Python3:
-    #    - 'str' (1st branch above)
-    #    - 'bytes' (2nd branch above)
-    #
-    # The last type we might expect is the Python2 'unicode' type. There is no
-    # 'unicode' type in Python3 (all the Python3 cases were already handled). In
-    # order to get a 'str' object, we need to encode the 'unicode' object.
-    try:
-        return b.encode("utf-8")
-    except AttributeError:
-        raise TypeError("not sure how to convert %s to %s" % (type(b), str))
-
-
 def usable_core_count():
     """Return the number of cores the current process can use, if supported.
     Otherwise, return the total number of cores (like `os.cpu_count()`).

>From 347435283d81c3714cd665777dde261c52d23978 Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Sat, 1 Nov 2025 09:56:38 +0900
Subject: [PATCH 05/10] remove to_bytes

---
 llvm/utils/lit/lit/TestRunner.py | 25 +++++++------------------
 llvm/utils/lit/lit/util.py       | 17 -----------------
 2 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 2cbceae141f89..f0fb5ba30ca4b 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -21,7 +21,6 @@
 import lit.ShUtil as ShUtil
 import lit.Test as Test
 import lit.util
-from lit.util import to_bytes
 from lit.BooleanExpression import BooleanExpression
 
 
@@ -1419,19 +1418,11 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
     (line_number, command_type, line).
     """
 
-    # This code is carefully written to be dual compatible with Python 2.5+ and
-    # Python 3 without requiring input files to always have valid codings. The
-    # trick we use is to open the file in binary mode and use the regular
-    # expression library to find the commands, with it scanning strings in
-    # Python2 and bytes in Python3.
-    #
-    # Once we find a match, we do require each script line to be decodable to
-    # UTF-8, so we convert the outputs to UTF-8 before returning. This way the
-    # remaining code can work with "strings" agnostic of the executing Python
-    # version.
+    # We use `bytes` for scanning input files to avoid requiring them to always
+    # have valid codings.
 
     keywords_re = re.compile(
-        to_bytes("(%s)(.*)\n" % ("|".join(re.escape(k) for k in keywords),))
+        b"(%s)(.*)\n" % (b"|".join(re.escape(k.encode("utf-8")) for k in keywords),)
     )
 
     f = open(source_path, "rb")
@@ -1440,8 +1431,8 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
         data = f.read()
 
         # Ensure the data ends with a newline.
-        if not data.endswith(to_bytes("\n")):
-            data = data + to_bytes("\n")
+        if not data.endswith(b"\n"):
+            data = data + b"\n"
 
         # Iterate over the matches.
         line_number = 1
@@ -1451,14 +1442,12 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
             # newlines.
             match_position = match.start()
             line_number += data.count(
-                to_bytes("\n"), last_match_position, match_position
+                b"\n", last_match_position, match_position
             )
             last_match_position = match_position
 
             # Convert the keyword and line to UTF-8 strings and yield the
-            # command. Note that we take care to return regular strings in
-            # Python 2, to avoid other code having to differentiate between the
-            # str and unicode types.
+            # command.
             #
             # Opening the file in binary mode prevented Windows \r newline
             # characters from being converted to Unix \n newlines, so manually
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index 341f7268f7ea5..7815c361a8b7d 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -33,23 +33,6 @@ def make_word_regex(word):
     return r"\b" + word + r"\b"
 
 
-def to_bytes(s):
-    """Return the parameter as type 'bytes', possibly encoding it.
-
-    In Python2, the 'bytes' type is the same as 'str'. In Python3, they
-    are distinct.
-
-    """
-    if isinstance(s, bytes):
-        # In Python2, this branch is taken for both 'str' and 'bytes'.
-        # In Python3, this branch is taken only for 'bytes'.
-        return s
-    # In Python2, 's' is a 'unicode' object.
-    # In Python3, 's' is a 'str' object.
-    # Encode to UTF-8 to get 'bytes' data.
-    return s.encode("utf-8")
-
-
 def usable_core_count():
     """Return the number of cores the current process can use, if supported.
     Otherwise, return the total number of cores (like `os.cpu_count()`).

>From f05fa2d592a6233482bf37f5a4ed75d96cbd30eb Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Sat, 1 Nov 2025 10:29:35 +0900
Subject: [PATCH 06/10] format

---
 llvm/utils/lit/lit/TestRunner.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index f0fb5ba30ca4b..468c3276cc1ce 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -397,7 +397,7 @@ def executeBuiltinEcho(cmd, shenv):
         # Reopen stdout with specifying `newline` to avoid CRLF translation.
         # The versions of echo we are replacing on Windows all emit plain LF,
         # and the LLVM tests now depend on this.
-        stdout = open(stdout.name, stdout.mode, newline='')
+        stdout = open(stdout.name, stdout.mode, newline="")
         opened_files.append((None, None, stdout, None))
 
     # Implement echo flags. We only support -e and -n, and not yet in
@@ -1441,9 +1441,7 @@ def parseIntegratedTestScriptCommands(source_path, keywords):
             # Compute the updated line number by counting the intervening
             # newlines.
             match_position = match.start()
-            line_number += data.count(
-                b"\n", last_match_position, match_position
-            )
+            line_number += data.count(b"\n", last_match_position, match_position)
             last_match_position = match_position
 
             # Convert the keyword and line to UTF-8 strings and yield the

>From dc3f0c747f2722957e71d2c1349cb3ad8d2a142f Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Sat, 1 Nov 2025 11:03:16 +0900
Subject: [PATCH 07/10] replace a use in libcxx

---
 libcxx/test/selftest/dsl/lit.local.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/selftest/dsl/lit.local.cfg b/libcxx/test/selftest/dsl/lit.local.cfg
index dc6887ad7e48b..73e1c4db9ca4e 100644
--- a/libcxx/test/selftest/dsl/lit.local.cfg
+++ b/libcxx/test/selftest/dsl/lit.local.cfg
@@ -10,6 +10,6 @@
 # within the test.
 import base64, lit.util, pickle
 
-base64Encode = lambda s: lit.util.to_string(base64.b64encode(lit.util.to_bytes(s)))
+base64Encode = lambda s: base64.b64encode(s).decode("utf-8")
 escapedSubstitutions = base64Encode(pickle.dumps(config.substitutions))
 config.substitutions.append(("%{substitutions}", escapedSubstitutions))

>From 10de4e1ae5ab7db632f3a6a570447e6fc588727d Mon Sep 17 00:00:00 2001
From: kikairoya <kikairoya at gmail.com>
Date: Sat, 1 Nov 2025 13:08:36 +0900
Subject: [PATCH 08/10] set encoding for open()

---
 llvm/utils/lit/lit/TestRunner.py | 32 ++++++++++++--------------------
 llvm/utils/lit/lit/reports.py    |  4 ++--
 2 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 468c3276cc1ce..2220354e3fa42 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -397,7 +397,7 @@ def executeBuiltinEcho(cmd, shenv):
         # Reopen stdout with specifying `newline` to avoid CRLF translation.
         # The versions of echo we are replacing on Windows all emit plain LF,
         # and the LLVM tests now depend on this.
-        stdout = open(stdout.name, stdout.mode, newline="")
+        stdout = open(stdout.name, stdout.mode, encoding="utf-8", newline="")
         opened_files.append((None, None, stdout, None))
 
     # Implement echo flags. We only support -e and -n, and not yet in
@@ -695,7 +695,7 @@ def processRedirects(cmd, stdin_source, cmd_shenv, opened_files):
         else:
             # Make sure relative paths are relative to the cwd.
             redir_filename = os.path.join(cmd_shenv.cwd, name)
-            fd = open(redir_filename, mode)
+            fd = open(redir_filename, mode, encoding="utf-8")
         # Workaround a Win32 and/or subprocess bug when appending.
         #
         # FIXME: Actually, this is probably an instance of PR6753.
@@ -1311,13 +1311,6 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
         script += ".bat"
 
     # Write script file
-    mode = "w"
-    open_kwargs = {}
-    if litConfig.isWindows and not isWin32CMDEXE:
-        mode += "b"  # Avoid CRLFs when writing bash scripts.
-    else:
-        open_kwargs["encoding"] = "utf-8"
-    f = open(script, mode, **open_kwargs)
     if isWin32CMDEXE:
         for i, ln in enumerate(commands):
             match = re.fullmatch(kPdbgRegex, ln)
@@ -1326,8 +1319,9 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
                 commands[i] = match.expand(
                     "echo '\\1' > nul && " if command else "echo '\\1' > nul"
                 )
-        f.write("@echo on\n")
-        f.write("\n at if %ERRORLEVEL% NEQ 0 EXIT\n".join(commands))
+        with open(script, "w", encoding="utf-8") as f:
+            f.write("@echo on\n")
+            f.write("\n at if %ERRORLEVEL% NEQ 0 EXIT\n".join(commands))
     else:
         for i, ln in enumerate(commands):
             match = re.fullmatch(kPdbgRegex, ln)
@@ -1366,8 +1360,6 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
                 # seen the latter manage to terminate the shell running lit.
                 if command:
                     commands[i] += f" && {{ {command}; }}"
-        if test.config.pipefail:
-            f.write(b"set -o pipefail;" if mode == "wb" else "set -o pipefail;")
 
         # Manually export any DYLD_* variables used by dyld on macOS because
         # otherwise they are lost when the shell executable is run, before the
@@ -1377,14 +1369,14 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
             for k, v in test.config.environment.items()
             if k.startswith("DYLD_")
         )
-        f.write(bytes(env_str, "utf-8") if mode == "wb" else env_str)
-        f.write(b"set -x;" if mode == "wb" else "set -x;")
-        if mode == "wb":
-            f.write(bytes("{ " + "; } &&\n{ ".join(commands) + "; }", "utf-8"))
-        else:
+
+        with open(script, "w", encoding="utf-8", newline="") as f:
+            if test.config.pipefail:
+                f.write("set -o pipefail;")
+            f.write(env_str)
+            f.write("set -x;")
             f.write("{ " + "; } &&\n{ ".join(commands) + "; }")
-    f.write(b"\n" if mode == "wb" else "\n")
-    f.close()
+            f.write("\n")
 
     if isWin32CMDEXE:
         command = ["cmd", "/c", script]
diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py
index 1b43ab9357b37..6f8a782a40aa8 100755
--- a/llvm/utils/lit/lit/reports.py
+++ b/llvm/utils/lit/lit/reports.py
@@ -29,10 +29,10 @@ def write_results(self, tests, elapsed):
             fd, _ = tempfile.mkstemp(
                 suffix=ext, prefix=f"{filename}.", dir=os.path.dirname(self.output_file)
             )
-            report_file = os.fdopen(fd, "w")
+            report_file = os.fdopen(fd, "w", encoding="utf-8")
         else:
             # Overwrite if the results already exist.
-            report_file = open(self.output_file, "w")
+            report_file = open(self.output_file, "w", encoding="utf-8")
 
         with report_file:
             self._write_results_to_file(tests, elapsed, report_file)

>From 8da5b76ef9bda87dd7a91bf49a58a646053f5257 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya at gmail.com>
Date: Tue, 4 Nov 2025 21:59:27 +0900
Subject: [PATCH 09/10] Update llvm/utils/lit/lit/formats/googletest.py

Co-authored-by: Alexander Richardson <mail at alexrichardson.me>
---
 llvm/utils/lit/lit/formats/googletest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/lit/formats/googletest.py b/llvm/utils/lit/lit/formats/googletest.py
index d7fb25c983a65..01820da38c954 100644
--- a/llvm/utils/lit/lit/formats/googletest.py
+++ b/llvm/utils/lit/lit/formats/googletest.py
@@ -43,7 +43,7 @@ def get_num_tests(self, path, litConfig, localConfig):
             return None
         return sum(
             map(
-                lambda line: line.decode("utf-8", errors="replace").startswith("  "),
+                lambda line: line.startswith(b"  "),
                 out.splitlines(False),
             )
         )

>From 6c1476958113934c54e7ca65eb10bdd5e1621a86 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya at gmail.com>
Date: Wed, 5 Nov 2025 06:38:56 +0900
Subject: [PATCH 10/10] Update llvm/utils/lit/lit/TestRunner.py

Co-authored-by: Alexander Richardson <mail at alexrichardson.me>
---
 llvm/utils/lit/lit/TestRunner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index f0d3cb0828622..1b0853b772ea9 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -394,7 +394,7 @@ def executeBuiltinEcho(cmd, shenv):
         is_redirected = False
         stdout = StringIO()
     elif kIsWindows:
-        # Reopen stdout with specifying `newline` to avoid CRLF translation.
+        # Reopen stdout with `newline=""` to avoid CRLF translation.
         # The versions of echo we are replacing on Windows all emit plain LF,
         # and the LLVM tests now depend on this.
         stdout = open(stdout.name, stdout.mode, encoding="utf-8", newline="")



More information about the libcxx-commits mailing list