[llvm] r306643 - [lit] Re-apply: Fix some convoluted logic around Unicode encoding, and de-duplicate across modules that used it.

Wed Jun 28 21:37:35 PDT 2017

Author: dlj
Date: Wed Jun 28 21:37:35 2017
New Revision: 306643

URL: http://llvm.org/viewvc/llvm-project?rev=306643&view=rev
Log:
[lit] Re-apply: Fix some convoluted logic around Unicode encoding, and de-duplicate across modules that used it.

(Take 2: this patch re-applies r306625, which was reverted in r306629. This
patch includes only trivial fixes.)

In Python2 and Python3, the various (non-)?Unicode string types are sort of
spaghetti. Python2 has unicode support tacked on via the 'unicode' type, which
is distinct from 'str' (which are bytes). Python3 takes the "unicode-everywhere"
approach, with 'str' representing a Unicode string.

Both have a 'bytes' type. In Python3, it is the only way to represent raw bytes.
However, in Python2, 'bytes' is an alias for 'str'. This leads to interesting
problems when an interface requires a precise type, but has to run under both
Python2 and Python3.

The previous logic appeared to be correct in all cases, but went through more
layers of indirection than necessary. This change does the necessary conversions
in one shot, with documentation about which paths might be taken in Python2 or
Python3.

Changes from r306625: some tests just print binary outputs, so in those cases,
fall back to str() in Python3. For googletests, add one missing call to
to_string().

(Tested by verifying the visible breakage with Python3. Verified that everything
works in py2 and py3.)

Modified:
    llvm/trunk/utils/lit/lit/formats/googletest.py
    llvm/trunk/utils/lit/lit/util.py

Modified: llvm/trunk/utils/lit/lit/formats/googletest.py
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/lit/lit/formats/googletest.py?rev=306643&r1=306642&r2=306643&view=diff
==============================================================================

--- llvm/trunk/utils/lit/lit/formats/googletest.py (original)
+++ llvm/trunk/utils/lit/lit/formats/googletest.py Wed Jun 28 21:37:35 2017
@@ -30,19 +30,24 @@ class GoogleTest(TestFormat):
           localConfig: TestingConfig instance"""
 
         try:
-            lines = lit.util.capture([path, '--gtest_list_tests'],
-                                     env=localConfig.environment)
-            if kIsWindows:
-              lines = lines.replace('\r', '')
-            lines = lines.split('\n')
-        except Exception as exc:
-            out = exc.output if isinstance(exc, subprocess.CalledProcessError) else ''
-            litConfig.warning("unable to discover google-tests in %r: %s. Process output: %s"
-                              % (path, sys.exc_info()[1], out))
+            output = subprocess.check_output([path, '--gtest_list_tests'],
+                                             env=localConfig.environment)
+        except subprocess.CalledProcessError as exc:
+            litConfig.warning(
+                "unable to discover google-tests in %r: %s. Process output: %s"
+                % (path, sys.exc_info()[1], exc.output))
             raise StopIteration
 
         nested_tests = []
-        for ln in lines:
+        for ln in output.splitlines(False):  # Don't keep newlines.
+            ln = lit.util.to_string(ln)
+
+            if 'Running main() from gtest_main.cc' in ln:
+                # Upstream googletest prints this to stdout prior to running
+                # tests. LLVM removed that print statement in r61540, but we
+                # handle it here in case upstream googletest is being used.
+                continue
+
             # The test name list includes trailing comments beginning with
             # a '#' on some lines, so skip those. We don't support test names
             # that use escaping to embed '#' into their name as the names come
@@ -52,12 +57,6 @@ class GoogleTest(TestFormat):
             if not ln.lstrip():
                 continue
 
-            if 'Running main() from gtest_main.cc' in ln:
-                # Upstream googletest prints this to stdout prior to running
-                # tests. LLVM removed that print statement in r61540, but we
-                # handle it here in case upstream googletest is being used.
-                continue
-
             index = 0
             while ln[index*2:index*2+2] == '  ':
                 index += 1

Modified: llvm/trunk/utils/lit/lit/util.py
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/lit/lit/util.py?rev=306643&r1=306642&r2=306643&view=diff
==============================================================================
--- llvm/trunk/utils/lit/lit/util.py (original)
+++ llvm/trunk/utils/lit/lit/util.py Wed Jun 28 21:37:35 2017
@@ -8,24 +8,57 @@ import subprocess
 import sys
 import threading
 
-def to_bytes(str):
-    # Encode to UTF-8 to get binary data.
-    if isinstance(str, bytes):
-        return str
-    return str.encode('utf-8')
-
-def to_string(bytes):
-    if isinstance(bytes, str):
-        return bytes
-    return to_bytes(bytes)
+def to_bytes(s):
+    """Return the parameter as type 'bytes', possibly encoding it.
 
-def convert_string(bytes):
+    In Python2, the 'bytes' type is the same as 'str'. In Python3, they are
+    distinct.
+    """
+    if isinstance(s, bytes):
+        # In Python2, this branch is taken for both 'str' and 'bytes'.
+        # In Python3, this branch is taken only for 'bytes'.
+        return s
+    # In Python2, 's' is a 'unicode' object.
+    # In Python3, 's' is a 'str' object.
+    # Encode to UTF-8 to get 'bytes' data.
+    return s.encode('utf-8')
+
+def to_string(b):
+    """Return the parameter as type 'str', possibly encoding it.
+
+    In Python2, the 'str' type is the same as 'bytes'. In Python3, the
+    'str' type is (essentially) Python2's 'unicode' type, and 'bytes' is
+    distinct.
+    """
+    if isinstance(b, str):
+        # In Python2, this branch is taken for types 'str' and 'bytes'.
+        # In Python3, this branch is taken only for 'str'.
+        return b
+    if isinstance(b, bytes):
+        # In Python2, this branch is never taken ('bytes' is handled as 'str').
+        # In Python3, this is true only for 'bytes'.
+        try:
+            return b.decode('utf-8')
+        except UnicodeDecodeError:
+            # If the value is not valid Unicode, return the default
+            # repr-line encoding.
+            return str(b)
+
+    # By this point, here's what we *don't* have:
+    #
+    #  - In Python2:
+    #    - 'str' or 'bytes' (1st branch above)
+    #  - In Python3:
+    #    - 'str' (1st branch above)
+    #    - 'bytes' (2nd branch above)
+    #
+    # The last type we might expect is the Python2 'unicode' type. There is no
+    # 'unicode' type in Python3 (all the Python3 cases were already handled). In
+    # order to get a 'str' object, we need to encode the 'unicode' object.
     try:
-        return to_string(bytes.decode('utf-8'))
-    except AttributeError: # 'str' object has no attribute 'decode'.
-        return str(bytes)
-    except UnicodeError:
-        return str(bytes)
+        return b.encode('utf-8')
+    except AttributeError:
+        raise TypeError('not sure how to convert %s to %s' % (type(b), str))
 
 def detectCPUs():
     """
@@ -39,7 +72,8 @@ def detectCPUs():
             if isinstance(ncpus, int) and ncpus > 0:
                 return ncpus
         else: # OSX:
-            return int(capture(['sysctl', '-n', 'hw.ncpu']))
+            return int(subprocess.check_output(['sysctl', '-n', 'hw.ncpu'],
+                                               stderr=subprocess.STDOUT))
     # Windows:
     if "NUMBER_OF_PROCESSORS" in os.environ:
         ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
@@ -67,21 +101,6 @@ def mkdir_p(path):
         if e.errno != errno.EEXIST:
             raise
 
-def capture(args, env=None):
-    """capture(command) - Run the given command (or argv list) in a shell and
-    return the standard output. Raises a CalledProcessError if the command
-    exits with a non-zero status."""
-    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                         env=env)
-    out, err = p.communicate()
-    out = convert_string(out)
-    err = convert_string(err)
-    if p.returncode != 0:
-        raise subprocess.CalledProcessError(cmd=args,
-                                            returncode=p.returncode,
-                                            output="{}\n{}".format(out, err))
-    return out
-
 def which(command, paths = None):
     """which(command, [paths]) - Look up the given command in the paths string
     (or the PATH environment variable, if unspecified)."""
@@ -233,8 +252,8 @@ def executeCommand(command, cwd=None, en
             timerObject.cancel()
 
     # Ensure the resulting output is always of string type.
-    out = convert_string(out)
-    err = convert_string(err)
+    out = to_string(out)
+    err = to_string(err)
 
     if hitTimeOut[0]:
         raise ExecuteCommandTimeoutException(