[test-suite] r285783 - Add utils/tdiff.py utility
Matthias Braun via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 1 18:44:37 PDT 2016
Author: matze
Date: Tue Nov 1 20:44:36 2016
New Revision: 285783
URL: http://llvm.org/viewvc/llvm-project?rev=285783&view=rev
Log:
Add utils/tdiff.py utility
This helper script scans ninja build files to create lists of
source/assembly/object/statistics files involved in building a target.
Its main use, however, is a built-in diff mode that invokes the diff
tool to compare the contents of such a file list between two build
directories.
Also rename "util" to "utils" to be consistent with llvm, and because
we now have more than one utility.
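For context: the script is built around ninja's query tool, which
prints a target's rule, inputs and outputs in textual form; tdiff.py
parses that text to walk the build graph. A minimal sketch of the
interaction (the target name 'all' is just an illustration):

    import subprocess

    # Ask ninja for the dependency record of one target. tdiff.py feeds
    # this text to its parse() function.
    out = subprocess.check_output(['ninja', '-t', 'query', 'all'],
                                  universal_newlines=True)
    print(out)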
Added:
test-suite/trunk/utils/
test-suite/trunk/utils/compare.py
- copied, changed from r285245, test-suite/trunk/util/compare.py
test-suite/trunk/utils/tdiff.py (with props)
Removed:
test-suite/trunk/util/compare.py
Removed: test-suite/trunk/util/compare.py
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/util/compare.py?rev=285782&view=auto
==============================================================================
--- test-suite/trunk/util/compare.py (original)
+++ test-suite/trunk/util/compare.py (removed)
@@ -1,328 +0,0 @@
-#!/usr/bin/env python2.7
-"""Tool to filter, organize, compare and display benchmarking results. Usefull
-for smaller datasets. It works great with a few dozen runs it is not designed to
-deal with hundreds.
-Requires the pandas library to be installed."""
-import pandas as pd
-import sys
-import os.path
-import re
-import numbers
-import argparse
-
-def read_lit_json(filename):
- import json
- jsondata = json.load(open(filename))
- testnames = []
- columns = []
- columnindexes = {}
- info_columns = ['hash']
- for test in jsondata['tests']:
- if "name" not in test:
- print "Skipping unnamed test!"
- continue
- if "metrics" not in test:
-            print "Warning: '%s' has no metrics!" % test['name']
- continue
- for name in test["metrics"].keys():
- if name not in columnindexes:
- columnindexes[name] = len(columns)
- columns.append(name)
- for name in test.keys():
- if name not in columnindexes and name in info_columns:
- columnindexes[name] = len(columns)
- columns.append(name)
-
- nan = float('NaN')
- data = []
- for test in jsondata['tests']:
- if "name" not in test:
- print "Skipping unnamed test!"
- continue
- name = test['name']
- if 'shortname' in test:
- name = test['shortname']
- testnames.append(name)
-
- datarow = [nan] * len(columns)
- if "metrics" in test:
- for (metricname, value) in test['metrics'].iteritems():
- datarow[columnindexes[metricname]] = value
- for (name, value) in test.iteritems():
- index = columnindexes.get(name)
- if index is not None:
- datarow[index] = test[name]
- data.append(datarow)
- index = pd.Index(testnames, name='Program')
- return pd.DataFrame(data=data, index=index, columns=columns)
-
-def read_report_simple_csv(filename):
- return pd.read_csv(filename, na_values=['*'], index_col=0, header=0)
-
-def read(name):
- if name.endswith(".json"):
- return read_lit_json(name)
- if name.endswith(".csv"):
- return read_report_simple_csv(name)
-    raise Exception("Cannot determine file format")
-
-def readmulti(filenames):
- # Read datasets
- datasetnames = []
- datasets = []
- prev_index = None
- for filename in filenames:
- data = read(filename)
- name = os.path.basename(filename)
- # drop .json/.csv suffix; TODO: Should we rather do this in the printing
- # logic?
- for ext in ['.csv', '.json']:
- if name.endswith(ext):
- name = name[:-len(ext)]
- datasets.append(data)
- suffix = ""
- count = 0
- while True:
- if name+suffix not in datasetnames:
- break
- suffix = str(count)
-            count += 1
-
- datasetnames.append(name+suffix)
- # Warn if index names are different
- if prev_index is not None and prev_index.name != data.index.name:
- sys.stderr.write("Warning: Mismatched index names: '%s' vs '%s'\n"
- % (prev_index.name, data.index.name))
- prev_index = data.index
- # Merge datasets
- d = pd.concat(datasets, axis=0, names=['run'], keys=datasetnames)
- return d
-
-def add_diff_column(d, absolute_diff=False):
- values = d.unstack(level=0)
-
- has_two_runs = d.index.get_level_values(0).nunique() == 2
- if has_two_runs:
- values0 = values.iloc[:,0]
- values1 = values.iloc[:,1]
- else:
- values0 = values.min(axis=1)
- values1 = values.max(axis=1)
-
- # Quotient or absolute difference?
- if absolute_diff:
- values['diff'] = values1 - values0
- else:
- values['diff'] = values1 / values0
- values['diff'] -= 1.0
- # unstack() gave us a complicated multiindex for the columns, simplify
- # things by renaming to a simple index.
- values.columns = [(c[1] if c[1] else c[0]) for c in values.columns.values]
- return values
-
-def filter_failed(data, key='Exec'):
- return data.loc[data[key] == "pass"]
-
-def filter_short(data, key='Exec_Time', threshold=0.6):
- return data.loc[data[key] >= threshold]
-
-def filter_same_hash(data, key='hash'):
- assert key in data.columns
- assert data.index.get_level_values(0).nunique() > 1
-
- return data.groupby(level=1).filter(lambda x: x[key].nunique() != 1)
-
-def filter_blacklist(data, blacklist):
- return data.loc[~(data.index.get_level_values(1).isin(blacklist))]
-
-def print_filter_stats(reason, before, after):
- n_before = len(before.groupby(level=1))
- n_after = len(after.groupby(level=1))
- n_filtered = n_before - n_after
- if n_filtered != 0:
- print "%s: %s (filtered out)" % (reason, n_filtered)
-
-# Truncate a string to a maximum length by keeping a prefix, a suffix and ...
-# in the middle
-def truncate(string, prefix_len, suffix_len):
- return re.sub("^(.{%d}).*(.{%d})$" % (prefix_len, suffix_len),
- "\g<1>...\g<2>", string)
-
-# Search for common prefixes and suffixes in a list of names and return
-# a (prefix,suffix) tuple that specifies how many characters can be dropped
-# for the prefix/suffix. The numbers will be small enough that no name will
-# become shorter than min_len characters.
-def determine_common_prefix_suffix(names, min_len=8):
- if len(names) <= 1:
- return (0,0)
- name0 = names[0]
- prefix = name0
- prefix_len = len(name0)
- suffix = name0
- suffix_len = len(name0)
- shortest_name = len(name0)
- for name in names:
- if len(name) < shortest_name:
- shortest_name = len(name)
- while prefix_len > 0 and name[:prefix_len] != prefix:
- prefix_len -= 1
- prefix = name0[:prefix_len]
- while suffix_len > 0 and name[-suffix_len:] != suffix:
- suffix_len -= 1
- suffix = name0[-suffix_len:]
-
- if suffix[0] != '.' and suffix[0] != '_':
- suffix_len = 0
- suffix_len = max(0, min(shortest_name - prefix_len - min_len, suffix_len))
- prefix_len = max(0, min(shortest_name - suffix_len, prefix_len))
- return (prefix_len, suffix_len)
-
-def format_diff(value):
- if not isinstance(value, numbers.Integral):
- return "%4.1f%%" % (value * 100.)
- else:
- return "%-5d" % value
-
-def print_result(d, limit_output=True, shorten_names=True,
- show_diff_column=True, sortkey='diff'):
- # sort (TODO: is there a more elegant way than create+drop a column?)
- d['$sortkey'] = d[sortkey].abs()
- d = d.sort_values("$sortkey", ascending=False)
- del d['$sortkey']
- if not show_diff_column:
- del d['diff']
- dataout = d
- if limit_output:
- # Take 15 topmost elements
- dataout = dataout.head(15)
-
- # Turn index into a column so we can format it...
- dataout.insert(0, 'Program', dataout.index)
-
- formatters = dict()
- formatters['diff'] = format_diff
- if shorten_names:
- drop_prefix, drop_suffix = determine_common_prefix_suffix(dataout.Program)
-        formatters['Program'] = lambda x: "%-45s" % truncate(x[drop_prefix:len(x)-drop_suffix], 10, 30)
- # TODO: it would be cool to drop prefixes/suffix common to all
- # names
- float_format = lambda x: "%6.2f" % (x,)
- pd.set_option("display.max_colwidth", 0)
- out = dataout.to_string(index=False, justify='left',
- float_format=float_format, formatters=formatters)
- print out
- print d.describe()
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(prog='compare.py')
- parser.add_argument('-a', '--all', action='store_true')
- parser.add_argument('-f', '--full', action='store_true')
- parser.add_argument('-m', '--metric', action='append', dest='metrics',
- default=[])
- parser.add_argument('--nodiff', action='store_false', dest='show_diff',
- default=None)
- parser.add_argument('--diff', action='store_true', dest='show_diff')
- parser.add_argument('--filter-short', action='store_true',
- dest='filter_short')
- parser.add_argument('--no-filter-failed', action='store_false',
- dest='filter_failed', default=True)
- parser.add_argument('--filter-hash', action='store_true',
- dest='filter_hash', default=False)
- parser.add_argument('--filter-blacklist',
- dest='filter_blacklist', default=None)
- parser.add_argument('--merge-average', action='store_const',
- dest='merge_function', const=pd.DataFrame.mean,
- default=pd.DataFrame.min)
- parser.add_argument('--merge-min', action='store_const',
- dest='merge_function', const=pd.DataFrame.min)
- parser.add_argument('--merge-max', action='store_const',
- dest='merge_function', const=pd.DataFrame.max)
- parser.add_argument('files', metavar='FILE', nargs='+')
- config = parser.parse_args()
-
- if config.show_diff is None:
- config.show_diff = len(config.files) > 1
-
- # Read inputs
- files = config.files
- if "vs" in files:
- split = files.index("vs")
- lhs = files[0:split]
- rhs = files[split+1:]
-
- # Filter minimum of lhs and rhs
- lhs_d = readmulti(lhs)
- lhs_merged = config.merge_function(lhs_d, level=1)
- rhs_d = readmulti(rhs)
- rhs_merged = config.merge_function(rhs_d, level=1)
-
- # Combine to new dataframe
- data = pd.concat([lhs_merged, rhs_merged], names=['l/r'], keys=['lhs', 'rhs'])
- else:
- data = readmulti(files)
-
- # Decide which metric to display / what is our "main" metric
- metrics = config.metrics
- if len(metrics) == 0:
- defaults = [ 'Exec_Time', 'exec_time', 'Value', 'Runtime' ]
- for defkey in defaults:
- if defkey in data.columns:
- metrics = [defkey]
- break
- if len(metrics) == 0:
- sys.stderr.write("No default metric found and none specified\n")
- sys.stderr.write("Available metrics:\n")
- for column in data.columns:
- sys.stderr.write("\t%s\n" % column)
- sys.exit(1)
- for metric in metrics:
- problem = False
- if metric not in data.columns:
- sys.stderr.write("Unknown metric '%s'\n" % metric)
- problem = True
- if problem:
- sys.exit(1)
-
- # Filter data
- proggroup = data.groupby(level=1)
- initial_size = len(proggroup.indices)
- print "Tests: %s" % (initial_size,)
- if config.filter_failed and hasattr(data, 'Exec'):
- newdata = filter_failed(data)
- print_filter_stats("Failed", data, newdata)
- newdata = newdata.drop('Exec', 1)
- data = newdata
- if config.filter_short:
- newdata = filter_short(data, metric)
- print_filter_stats("Short Running", data, newdata)
- data = newdata
- if config.filter_hash and 'hash' in data.columns and \
- data.index.get_level_values(0).nunique() > 1:
- newdata = filter_same_hash(data)
- print_filter_stats("Same hash", data, newdata)
- data = newdata
- if config.filter_blacklist:
- blacklist = open(config.filter_blacklist).readlines()
- blacklist = [line.strip() for line in blacklist]
- newdata = filter_blacklist(data, blacklist)
- print_filter_stats("In Blacklist", data, newdata)
- data = newdata
- final_size = len(data.groupby(level=1))
- if final_size != initial_size:
- print "Remaining: %d" % (final_size,)
-
- # Reduce / add columns
-    print "Metric: %s" % ", ".join(metrics)
-    if len(metrics) > 0:
-        data = data[metrics]
- data = add_diff_column(data)
-
- sortkey = 'diff'
- if len(config.files) == 1:
- sortkey = data.columns[0]
-
- # Print data
- print ""
- shorten_names = not config.full
- limit_output = (not config.all) and (not config.full)
- print_result(data, limit_output, shorten_names, config.show_diff, sortkey)
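For readers skimming the removed script: its central technique, which
the copied version keeps, is pandas' unstack() to place runs side by
side before computing a relative difference (see add_diff_column
above). A minimal, self-contained sketch with made-up run and program
names:

    import pandas as pd

    # Two runs of two programs, indexed by (run, Program) the way
    # compare.py builds its frame with pd.concat.
    idx = pd.MultiIndex.from_product(
        [['before', 'after'], ['prog0', 'prog1']], names=['run', 'Program'])
    d = pd.Series([1.0, 2.0, 1.1, 1.8], index=idx, name='Exec_Time')

    values = d.unstack(level=0)  # one column per run
    values['diff'] = values['after'] / values['before'] - 1.0
    print(values)                # prog0: +10%, prog1: -10%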
Copied: test-suite/trunk/utils/compare.py (from r285245, test-suite/trunk/util/compare.py)
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/utils/compare.py?p2=test-suite/trunk/utils/compare.py&p1=test-suite/trunk/util/compare.py&r1=285245&r2=285783&rev=285783&view=diff
==============================================================================
(empty)
Added: test-suite/trunk/utils/tdiff.py
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/utils/tdiff.py?rev=285783&view=auto
==============================================================================
--- test-suite/trunk/utils/tdiff.py (added)
+++ test-suite/trunk/utils/tdiff.py Tue Nov 1 20:44:36 2016
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+#
+# This tool queries a ninja build file for the test-suite to figure out
+# details about the build, such as the source files involved in a target or
+# the assembly files emitted when clang is invoked with -save-temps=obj.
+# It also has a mode that, given two build directories, invokes the diff
+# tool on each pair of corresponding files.
+#
+# Examples:
+#
+# List .stats files for the build in the current directory (assuming
+# -save-stats=obj in CFLAGS):
+# $ tdiff.py --stats all
+#
+# Compare assembly files of the 176.gcc benchmark between two test-suite build
+# directories (assuming -save-temps=obj in CFLAGS):
+# $ tdiff.py -a path/dir_before -b path/dir_after --s_files 176.gcc | less
+#
+# Ninja query code based on ninja/src/browse.py (Apache License, version 2.0).
+import sys
+import subprocess
+import argparse
+import os
+from collections import namedtuple
+
+
+Node = namedtuple('Node', ['inputs', 'rule', 'target', 'outputs'])
+
+
+def match_strip(line, prefix):
+ if not line.startswith(prefix):
+ return (False, line)
+ return (True, line[len(prefix):])
+
+
+def parse(text):
+ text = text.strip()
+ lines = iter(text.split('\n'))
+
+    rule = None
+
+ try:
+ line = None
+ while True:
+            target = None
+            inputs = []
+            outputs = []
+ if line is None:
+ line = next(lines)
+ target = line[:-1] # strip trailing colon
+
+ line = next(lines)
+ (match, rule) = match_strip(line, ' input: ')
+ if match:
+ (match, line) = match_strip(next(lines), ' ')
+ while match:
+ type = None
+ (match, line) = match_strip(line, '| ')
+ if match:
+ type = 'implicit'
+ (match, line) = match_strip(line, '|| ')
+ if match:
+ type = 'order-only'
+ inputs.append((line, type))
+ (match, line) = match_strip(next(lines), ' ')
+
+ match, _ = match_strip(line, ' outputs:')
+ if match:
+ (match, line) = match_strip(next(lines), ' ')
+ while match:
+ outputs.append(line)
+ (match, line) = match_strip(next(lines), ' ')
+ yield Node(inputs, rule, target, outputs)
+ except StopIteration:
+ pass
+
+ if target is not None:
+ yield Node(inputs, rule, target, outputs)
+
+
+def query_ninja(targets, cwd):
+ # Query ninja for a node in its build dependency tree.
+ proc = subprocess.Popen(['ninja', '-t', 'query'] + targets, cwd=cwd,
+ stdout=subprocess.PIPE, universal_newlines=True)
+ out, _ = proc.communicate()
+ if proc.returncode != 0:
+ raise Exception("Failed to query ninja for targets: %s" % (targets,))
+ return parse(out)
+
+
+def get_inputs_rec(target, cwd):
+ worklist = [target]
+
+ result = dict()
+ while len(worklist) > 0:
+ limit = 30 # Limit number of targets to avoid argument list limits
+ querylist = []
+ for w in worklist[:limit]:
+ if w in result:
+ continue
+ querylist.append(w)
+ worklist = worklist[limit:]
+ if querylist == []:
+ break
+
+ queryres = query_ninja(querylist, cwd)
+ for res in queryres:
+ result[res.target] = res
+ for inp in res.inputs:
+ if inp[1] == 'order-only':
+ continue
+ worklist.append(inp[0])
+ return result
+
+
+def replace_ext(filename, newext):
+    # os.path.splitext() is not enough here: 'foo.c.o' must become 'foo.xxx'.
+    dirname, basename = os.path.split(filename)
+    return os.path.join(dirname, basename.split(".", 1)[0] + newext)
+
+
+def filelist(mode, target, cwd, config):
+    tree = get_inputs_rec(target, cwd)
+
+    if mode == 'sources':
+        # Take the leaves of the dependency tree
+        for target, depnode in tree.items():
+ if len(depnode.inputs) == 0:
+ yield target
+    else:
+        # Take files ending in '.o'
+        for target, depnode in tree.items():
+            if target.endswith(".o"):
+                # Determine the .s/.stats ending used by -save-temps=obj or
+                # -save-stats=obj
+                if mode == 's_files':
+                    target = replace_ext(target, '.s')
+                elif mode == 'stats':
+                    target = replace_ext(target, '.stats')
+                else:
+                    assert mode == 'objects'
+                yield target
+
+
+def diff_file(dir0, dir1, target, config):
+ u_args = ['-u']
+ if config.diff_U is not None:
+ u_args = ['-U' + config.diff_U]
+ files = ["%s/%s" % (dir0, target), "%s/%s" % (dir1, target)]
+ rescode = subprocess.call(['diff'] + u_args + files)
+ return rescode
+
+
+def main(argv):
+ parser = argparse.ArgumentParser(prog=argv[0])
+ parser.add_argument('-s', '--s_files', dest='mode', action='store_const',
+ const='s_files', help="Select assembly files")
+ parser.add_argument('-i', '--sources', dest='mode', action='store_const',
+ const='sources', help="Select source files")
+ parser.add_argument('-o', '--objects', dest='mode', action='store_const',
+ const='objects', help="Select object files")
+ parser.add_argument('-S', '--stats', dest='mode', action='store_const',
+ const='stats', help="Select statistics files")
+ parser.add_argument('-a', '--dir0', dest='dir0')
+ parser.add_argument('-b', '--dir1', dest='dir1')
+ parser.add_argument('-U', dest='diff_U')
+ parser.add_argument('target', metavar='TARGET', nargs=1)
+ config = parser.parse_args()
+ if config.mode is None:
+ parser.print_usage(sys.stderr)
+ sys.stderr.write("%s: error: Must specify a mode\n" % (argv[0], ))
+ sys.exit(1)
+ if (config.dir0 is None) != (config.dir1 is None):
+        sys.stderr.write("%s: error: Must specify dir0+dir1 (or none)\n" % (argv[0],))
+ sys.exit(1)
+
+ files = filelist(config.mode, config.target[0], config.dir0, config)
+
+ if config.dir0:
+ global_rc = 0
+ for target in files:
+ rc = diff_file(config.dir0, config.dir1, target, config)
+ if rc != 0:
+ global_rc = rc
+ sys.exit(global_rc)
+ else:
+ # Simply print file list
+ for f in files:
+ print(f)
+
+
+if __name__ == '__main__':
+ main(sys.argv)
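A note on get_inputs_rec above: it walks the dependency graph with a
batched worklist so that no single `ninja -t query` invocation receives
an over-long argument list. The same pattern in isolation, with a stub
in place of the real query so the sketch runs standalone (the graph
contents are made up):

    def stub_query(targets):
        # Stand-in for query_ninja(): maps each target to its inputs.
        graph = {'all': ['a.o', 'b.o'], 'a.o': ['a.c'], 'b.o': ['b.c'],
                 'a.c': [], 'b.c': []}
        return [(t, graph[t]) for t in targets]

    def transitive_inputs(root, batch=30):
        seen = {}
        worklist = [root]
        while worklist:
            # Query at most `batch` not-yet-seen targets per invocation.
            query = [t for t in worklist[:batch] if t not in seen]
            worklist = worklist[batch:]
            for target, inputs in stub_query(query):
                seen[target] = inputs
                worklist.extend(inputs)
        return seen

    print(sorted(transitive_inputs('all')))
    # -> ['a.c', 'a.o', 'all', 'b.c', 'b.o']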
Propchange: test-suite/trunk/utils/tdiff.py
------------------------------------------------------------------------------
svn:executable = *