[LNT][PATCH] rerun tests which the server said changed

Daniel Dunbar daniel at zuster.org
Wed Jul 23 13:33:20 PDT 2014


Comments on the main patch inline, haven't looked at the tests yet:

--

> From 08612457b3c39c29eba5aa8ada79aeeb1e1d21cd Mon Sep 17 00:00:00 2001
> From: Chris Matthews <cmatthews5 at apple.com>
> Date: Mon, 21 Jul 2014 14:53:23 -0700
> Subject: [PATCH] Add rerun flag: when passed, rerun benchmarks which the
>  server says changed
>
> ---
>  lnt/tests/nt.py | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 315 insertions(+), 4 deletions(-)
>
> diff --git a/lnt/tests/nt.py b/lnt/tests/nt.py
> index 7e292bc..4e34fb6 100644
> --- a/lnt/tests/nt.py
> +++ b/lnt/tests/nt.py
> @@ -7,18 +7,27 @@ import subprocess
>  import sys
>  import time
>  import traceback
> -
>  from datetime import datetime
> +from optparse import OptionParser, OptionGroup
> +
>
>  import lnt.testing
>  import lnt.testing.util.compilers
>  import lnt.util.ImportData as ImportData
>
> -from lnt.testing.util.commands import note, warning, error, fatal, resolve_command_path
> +from lnt.testing.util.commands import note, warning, fatal
>  from lnt.testing.util.commands import capture, mkdir_p, which
> +from lnt.testing.util.commands import resolve_command_path
> +
>  from lnt.testing.util.rcs import get_source_version
> +
>  from lnt.testing.util.misc import timestamp
>
> +from lnt.server.reporting.analysis import UNCHANGED_PASS, UNCHANGED_FAIL
> +from lnt.server.reporting.analysis import REGRESSED, IMPROVED
> +from lnt.util import ImportData
> +import builtintest
> +
>  ###
>
>  class TestModule(object):
> @@ -1033,5 +1042,61 @@ def run_test(nick_prefix, iteration, config):
>
>  ###
>
> -import builtintest
> -from optparse import OptionParser, OptionGroup
> +def _construct_report_path(basedir, only_test, test_style, file_type="csv"):
> +    """Get the full path to report files in the sandbox.
> +    """
> +    report_path = os.path.join(basedir)
> +    if only_test is not None:
> +        report_path =  os.path.join(report_path, only_test)
> +    report_path = os.path.join(report_path, ('report.%s.' % test_style) + file_type)
> +    return report_path
> +
> +
> +def rerun_test(config, name, num_times):
> +    """Take the test at name, and rerun it num_times with the previous
settings
> +    stored in config.
> +
> +    """
> +    # Extend the old log file.
> +    logfile = open(config.test_log_path(None), 'a')
> +
> +    relative_test_path = os.path.dirname(name)
> +    test_name = os.path.basename(name)
> +
> +    single_test_report = _construct_report_path(config.report_dir,
> +                                               config.only_test,
> +                                               config.test_style)
> +
> +    assert os.path.exists(single_test_report), "Previous report not there?" + \
> +        single_test_report
> +
> +    test_full_path = os.path.join(
> +        config.report_dir, relative_test_path)
> +
> +    assert os.path.exists(test_full_path), "Previous test directory not there?" + \
> +        test_full_path
> +
> +    results = []
> +    for i in xrange(0, num_times):
> +        _prepare_testsuite_for_rerun(test_name, test_full_path, config)
> +        result_path = _execute_test_again(config,
> +                                          test_name,
> +                                          test_full_path,
> +                                          logfile)
> +        results.extend(load_nt_report_file(result_path, config))
> +
> +    # Check we got an exec and status from each run.
> +    assert len(results) >= num_times, "Did not get all the runs?"
> +
> +    logfile.close()
> +    return results
> +
> +
> +def _prepare_testsuite_for_rerun(test_name, test_full_path, config):
> +    """Rerun  step 1: wipe out old files. To prepare the test suite,
> +    we must delete the correct files to convince make to rerun
> +    just this test. For executoin time runs, these are the
> +    previous execution time runs, then any reports that would hav
> +    been created by them.
> +
> +    """

Sic (execution, have)

> +    output = os.path.join(test_full_path, "Output/")
> +    # The execution timing file.
> +
> +    try:
> +        os.remove(output + test_name + ".out-" + config.test_style)
> +    except OSError:
> +        # If that does not work, try the mangled name.
> +        os.remove(output + test_name.replace("_", ".") + ".out-" +
> +                  config.test_style)

I don't like the ad hoc handling of mangling here, and this won't properly
work with tests that had a mix of actual '_' and '.' characters. I think we
should find a principled way to solve this; one way would be to have the
initial run leave behind a mapping of original names to mangled names that
could be inverted on demand.
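
For example (untested sketch; the side-car file name and helper names here
are just illustrative, not existing LNT code):

    import json
    import os

    MANGLED_NAME_MAP = "mangled_names.json"

    def save_name_mapping(report_dir, original_to_mangled):
        """Written once by the initial run: original test name -> on-disk name."""
        with open(os.path.join(report_dir, MANGLED_NAME_MAP), 'w') as f:
            json.dump(original_to_mangled, f)

    def on_disk_name(report_dir, original_name):
        """Looked up on demand when the rerun needs a test's on-disk name."""
        with open(os.path.join(report_dir, MANGLED_NAME_MAP)) as f:
            return json.load(f)[original_name]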

> +
> +    # Remove the report files.
> +    os.remove(_construct_report_path(config.report_dir,
> +                                    config.only_test,
> +                                    config.test_style))
> +
> +    os.remove(_construct_report_path(config.report_dir,
> +                                    config.only_test,
> +                                    config.test_style,
> +                                    "txt"))
> +
> +    os.remove(_construct_report_path(config.report_dir,
> +                                    config.only_test,
> +                                    config.test_style,
> +                                    "raw.out"))

I'm tempted to make this code more resilient and simpler by just removing the
entire contents of the Output directory. Do you see any good reason not to do
that? If we don't do that, I'd like to remove *.$(TEST).* from the test dir.
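
Roughly (untested):

    import os
    import shutil

    def _wipe_output_dir(test_full_path):
        """Delete every stale artifact so make has no choice but to rerun."""
        output_dir = os.path.join(test_full_path, "Output")
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)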

> +
> +
> +def _execute_test_again(config, test_name, test_path, logfile):
> +    """Rerun step 2: rerun the execution benchmark of interest.  Now
that the
> +    test suite is ready, rerun the make command to gather new
> +    data.  Modify the old make arguments to just be the execution
> +    target and all the reports that are dependent on that target.
> +
> +    """
> +    # Grab old make invocation.
> +    mk_vars, _ = config.compute_run_make_variables()
> +    to_exec = list(mk_vars)
> +    to_exec = ['make','-k'] + to_exec
> +    to_exec.extend('%s=%s' % (k,v) for k,v in mk_vars.items())
> +    if config.only_test is not None:
> +        to_exec.extend(['-C', config.only_test])
> +
> +    to_exec.append("Output/" + test_name + ".out-" + config.test_style)
> +    to_exec.append("Output/" + test_name + ".diff-" + config.test_style)
> +    to_exec.append("Output/" + test_name + ".exec-" + config.test_style)
> +    to_exec.append("Output/" + test_name + ".exec.report." +
config.test_style)
> +    to_exec.append("Output/" + test_name + "." + config.test_style +
".report.txt")
> +    to_exec.append("report." + config.test_style + ".csv")
> +    to_exec.append("report")

You don't need to provide all of these targets. The "master" target for
driving each test is $(TEST-NAME).$(TEST).report.txt, and we should only
build that target. If you build "report" or "report.$(TEST).csv", you may
end up rerunning many other tests (in directories that have multiple tests,
or that have subdirectories).
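
i.e., roughly (untested; mk_vars, config and test_name as in the patch above):

    to_exec = ['make', '-k']
    to_exec.extend('%s=%s' % (k, v) for k, v in mk_vars.items())
    if config.only_test is not None:
        to_exec.extend(['-C', config.only_test])
    # Only the per-test "master" report target; the .out-, .diff- and .exec-
    # files it needs get rebuilt as prerequisites of this target.
    to_exec.append("Output/%s.%s.report.txt" % (test_name, config.test_style))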

> +
> +    result_loc = _construct_report_path(config.report_dir,
> +                                       config.only_test,
> +                                       config.test_style)
> +

Instead of building the report.$(TEST).csv, I'd like to just build the one
individual report file, then manually call GenerateReport.pl on the one input
file we need, so we can get an output we can pass to load_nt_report_file().
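
Something like the following (untested sketch: the GenerateReport.pl flags
and I/O conventions are placeholders that would need to be checked against
the report.$(TEST).csv make rule, and config.test_suite_root is assumed):

    import os
    import subprocess

    def _csv_for_single_test(config, raw_report_path, csv_path, logfile):
        """Turn one test's raw report into a CSV load_nt_report_file() can read."""
        generate_report = os.path.join(config.test_suite_root,
                                       "GenerateReport.pl")
        with open(raw_report_path) as raw, open(csv_path, 'w') as out:
            # Placeholder invocation: mirror whatever the make rule does.
            subprocess.check_call([generate_report, '-csv'],
                                  stdin=raw, stdout=out, stderr=logfile)
        return csv_path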

> +    assert not os.path.exists(result_loc)
> +    execute_command(logfile, config.build_dir(None), to_exec, test_path)
> +    assert os.path.exists(result_loc), "Missing {}".format(result_loc)
> +    return result_loc

I think a more sensible API is for this method to return the report data, not
the path to the report. Also, I would sink the call to
prepare_testsuite_for_rerun into this method.
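
i.e., something shaped like this (sketch only; _run_single_test_target is a
hypothetical wrapper around the make invocation above):

    def _execute_test_again(config, test_name, test_path, logfile):
        """Rerun one benchmark and return its parsed samples."""
        _prepare_testsuite_for_rerun(test_name, test_path, config)
        result_path = _run_single_test_target(config, test_name, test_path,
                                              logfile)
        return load_nt_report_file(result_path, config)

so that the loop in rerun_test() reduces to
results.extend(_execute_test_again(...)).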

> +
> +
> +# When set to true, all benchmarks will be rerun.
> +DEBUG_RERUN = False

What is this here for?

> +NUMBER_OF_RERUNS = 4
> +
> +SERVER_FAIL = u'FAIL'
> +SERVER_PASS = u'PASS'
> +
> +# Local test reults names have these suffexes
> +# Test will have the perf suffex if it passed
> +# if it failed it will have a status suffex.

Sic (suffix)

> +LOCAL_COMPILE_PERF = "compile"
> +LOCAL_COMPILE_STATUS = "compile.status"
> +LOCAL_EXEC_PERF = "exec"
> +LOCAL_EXEC_STATUS = "exec.status"
> +
> +# Server results have both status and performance in each entry
> +SERVER_COMPILE_RESULT = "compile_time"
> +SERVER_EXEC_RESULT = "execution_time"
> +
> +
> +class PastRunData:

This should inherit from object to be modern.

> +    """To decide if we need to rerun, we must know
> +    what happened on each test in the first runs.
> +    Because the server retruns data in a different format than
> +    the local results, this class contains a per-test per-run
> +    aggregate of the two reports."""

Sic (returns)

> +    def __init__(self, name):
> +        self.name = name
> +        self.compile_status = None
> +        self.compile_time = None
> +        self.execution_status = None
> +        self.execution_time = None
> +        self.valid = False
> +
> +    def _check(self):
> +        """Make sure this run data is complete."""
> +        assert self.name is not None
> +        msg = "Malformed test: %s" % (repr(self))
> +        assert self.compile_status is not None, msg
> +        assert self.execution_status is not None, msg
> +        assert self.compile_time is not None, msg
> +        assert self.execution_time is not None, msg
> +        self.valid = True
> +
> +    def _is_rerunable(self):
> +        """Decide if we should rerun this test. Rerun all execute tests
when in
> +        debug, re-run if test compiled correctly, execuited correctly,
> +        but performance changed. Don't rerun if compile failed, or execute
> +        failed, sucessful execute did not change in performance (as decided by
> +        the server).
> +        """

Sic (executed). This doc comment just duplicates the comments in the code; I
would just make this describe the API (should we rerun this test), not the
logic.

> +        assert self.valid
> +        # Don't rerun if compile failed.
> +        if self.compile_status == SERVER_FAIL:
> +            return False
> +
> +        # Rerun everything in debug.
> +        if DEBUG_RERUN:
> +            return True
> +
> +        # Don't rerun on correctness failure or test pass.
> +        if self.execution_status == UNCHANGED_FAIL or \
> +           self.execution_status == UNCHANGED_PASS or \
> +           self.execution_status == SERVER_FAIL:
> +            return False
> +
> +        # Do rerun on regression or improvement.
> +        if self.execution_status == REGRESSED or \
> +           self.execution_status == IMPROVED:
> +            return True
> +
> +        assert False, "Malformed run data: " \
> +            "you should not get here. " + str(self)
> +
> +    def __repr__(self):
> +        template = "<{}: CS {}, CT {}, ES {}, ET {}>"
> +        return template.format(self.name,
> +                               repr(self.compile_status),
> +                               repr(self.compile_time),
> +                               repr(self.execution_status),
> +                               repr(self.execution_time))
> +
> +
> +def _process_reruns(config, server_reply, local_results):
> +    """Rerun each benchmark which the server reported "changed", N more
> +    times. By default N=4 to make 5 samples total. 5 samples is the
> +    smallest thing that has an okay median.  For now, only rerun the
> +    exec time tests, and don't rerun test which failed or whoes
> +    compile failed.
> +
> +    """

This comment again duplicates ones elsewhere, better to delegate.

> +    # To know which benchmarks to run, we need to check the local results
> +    # to see which did not fail, then check the server results to see
> +    # which regressed or improved. The server will return extra results
> +    # for things that we did not run this time, it may also omit performance
> +    # results.

More duplication of logic explanation.

> +    server_results = server_reply['test_results'][0]['results']
> +
> +    # Holds the combined local and server results.
> +    cololated_results = dict()

Sic (collated)

> +
> +    for b in local_results.tests:
> +        # format: suite.test/path/and/name.type<.type>
> +        fields = b.name.split('.')
> +        test_name = fields[1]
> +        test_type = '.'.join(fields[2:])\
> +
> +        updating_entry = cololated_results.get(test_name,
> +                                               PastRunData(test_name))
> +        if test_type == LOCAL_COMPILE_PERF:
> +            updating_entry.compile_time = b.data
> +        elif test_type == LOCAL_COMPILE_STATUS:
> +            updating_entry.compile_status = SERVER_FAIL
> +        elif test_type == LOCAL_EXEC_PERF:
> +            updating_entry.execution_time = b.data
> +        elif test_type == LOCAL_EXEC_STATUS:
> +                updating_entry.execution_status = SERVER_FAIL
> +        else:
> +            assert False, "Unexpected local test type."
> +
> +        cololated_results[test_name] = updating_entry
> +
> +    # Now add on top the server results to any entry we already have.
> +    for full_name, results_status, perf_status in server_results:
> +        test_name, test_type = full_name.split(".")
> +
> +        new_entry = cololated_results.get(test_name,  None)
> +        # Some tests will come from the server, which we did not run locally.
> +        # Drop them.
> +        if new_entry is None:
> +            continue
> +        # Set these, if they were not set with fails above.
> +        if SERVER_COMPILE_RESULT in test_type:
> +            if new_entry.compile_status is None:
> +                new_entry.compile_status = results_status
> +        elif SERVER_EXEC_RESULT in test_type:
> +            if new_entry.execution_status is None:
> +                # If the server has not seen the test before, it will return
> +                # None for the performance results analysis. In this case we
> +                # will assume no rerun is needed, so assign unchanged.
> +                if perf_status is None:
> +                    derived_perf_status = UNCHANGED_PASS
> +                else:
> +                    derived_perf_status = perf_status
> +                new_entry.execution_status = derived_perf_status
> +        else:
> +            assert False, "Unexpected server result type."
> +        cololated_results[test_name] = new_entry
> +
> +    # Double check that all values are there for all tests.
> +    for test in cololated_results.values():
> +        test._check()
> +
> +    rerunable_benches = filter(lambda x: x._is_rerunable(),
> +                               cololated_results.values())
> +    rerunable_benches.sort(key=lambda x: x.name)
> +    # Now lets actually do the reruns.
> +    rerun_results = []
> +    SUMMARY = "Rerunning {} of {} benchmarks."
> +    note(SUMMARY.format(len(rerunable_benches),
> +                        len(cololated_results.values())))
> +
> +    for i, bench in enumerate(rerunable_benches):
> +        note("Rerunning: {} [{}/{}]".format(bench.name,
> +                                            i + 1,
> +                                            len(rerunable_benches)))
> +        fresh_samples = rerun_test(config,
> +                                   bench.name,
> +                                   NUMBER_OF_RERUNS)
> +        rerun_results.extend(fresh_samples)
> +
> +    return rerun_results
> +

This method seems more complicated than it needs to be. Can't we just scan the
server results for the list of tests we need to rerun, then filter out any that
we didn't run locally, then run those? It isn't clear to me we need to do the
collation and build up all the PastRunData objects.
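
i.e., something along the lines of (untested sketch of that flow; filtering
out compile/execution failures may still need a local check):

    def _tests_to_rerun(server_reply, local_results):
        """Benchmarks the server flagged as changed that we also ran locally."""
        local_names = set(b.name.split('.')[1] for b in local_results.tests)
        changed = set()
        for full_name, _, perf_status in \
                server_reply['test_results'][0]['results']:
            test_name, test_type = full_name.split('.')
            if SERVER_EXEC_RESULT in test_type and \
                    perf_status in (REGRESSED, IMPROVED):
                changed.add(test_name)
        return sorted(changed & local_names)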

>  usage_info = """
>  Script for running the tests in LLVM's test-suite repository.
> @@ -1233,6 +1529,9 @@ class NTTest(builtintest.BuiltinTest):
>          group.add_option("", "--build-threads", dest="build_threads",
>                           help="Number of compilation threads",
>                           type=int, default=0, metavar="N")
> +        group.add_option("", "--rerun", dest="rerun",
> +                 help="Rerun tests that have regressed.",
> +                 action="store_true", default=False)
>
>          group.add_option("", "--remote", dest="remote",
>                           help=("Execute remotely, see "
> @@ -1495,3 +1794,13 @@ class NTTest(builtintest.BuiltinTest):
>
>          else:
>              test_results = run_test(nick, None, config)
> +            if opts.rerun:
> +                self.log("Performing any needed reruns.")
> +                server_report = self.submit_helper(config, commit=False)
> +                new_samples = _process_reruns(config, server_report,
> +                                              test_results)
> +                test_results.update_report(new_samples)
> +
> +                # persist report with new samples.
> +                lnt_report_path = config.report_path(None)
> +                os.unlink(lnt_report_path)
> +                lnt_report_file = open(lnt_report_path, 'w')

Why bother to unlink here?
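
open() with mode 'w' truncates the existing file anyway, so I would expect
this to be enough:

    lnt_report_file = open(lnt_report_path, 'w')  # truncates any old report
    print >>lnt_report_file, test_results.render()
    lnt_report_file.close()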

> +                print >>lnt_report_file, test_results.render()
> +                lnt_report_file.close()
>
>              if config.output is not None:
>                  self.print_report(test_results, config.output)
> --
> 1.9.3 (Apple Git-50)+GitX

--


On Mon, Jul 21, 2014 at 3:32 PM, Chris Matthews <chris.matthews at apple.com> wrote:

> Hi Daniel,
>
> Here is an update to the rerun patch discussed last year. Attached is a
> patch for the NT test suite as well as a new rerun test in the LNT test
> suite.
>
> The patch does not perform well (reruns take quite a while). I’d like to
> address that in a separate patch.
>
>