[LNT][PATCH] rerun tests which the server said changed

Daniel Dunbar daniel at zuster.org
Thu Sep 18 16:13:18 PDT 2014


Awesome, I am glad to see this land and can't wait to see what the impact
looks like.

 - Daniel

On Thu, Sep 18, 2014 at 3:02 PM, Chris Matthews <chris.matthews at apple.com>
wrote:

> Thanks Daniel.
>
> I have made all your suggested changes, and committed in r218080.  For now
> reruns are off by default.  I will start introducing the flag to some of
> the bots.
>
> On Jul 23, 2014, at 1:33 PM, Daniel Dunbar <daniel at zuster.org> wrote:
>
> Comments on the main patch inline, haven't looked at the tests yet:
>
> --
>
> > From 08612457b3c39c29eba5aa8ada79aeeb1e1d21cd Mon Sep 17 00:00:00 2001
> > From: Chris Matthews <cmatthews5 at apple.com>
> > Date: Mon, 21 Jul 2014 14:53:23 -0700
> > Subject: [PATCH] Add rerun flag: when passed, rerun benchmarks which the
> >  server says changed
> >
> > ---
> >  lnt/tests/nt.py | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 315 insertions(+), 4 deletions(-)
> >
> > diff --git a/lnt/tests/nt.py b/lnt/tests/nt.py
> > index 7e292bc..4e34fb6 100644
> > --- a/lnt/tests/nt.py
> > +++ b/lnt/tests/nt.py
> > @@ -7,18 +7,27 @@ import subprocess
> >  import sys
> >  import time
> >  import traceback
> > -
> >  from datetime import datetime
> > +from optparse import OptionParser, OptionGroup
> > +
> >
> >  import lnt.testing
> >  import lnt.testing.util.compilers
> >  import lnt.util.ImportData as ImportData
> >
> > -from lnt.testing.util.commands import note, warning, error, fatal, resolve_command_path
> > +from lnt.testing.util.commands import note, warning, fatal
> >  from lnt.testing.util.commands import capture, mkdir_p, which
> > +from lnt.testing.util.commands import resolve_command_path
> > +
> >  from lnt.testing.util.rcs import get_source_version
> > +
> >  from lnt.testing.util.misc import timestamp
> >
> > +from lnt.server.reporting.analysis import UNCHANGED_PASS, UNCHANGED_FAIL
> > +from lnt.server.reporting.analysis import REGRESSED, IMPROVED
> > +from lnt.util import ImportData
> > +import builtintest
> > +
> >  ###
> >
> >  class TestModule(object):
> > @@ -1033,5 +1042,61 @@ def run_test(nick_prefix, iteration, config):
> >
> >  ###
> >
> > -import builtintest
> > -from optparse import OptionParser, OptionGroup
> > +def _construct_report_path(basedir, only_test, test_style, file_type="csv"):
> > +    """Get the full path to report files in the sandbox.
> > +    """
> > +    report_path = os.path.join(basedir)
> > +    if only_test is not None:
> > +        report_path =  os.path.join(report_path, only_test)
> > +    report_path = os.path.join(report_path, ('report.%s.' % test_style) + file_type)
> > +    return report_path
> > +
> > +
> > +def rerun_test(config, name, num_times):
> > +    """Take the test at name, and rerun it num_times with the previous
> settings
> > +    stored in config.
> > +
> > +    """
> > +    # Extend the old log file.
> > +    logfile = open(config.test_log_path(None), 'a')
> > +
> > +    relative_test_path = os.path.dirname(name)
> > +    test_name = os.path.basename(name)
> > +
> > +    single_test_report = _construct_report_path(config.report_dir,
> > +                                               config.only_test,
> > +                                               config.test_style)
> > +
> > +    assert os.path.exists(single_test_report), "Previous report not there?" + \
> > +        single_test_report
> > +
> > +    test_full_path = os.path.join(
> > +        config.report_dir, relative_test_path)
> > +
> > +    assert os.path.exists(test_full_path), "Previous test directory not there?" + \
> > +        test_full_path
> > +
> > +    results = []
> > +    for i in xrange(0, num_times):
> > +        _prepare_testsuite_for_rerun(test_name, test_full_path, config)
> > +        result_path = _execute_test_again(config,
> > +                                          test_name,
> > +                                          test_full_path,
> > +                                          logfile)
> > +        results.extend(load_nt_report_file(result_path, config))
> > +
> > +    # Check we got an exec and status from each run.
> > +    assert len(results) >= num_times, "Did not get all the runs?"
> > +
> > +    logfile.close()
> > +    return results
> > +
> > +
> > +def _prepare_testsuite_for_rerun(test_name, test_full_path, config):
> > +    """Rerun  step 1: wipe out old files. To prepare the test suite,
> > +    we must delete the correct files to convince make to rerun
> > +    just this test. For executoin time runs, these are the
> > +    previous execution time runs, then any reports that would hav
> > +    been created by them.
> > +
> > +    """
>
> Sic (execution, have)
>
> > +    output = os.path.join(test_full_path, "Output/")
> > +    # The execution timing file.
> > +
> > +    try:
> > +        os.remove(output + test_name + ".out-" + config.test_style)
> > +    except OSError:
> > +        # If that does not work, try the mangled name.
> > +        os.remove(output + test_name.replace("_", ".") + ".out-" + config.test_style)
>
> I don't like the ad hoc handling of mangling here, and this won't properly
> work with tests that had a mix of actual '_' and '.' characters. I think we
> should find a principled way to solve this; one way would be to have the
> initial run leave behind a mapping of original names to mangled names that
> could be inverted on demand.
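>
> One way this could look, purely as an untested sketch (the name_map.json
> file and the helper below are invented, not something the suite produces
> today):
>
>     import json
>     import os
>
>     def _lookup_mangled_name(output_dir, test_name):
>         """Map a test's original name to the mangled name recorded by the
>         initial run in a hypothetical Output/name_map.json."""
>         with open(os.path.join(output_dir, "name_map.json")) as f:
>             name_map = json.load(f)
>         # Tests that never needed mangling map to themselves.
>         return name_map.get(test_name, test_name)
>
> Then the rerun path could delete exactly the file the first run produced
> instead of guessing at the mangling.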
>
> > +
> > +    # Remove the report files.
> > +    os.remove(_construct_report_path(config.report_dir,
> > +                                    config.only_test,
> > +                                    config.test_style))
> > +
> > +    os.remove(_construct_report_path(config.report_dir,
> > +                                    config.only_test,
> > +                                    config.test_style,
> > +                                    "txt"))
> > +
> > +    os.remove(_construct_report_path(config.report_dir,
> > +                                    config.only_test,
> > +                                    config.test_style,
> > +                                    "raw.out"))
>
> I'm tempted to make this code more resilient and simpler by just removing
> the entire contents of the Output directory. Do you see any good reason not
> to do that? If we don't do that, I'd like to remove *.$(TEST).* from the
> test dir.
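>
> For reference, the simpler version I have in mind is roughly this (untested
> sketch; the helper name is made up):
>
>     import os
>     import shutil
>
>     def _wipe_output_dir(test_full_path):
>         """Blow away the per-test Output directory so make regenerates
>         everything under it on the rerun."""
>         output = os.path.join(test_full_path, "Output")
>         shutil.rmtree(output, ignore_errors=True)
>         os.makedirs(output)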
>
> > +
> > +
> > +def _execute_test_again(config, test_name, test_path, logfile):
> > +    """Rerun step 2: rerun the execution benchmark of interest.  Now
> that the
> > +    test suite is ready, rerun the make command to gather new
> > +    data.  Modify the old make arguments to just be the execution
> > +    target and all the reports that are dependent on that target.
> > +
> > +    """
> > +    # Grab old make invocation.
> > +    mk_vars, _ = config.compute_run_make_variables()
> > +    to_exec = list(mk_vars)
> > +    to_exec = ['make','-k'] + to_exec
> > +    to_exec.extend('%s=%s' % (k,v) for k,v in mk_vars.items())
> > +    if config.only_test is not None:
> > +        to_exec.extend(['-C', config.only_test])
> > +
> > +    to_exec.append("Output/" + test_name + ".out-" + config.test_style)
> > +    to_exec.append("Output/" + test_name + ".diff-" + config.test_style)
> > +    to_exec.append("Output/" + test_name + ".exec-" + config.test_style)
> > +    to_exec.append("Output/" + test_name + ".exec.report." +
> config.test_style)
> > +    to_exec.append("Output/" + test_name + "." + config.test_style +
> ".report.txt")
> > +    to_exec.append("report." + config.test_style + ".csv")
> > +    to_exec.append("report")
>
> You don't need to provide all of these targets. The "master" target for
> driving each test is $(TEST-NAME).$(TEST).report.txt, and we should only
> build that target. If you build "report" or "report.$(TEST).csv", you may
> end up rerunning many other tests (in directories that have multiple tests,
> or that have subdirectories).
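>
> In other words, the target list above collapses to a single append (sketch,
> reusing the variables from the patch):
>
>     # The per-test master target; make should regenerate the intermediate
>     # .out-/.diff-/.exec- files it depends on without touching other tests
>     # in the same directory.
>     to_exec.append("Output/%s.%s.report.txt" % (test_name, config.test_style))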
>
> > +
> > +    result_loc = _construct_report_path(config.report_dir,
> > +                                       config.only_test,
> > +                                       config.test_style)
> > +
>
> Instead of building the report.$(TEST).csv, I'd like to just build the one
> individual report file, then manually call GenerateReport.pl on the one
> input file we need, so we get an output we can pass to load_nt_report_file().
>
> > +    assert not os.path.exists(result_loc)
> > +    execute_command(logfile, config.build_dir(None), to_exec, test_path)
> > +    assert os.path.exists(result_loc), "Missing {}".format(result_loc)
> > +    return result_loc
>
> I think a more sensible API is for this method to return the report data,
> not the path to the report. Also, I would sink the call to
> _prepare_testsuite_for_rerun into this method.
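>
> Roughly this shape (sketch only; _run_rerun_make is a stand-in for however
> the make invocation and report generation above end up being done):
>
>     def _execute_test_again(config, test_name, test_path, logfile):
>         """Rerun one benchmark and return its parsed samples."""
>         # The cleanup step lives here now instead of in the caller.
>         _prepare_testsuite_for_rerun(test_name, test_path, config)
>         result_path = _run_rerun_make(config, test_name, test_path, logfile)
>         # Hand back data rather than a path for the caller to reload.
>         return load_nt_report_file(result_path, config)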
>
> > +
> > +
> > +# When set to true, all benchmarks will be rerun.
> > +DEBUG_RERUN = False
>
> What is this here for?
>
> > +NUMBER_OF_RERUNS = 4
> > +
> > +SERVER_FAIL = u'FAIL'
> > +SERVER_PASS = u'PASS'
> > +
> > +# Local test reults names have these suffexes
> > +# Test will have the perf suffex if it passed
> > +# if it failed it will have a status suffex.
>
> Sic (suffix)
>
> > +LOCAL_COMPILE_PERF = "compile"
> > +LOCAL_COMPILE_STATUS = "compile.status"
> > +LOCAL_EXEC_PERF = "exec"
> > +LOCAL_EXEC_STATUS = "exec.status"
> > +
> > +# Server results have both status and performance in each entry
> > +SERVER_COMPILE_RESULT = "compile_time"
> > +SERVER_EXEC_RESULT = "execution_time"
> > +
> > +
> > +class PastRunData:
>
> This should inherit from object to be modern.
>
> > +    """To decide if we need to rerun, we must know
> > +    what happened on each test in the first runs.
> > +    Because the server retruns data in a different format than
> > +    the local results, this class contains a per-test per-run
> > +    aggregate of the two reports."""
>
> Sic (returns)
>
> > +    def __init__(self, name):
> > +        self.name = name
> > +        self.compile_status = None
> > +        self.compile_time = None
> > +        self.execution_status = None
> > +        self.execution_time = None
> > +        self.valid = False
> > +
> > +    def _check(self):
> > +        """Make sure this run data is complete."""
> > +        assert self.name is not None
> > +        msg = "Malformed test: %s" % (repr(self))
> > +        assert self.compile_status is not None, msg
> > +        assert self.execution_status is not None, msg
> > +        assert self.compile_time is not None, msg
> > +        assert self.execution_time is not None, msg
> > +        self.valid = True
> > +
> > +    def _is_rerunable(self):
> > +        """Decide if we should rerun this test. Rerun all execute tests
> when in
> > +        debug, re-run if test compiled correctly, execuited correctly,
> > +        but performance changed. Don't rerun if compile failed, or
> execute
> > +        failed, sucessful execute did not change in performance (as
> decided by
> > +        the server).
> > +        """
>
> Sic (executed). This doc comment just duplicates the comments in the code;
> I would just make this describe the API (should we rerun this test), not
> the logic.
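>
> e.g. just:
>
>     def _is_rerunable(self):
>         """Should this test be rerun?"""
>
> and let the inline comments below carry the details.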
>
> > +        assert self.valid
> > +        # Don't rerun if compile failed.
> > +        if self.compile_status == SERVER_FAIL:
> > +            return False
> > +
> > +        # Rerun everything in debug.
> > +        if DEBUG_RERUN:
> > +            return True
> > +
> > +        # Don't rerun on correctness failure or test pass.
> > +        if self.execution_status == UNCHANGED_FAIL or \
> > +           self.execution_status == UNCHANGED_PASS or \
> > +           self.execution_status == SERVER_FAIL:
> > +            return False
> > +
> > +        # Do rerun on regression or improvement.
> > +        if self.execution_status == REGRESSED or \
> > +           self.execution_status == IMPROVED:
> > +            return True
> > +
> > +        assert False, "Malformed run data: " \
> > +            "you should not get here. " + str(self)
> > +
> > +    def __repr__(self):
> > +        template = "<{}: CS {}, CT {}, ES {}, ET {}>"
> > +        return template.format(self.name,
> > +                               repr(self.compile_status),
> > +                               repr(self.compile_time),
> > +                               repr(self.execution_status),
> > +                               repr(self.execution_time))
> > +
> > +
> > +def _process_reruns(config, server_reply, local_results):
> > +    """Rerun each benchmark which the server reported "changed", N more
> > +    times. By default N=4 to make 5 samples total. 5 samples is the
> > +    smallest thing that has an okay median.  For now, only rerun the
> > +    exec time tests, and don't rerun test which failed or whoes
> > +    compile failed.
> > +
> > +    """
>
> This comment again duplicates ones elsewhere, better to delegate.
>
> > +    # To know which benchmarks to run, we need to check the local results
> > +    # to see which did not fail, then check the server results to see
> > +    # which regressed or improved. The server will return extra results
> > +    # for things that we did not run this time, it may also omit performance
> > +    # results.
>
> More duplication of logic explanation.
>
> > +    server_results = server_reply['test_results'][0]['results']
> > +
> > +    # Holds the combined local and server results.
> > +    cololated_results = dict()
>
> Sic (collated)
>
> > +
> > +    for b in local_results.tests:
> > +        # format: suite.test/path/and/name.type<.type>
> > +        fields = b.name.split('.')
> > +        test_name = fields[1]
> > +        test_type = '.'.join(fields[2:])\
> > +
> > +        updating_entry = cololated_results.get(test_name,
> > +                                               PastRunData(test_name))
> > +        if test_type == LOCAL_COMPILE_PERF:
> > +            updating_entry.compile_time = b.data
> > +        elif test_type == LOCAL_COMPILE_STATUS:
> > +            updating_entry.compile_status = SERVER_FAIL
> > +        elif test_type == LOCAL_EXEC_PERF:
> > +            updating_entry.execution_time = b.data
> > +        elif test_type == LOCAL_EXEC_STATUS:
> > +                updating_entry.execution_status = SERVER_FAIL
> > +        else:
> > +            assert False, "Unexpected local test type."
> > +
> > +        cololated_results[test_name] = updating_entry
> > +
> > +    # Now add on top the server results to any entry we already have.
> > +    for full_name, results_status, perf_status in server_results:
> > +        test_name, test_type = full_name.split(".")
> > +
> > +        new_entry = cololated_results.get(test_name,  None)
> > +        # Some tests will come from the server, which we did not run locally.
> > +        # Drop them.
> > +        if new_entry is None:
> > +            continue
> > +        # Set these, if they were not set with fails above.
> > +        if SERVER_COMPILE_RESULT in test_type:
> > +            if new_entry.compile_status is None:
> > +                new_entry.compile_status = results_status
> > +        elif SERVER_EXEC_RESULT in test_type:
> > +            if new_entry.execution_status is None:
> > +                # If the server has not seen the test before, it will return
> > +                # None for the performance results analysis. In this case we
> > +                # will assume no rerun is needed, so assign unchanged.
> > +                if perf_status is None:
> > +                    derived_perf_status = UNCHANGED_PASS
> > +                else:
> > +                    derived_perf_status = perf_status
> > +                new_entry.execution_status = derived_perf_status
> > +        else:
> > +            assert False, "Unexpected server result type."
> > +        cololated_results[test_name] = new_entry
> > +
> > +    # Double check that all values are there for all tests.
> > +    for test in cololated_results.values():
> > +        test._check()
> > +
> > +    rerunable_benches = filter(lambda x: x._is_rerunable(),
> > +                               cololated_results.values())
> > +    rerunable_benches.sort(key=lambda x: x.name)
> > +    # Now lets actually do the reruns.
> > +    rerun_results = []
> > +    SUMMARY = "Rerunning {} of {} benchmarks."
> > +    note(SUMMARY.format(len(rerunable_benches),
> > +                        len(cololated_results.values())))
> > +
> > +    for i, bench in enumerate(rerunable_benches):
> > +        note("Rerunning: {} [{}/{}]".format(bench.name,
> > +                                            i + 1,
> > +                                            len(rerunable_benches)))
> > +        fresh_samples = rerun_test(config,
> > +                                   bench.name,
> > +                                   NUMBER_OF_RERUNS)
> > +        rerun_results.extend(fresh_samples)
> > +
> > +    return rerun_results
> > +
>
> This method seems more complicated than it needs to be. Can't we just scan
> the server results for the list of tests we need to rerun, then filter out
> any that we didn't run locally, then run those? It isn't clear to me we need
> to do the collation and build up all the PastRunData objects.
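>
> That is, something along these lines (untested sketch, helper name made up,
> using the same constants and result layout the patch already assumes):
>
>     def _tests_to_rerun(server_reply, local_results):
>         """Benchmarks the server flagged as changed that we also ran locally."""
>         local_names = set(t.name.split('.')[1] for t in local_results.tests)
>         server_results = server_reply['test_results'][0]['results']
>         changed = set()
>         for full_name, _, perf_status in server_results:
>             test_name, test_type = full_name.split('.')
>             if SERVER_EXEC_RESULT in test_type and \
>                perf_status in (REGRESSED, IMPROVED):
>                 changed.add(test_name)
>         return sorted(changed & local_names)
>
> Filtering out compile or correctness failures could then be layered on top
> if it turns out to matter in practice.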
>
> >  usage_info = """
> >  Script for running the tests in LLVM's test-suite repository.
> > @@ -1233,6 +1529,9 @@ class NTTest(builtintest.BuiltinTest):
> >          group.add_option("", "--build-threads", dest="build_threads",
> >                           help="Number of compilation threads",
> >                           type=int, default=0, metavar="N")
> > +        group.add_option("", "--rerun", dest="rerun",
> > +                 help="Rerun tests that have regressed.",
> > +                 action="store_true", default=False)
> >
> >          group.add_option("", "--remote", dest="remote",
> >                           help=("Execute remotely, see "
> > @@ -1495,3 +1794,13 @@ class NTTest(builtintest.BuiltinTest):
> >
> >          else:
> >              test_results = run_test(nick, None, config)
> > +            if opts.rerun:
> > +                self.log("Performing any needed reruns.")
> > +                server_report = self.submit_helper(config, commit=False)
> > +                new_samples = _process_reruns(config, server_report, test_results)
> > +                test_results.update_report(new_samples)
> > +
> > +                # persist report with new samples.
> > +                lnt_report_path = config.report_path(None)
> > +                os.unlink(lnt_report_path)
> > +                lnt_report_file = open(lnt_report_path, 'w')
>
> Why bother to unlink here?
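>
> Opening with 'w' already truncates the file, so this should be all that's
> needed:
>
>     lnt_report_path = config.report_path(None)
>     # 'w' truncates any existing report; no unlink required.
>     with open(lnt_report_path, 'w') as lnt_report_file:
>         print >>lnt_report_file, test_results.render()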
>
> > +                print >>lnt_report_file, test_results.render()
> > +                lnt_report_file.close()
> >
> >              if config.output is not None:
> >                  self.print_report(test_results, config.output)
> > --
> > 1.9.3 (Apple Git-50)+GitX
>
> --
>
>
> On Mon, Jul 21, 2014 at 3:32 PM, Chris Matthews <chris.matthews at apple.com>
> wrote:
>
>> Hi Daniel,
>>
>> Here is an update to the rerun patch discussed last year. Attached is a
>> patch for the NT test suite as well as a new rerun test in the LNT test
>> suite.
>>
>> The patch does not perform well (reruns take quite a while). I’d like to
>> address that in a separate patch.
>>
>>
>
>