[LNT][PATCH] rerun tests which the server said changed
Daniel Dunbar
daniel at zuster.org
Thu Sep 18 16:13:18 PDT 2014
Awesome, I am glad to see this land and can't wait to see what the impact
looks like.
- Daniel
On Thu, Sep 18, 2014 at 3:02 PM, Chris Matthews <chris.matthews at apple.com>
wrote:
> Thanks Daniel.
>
> I have made all your suggested changes, and committed in r218080. For now
> reruns are off by default. I will start introducing the flag to some of
> the bots.
>
> On Jul 23, 2014, at 1:33 PM, Daniel Dunbar <daniel at zuster.org> wrote:
>
> Comments on the main patch inline, haven't looked at the tests yet:
>
> --
>
> > From 08612457b3c39c29eba5aa8ada79aeeb1e1d21cd Mon Sep 17 00:00:00 2001
> > From: Chris Matthews <cmatthews5 at apple.com>
> > Date: Mon, 21 Jul 2014 14:53:23 -0700
> > Subject: [PATCH] Add rerun flag: when passed, rerun benchmarks which the
> > server says changed
> >
> > ---
> > lnt/tests/nt.py | 319
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 315 insertions(+), 4 deletions(-)
> >
> > diff --git a/lnt/tests/nt.py b/lnt/tests/nt.py
> > index 7e292bc..4e34fb6 100644
> > --- a/lnt/tests/nt.py
> > +++ b/lnt/tests/nt.py
> > @@ -7,18 +7,27 @@ import subprocess
> > import sys
> > import time
> > import traceback
> > -
> > from datetime import datetime
> > +from optparse import OptionParser, OptionGroup
> > +
> >
> > import lnt.testing
> > import lnt.testing.util.compilers
> > import lnt.util.ImportData as ImportData
> >
> > -from lnt.testing.util.commands import note, warning, error, fatal,
> resolve_command_path
> > +from lnt.testing.util.commands import note, warning, fatal
> > from lnt.testing.util.commands import capture, mkdir_p, which
> > +from lnt.testing.util.commands import resolve_command_path
> > +
> > from lnt.testing.util.rcs import get_source_version
> > +
> > from lnt.testing.util.misc import timestamp
> >
> > +from lnt.server.reporting.analysis import UNCHANGED_PASS, UNCHANGED_FAIL
> > +from lnt.server.reporting.analysis import REGRESSED, IMPROVED
> > +from lnt.util import ImportData
> > +import builtintest
> > +
> > ###
> >
> > class TestModule(object):
> > @@ -1033,5 +1042,61 @@ def run_test(nick_prefix, iteration, config):
> >
> > ###
> >
> > -import builtintest
> > -from optparse import OptionParser, OptionGroup
> > +def _construct_report_path(basedir, only_test, test_style,
> file_type="csv"):
> > + """Get the full path to report files in the sandbox.
> > + """
> > + report_path = os.path.join(basedir)
> > + if only_test is not None:
> > + report_path = os.path.join(report_path, only_test)
> > + report_path = os.path.join(report_path, ('report.%s.' % test_style)
> + file_type)
> > + return report_path
> > +
> > +
> > +def rerun_test(config, name, num_times):
> > + """Take the test at name, and rerun it num_times with the previous
> settings
> > + stored in config.
> > +
> > + """
> > + # Extend the old log file.
> > + logfile = open(config.test_log_path(None), 'a')
> > +
> > + relative_test_path = os.path.dirname(name)
> > + test_name = os.path.basename(name)
> > +
> > + single_test_report = _construct_report_path(config.report_dir,
> > + config.only_test,
> > + config.test_style)
> > +
> > + assert os.path.exists(single_test_report), "Previous report not
> there?" + \
> > + single_test_report
> > +
> > + test_full_path = os.path.join(
> > + config.report_dir, relative_test_path)
> > +
> > + assert os.path.exists(test_full_path), "Previous test directory not
> there?" + \
> > + test_full_path
> > +
> > + results = []
> > + for i in xrange(0, num_times):
> > + _prepare_testsuite_for_rerun(test_name, test_full_path, config)
> > + result_path = _execute_test_again(config,
> > + test_name,
> > + test_full_path,
> > + logfile)
> > + results.extend(load_nt_report_file(result_path, config))
> > +
> > + # Check we got an exec and status from each run.
> > + assert len(results) >= num_times, "Did not get all the runs?"
> > +
> > + logfile.close()
> > + return results
> > +
> > +
> > +def _prepare_testsuite_for_rerun(test_name, test_full_path, config):
> > + """Rerun step 1: wipe out old files. To prepare the test suite,
> > + we must delete the correct files to convince make to rerun
> > + just this test. For executoin time runs, these are the
> > + previous execution time runs, then any reports that would hav
> > + been created by them.
> > +
> > + """
>
> Sic (execution, have)
>
> > + output = os.path.join(test_full_path, "Output/")
> > + # The execution timing file.
> > +
> > + try:
> > + os.remove(output + test_name + ".out-" + config.test_style)
> > + except OSError:
> > + # If that does not work, try the mangled name.
> > + os.remove(output + test_name.replace("_", ".") + ".out-" +
> config.test_style)
>
> I don't like the ad hoc handling of mangling here, and this won't properly
> work
> with tests that had a mix of actual '_' and '.' characters. I think we
> should
> find a principled way to solve this, one way would be to have the initial
> run
> leave behind a mapping of original names to mangled names that could be
> inverted
> on demand.
>
> > +
> > + # Remove the report files.
> > + os.remove(_construct_report_path(config.report_dir,
> > + config.only_test,
> > + config.test_style))
> > +
> > + os.remove(_construct_report_path(config.report_dir,
> > + config.only_test,
> > + config.test_style,
> > + "txt"))
> > +
> > + os.remove(_construct_report_path(config.report_dir,
> > + config.only_test,
> > + config.test_style,
> > + "raw.out"))
>
> I'm tempted to make this code more resilient and simpler by just removing
> the
> entire contents of the Output directory. Do you see any good reason not to
> do
> that? If we don't do that, I'd like to remove *.$(TEST).* from in the test
> dir.
>
> > +
> > +
> > +def _execute_test_again(config, test_name, test_path, logfile):
> > + """Rerun step 2: rerun the execution benchmark of interest. Now
> that the
> > + test suite is ready, rerun the make command to gather new
> > + data. Modify the old make arguments to just be the execution
> > + target and all the reports that are dependent on that target.
> > +
> > + """
> > + # Grab old make invocation.
> > + mk_vars, _ = config.compute_run_make_variables()
> > + to_exec = list(mk_vars)
> > + to_exec = ['make','-k'] + to_exec
> > + to_exec.extend('%s=%s' % (k,v) for k,v in mk_vars.items())
> > + if config.only_test is not None:
> > + to_exec.extend(['-C', config.only_test])
> > +
> > + to_exec.append("Output/" + test_name + ".out-" + config.test_style)
> > + to_exec.append("Output/" + test_name + ".diff-" + config.test_style)
> > + to_exec.append("Output/" + test_name + ".exec-" + config.test_style)
> > + to_exec.append("Output/" + test_name + ".exec.report." +
> config.test_style)
> > + to_exec.append("Output/" + test_name + "." + config.test_style +
> ".report.txt")
> > + to_exec.append("report." + config.test_style + ".csv")
> > + to_exec.append("report")
>
> You don't need to provide all of these targets. The "master" target for
> driving
> each test is $(TEST-NAME).$(TEST).report.txt, and we should only build that
> target. If you build "report" or "report.$(TEST).csv", you may end up
> rerunning
> many other tests (in directories that have multiple tests, or that have
> subdirectories).
>
> > +
> > + result_loc = _construct_report_path(config.report_dir,
> > + config.only_test,
> > + config.test_style)
> > +
>
> Instead of building the report.$(TEST).csv, I'd like to just build the one
> individual report file, then manually call GenerateReport.pl on the one
> input
> file we need so we can get an output we can pass to load_nt_report_file()
>
> > + assert not os.path.exists(result_loc)
> > + execute_command(logfile, config.build_dir(None), to_exec, test_path)
> > + assert os.path.exists(result_loc), "Missing {}".format(result_loc)
> > + return result_loc
>
> I think a more sensible API is for this method to return the report data,
> not
> the path to the report. Also, I would sink the call to
> prepare_testsuite_for_rerun into this method.
>
> > +
> > +
> > +# When set to true, all benchmarks will be rerun.
> > +DEBUG_RERUN = False
>
> What is this here for?
>
> > +NUMBER_OF_RERUNS = 4
> > +
> > +SERVER_FAIL = u'FAIL'
> > +SERVER_PASS = u'PASS'
> > +
> > +# Local test reults names have these suffexes
> > +# Test will have the perf suffex if it passed
> > +# if it failed it will have a status suffex.
>
> Sic (suffix)
>
> > +LOCAL_COMPILE_PERF = "compile"
> > +LOCAL_COMPILE_STATUS = "compile.status"
> > +LOCAL_EXEC_PERF = "exec"
> > +LOCAL_EXEC_STATUS = "exec.status"
> > +
> > +# Server results have both status and performance in each entry
> > +SERVER_COMPILE_RESULT = "compile_time"
> > +SERVER_EXEC_RESULT = "execution_time"
> > +
> > +
> > +class PastRunData:
>
> This should inherit from object to be modern.
>
> > + """To decide if we need to rerun, we must know
> > + what happened on each test in the first runs.
> > + Because the server retruns data in a different format than
> > + the local results, this class contains a per-test per-run
> > + aggregate of the two reports."""
>
> Sic (reruns)
>
> > + def __init__(self, name):
> > + self.name = name
> > + self.compile_status = None
> > + self.compile_time = None
> > + self.execution_status = None
> > + self.execution_time = None
> > + self.valid = False
> > +
> > + def _check(self):
> > + """Make sure this run data is complete."""
> > + assert self.name is not None
> > + msg = "Malformed test: %s" % (repr(self))
> > + assert self.compile_status is not None, msg
> > + assert self.execution_status is not None, msg
> > + assert self.compile_time is not None, msg
> > + assert self.execution_time is not None, msg
> > + self.valid = True
> > +
> > + def _is_rerunable(self):
> > + """Decide if we should rerun this test. Rerun all execute tests
> when in
> > + debug, re-run if test compiled correctly, execuited correctly,
> > + but performance changed. Don't rerun if compile failed, or
> execute
> > + failed, sucessful execute did not change in performance (as
> decided by
> > + the server).
> > + """
>
> Sic (executed). This doc comment just duplicates the comments in the code,
> I
> would just make this describe the API (should we rerun this test) not the
> logic.
>
> > + assert self.valid
> > + # Don't rerun if compile failed.
> > + if self.compile_status == SERVER_FAIL:
> > + return False
> > +
> > + # Rerun everything in debug.
> > + if DEBUG_RERUN:
> > + return True
> > +
> > + # Don't rerun on correctness failure or test pass.
> > + if self.execution_status == UNCHANGED_FAIL or \
> > + self.execution_status == UNCHANGED_PASS or \
> > + self.execution_status == SERVER_FAIL:
> > + return False
> > +
> > + # Do rerun on regression or improvement.
> > + if self.execution_status == REGRESSED or \
> > + self.execution_status == IMPROVED:
> > + return True
> > +
> > + assert False, "Malformed run data: " \
> > + "you should not get here. " + str(self)
> > +
> > + def __repr__(self):
> > + template = "<{}: CS {}, CT {}, ES {}, ET {}>"
> > + return template.format(self.name,
> > + repr(self.compile_status),
> > + repr(self.compile_time),
> > + repr(self.execution_status),
> > + repr(self.execution_time))
> > +
> > +
> > +def _process_reruns(config, server_reply, local_results):
> > + """Rerun each benchmark which the server reported "changed", N more
> > + times. By default N=4 to make 5 samples total. 5 samples is the
> > + smallest thing that has an okay median. For now, only rerun the
> > + exec time tests, and don't rerun test which failed or whoes
> > + compile failed.
> > +
> > + """
>
> This comment again duplicates ones elsewhere, better to delegate.
>
> > + # To know which benchmarks to run, we need to check the local
> results
> > + # to see which did not fail, then check the server results to see
> > + # which regressed or improved. The server will return extra results
> > + # for things that we did not run this time, it may also omit
> performance
> > + # results.
>
> More duplication of logic explanation.
>
> > + server_results = server_reply['test_results'][0]['results']
> > +
> > + # Holds the combined local and server results.
> > + cololated_results = dict()
>
> Sic (collated)
>
> > +
> > + for b in local_results.tests:
> > + # format: suite.test/path/and/name.type<.type>
> > + fields = b.name.split('.')
> > + test_name = fields[1]
> > + test_type = '.'.join(fields[2:])\
> > +
> > + updating_entry = cololated_results.get(test_name,
> > + PastRunData(test_name))
> > + if test_type == LOCAL_COMPILE_PERF:
> > + updating_entry.compile_time = b.data
> > + elif test_type == LOCAL_COMPILE_STATUS:
> > + updating_entry.compile_status = SERVER_FAIL
> > + elif test_type == LOCAL_EXEC_PERF:
> > + updating_entry.execution_time = b.data
> > + elif test_type == LOCAL_EXEC_STATUS:
> > + updating_entry.execution_status = SERVER_FAIL
> > + else:
> > + assert False, "Unexpected local test type."
> > +
> > + cololated_results[test_name] = updating_entry
> > +
> > + # Now add on top the server results to any entry we already have.
> > + for full_name, results_status, perf_status in server_results:
> > + test_name, test_type = full_name.split(".")
> > +
> > + new_entry = cololated_results.get(test_name, None)
> > + # Some tests will come from the server, which we did not run
> locally.
> > + # Drop them.
> > + if new_entry is None:
> > + continue
> > + # Set these, if they were not set with fails above.
> > + if SERVER_COMPILE_RESULT in test_type:
> > + if new_entry.compile_status is None:
> > + new_entry.compile_status = results_status
> > + elif SERVER_EXEC_RESULT in test_type:
> > + if new_entry.execution_status is None:
> > + # If the server has not seen the test before, it will
> return
> > + # None for the performance results analysis. In this
> case we
> > + # will assume no rerun is needed, so assign unchanged.
> > + if perf_status is None:
> > + derived_perf_status = UNCHANGED_PASS
> > + else:
> > + derived_perf_status = perf_status
> > + new_entry.execution_status = derived_perf_status
> > + else:
> > + assert False, "Unexpected server result type."
> > + cololated_results[test_name] = new_entry
> > +
> > + # Double check that all values are there for all tests.
> > + for test in cololated_results.values():
> > + test._check()
> > +
> > + rerunable_benches = filter(lambda x: x._is_rerunable(),
> > + cololated_results.values())
> > + rerunable_benches.sort(key=lambda x: x.name)
> > + # Now lets actually do the reruns.
> > + rerun_results = []
> > + SUMMARY = "Rerunning {} of {} benchmarks."
> > + note(SUMMARY.format(len(rerunable_benches),
> > + len(cololated_results.values())))
> > +
> > + for i, bench in enumerate(rerunable_benches):
> > + note("Rerunning: {} [{}/{}]".format(bench.name,
> > + i + 1,
> > + len(rerunable_benches)))
> > + fresh_samples = rerun_test(config,
> > + bench.name,
> > + NUMBER_OF_RERUNS)
> > + rerun_results.extend(fresh_samples)
> > +
> > + return rerun_results
> > +
>
> This method seems more complicated than it needs to be. Can't we just scan
> the
> server results for the list of tests we need to rerun, then filter out any
> that
> we didn't run locally, then run those? It isn't clear to me we need to do
> the
> collation and build up all the PastRunData objects.
>
> > usage_info = """
> > Script for running the tests in LLVM's test-suite repository.
> > @@ -1233,6 +1529,9 @@ class NTTest(builtintest.BuiltinTest):
> > group.add_option("", "--build-threads", dest="build_threads",
> > help="Number of compilation threads",
> > type=int, default=0, metavar="N")
> > + group.add_option("", "--rerun", dest="rerun",
> > + help="Rerun tests that have regressed.",
> > + action="store_true", default=False)
> >
> > group.add_option("", "--remote", dest="remote",
> > help=("Execute remotely, see "
> > @@ -1495,3 +1794,13 @@ class NTTest(builtintest.BuiltinTest):
> >
> > else:
> > test_results = run_test(nick, None, config)
> > + if opts.rerun:
> > + self.log("Performing any needed reruns.")
> > + server_report = self.submit_helper(config, commit=False)
> > + new_samples = _process_reruns(config, server_report,
> test_results)
> > + test_results.update_report(new_samples)
> > +
> > + # persist report with new samples.
> > + lnt_report_path = config.report_path(None)
> > + os.unlink(lnt_report_path)
> > + lnt_report_file = open(lnt_report_path, 'w')
>
> Why bother to unlink here?
>
> > + print >>lnt_report_file, test_results.render()
> > + lnt_report_file.close()
> >
> > if config.output is not None:
> > self.print_report(test_results, config.output)
> > --
> > 1.9.3 (Apple Git-50)+GitX
>
> --
>
>
> On Mon, Jul 21, 2014 at 3:32 PM, Chris Matthews <chris.matthews at apple.com>
> wrote:
>
>> Hi Daniel,
>>
>> Here is an update to the rerun patch discussed last year. Attached is a
>> patch for the NT test suite as well as a new rerun test in the LNT test
>> suite.
>>
>> The patch does not perform well (reruns take quite a while). I’d like to
>> address that in a separate patch.
>>
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140918/9facd3f1/attachment.html>
More information about the llvm-commits
mailing list