[LNT] r208450 - Use Mann-Whitney U test to identify changes(2)

Fri May 9 15:09:29 PDT 2014

Author: kongyi
Date: Fri May  9 17:09:29 2014
New Revision: 208450

URL: http://llvm.org/viewvc/llvm-project?rev=208450&view=rev
Log:
Use Mann-Whitney U test to identify changes(2)

Correctly calculates Mann-Whitney U
Optionally depends on SciPy


Modified:
    lnt/trunk/lnt/server/reporting/analysis.py
    lnt/trunk/lnt/server/reporting/runs.py
    lnt/trunk/lnt/server/ui/templates/v4_run.html
    lnt/trunk/lnt/server/ui/views.py
    lnt/trunk/lnt/util/stats.py

Modified: lnt/trunk/lnt/server/reporting/analysis.py
URL: http://llvm.org/viewvc/llvm-project/lnt/trunk/lnt/server/reporting/analysis.py?rev=208450&r1=208449&r2=208450&view=diff
==============================================================================

--- lnt/trunk/lnt/server/reporting/analysis.py (original)
+++ lnt/trunk/lnt/server/reporting/analysis.py Fri May  9 17:09:29 2014
@@ -13,8 +13,8 @@ UNCHANGED_FAIL = 'UNCHANGED_FAIL'
 
 class ComparisonResult:
     def __init__(self, cur_value, prev_value, delta, pct_delta, stddev, MAD,
-                 cur_failed, prev_failed, samples, stddev_mean = None,
-                 stddev_is_estimated = False):
+                 cur_failed, prev_failed, samples, prev_samples, stddev_mean = None,
+                 stddev_is_estimated = False, confidence_lv = .05):
         self.current = cur_value
         self.previous = prev_value
         self.delta = delta
@@ -24,8 +24,10 @@ class ComparisonResult:
         self.failed = cur_failed
         self.prev_failed = prev_failed
         self.samples = samples
+        self.prev_samples = prev_samples
         self.stddev_mean = stddev_mean
         self.stddev_is_estimated = stddev_is_estimated
+        self.confidence_lv = confidence_lv
 
     def get_samples(self):
         return self.samples
@@ -56,6 +58,9 @@ class ComparisonResult:
 
     def get_value_status(self, confidence_interval=2.576,
                          value_precision=0.0001, ignore_small=True):
+        """
+        Raises ImportError if SciPy is not installed and sample size is too large.
+        """
         if self.current is None or self.previous is None:
             return None
 
@@ -89,6 +94,13 @@ class ComparisonResult:
         if ignore_small and abs(self.delta) < .01:
             return UNCHANGED_PASS
 
+        # Use Mann-Whitney U test to test null hypothesis that result is
+        # unchanged.
+        if len(self.samples) >= 4 and len(self.prev_samples) >= 4:
+            same = stats.mannwhitneyu(self.samples, self.prev_samples, self.confidence_lv)
+            if same:
+                return UNCHANGED_PASS
+
         # If we have a comparison window, then measure using a symmetic
         # confidence interval.
         if self.stddev is not None:
@@ -123,9 +135,10 @@ class ComparisonResult:
 
 class RunInfo(object):
     def __init__(self, testsuite, runs_to_load,
-                 aggregation_fn = min):
+                 aggregation_fn = min, confidence_lv = .05):
         self.testsuite = testsuite
         self.aggregation_fn = aggregation_fn
+        self.confidence_lv = confidence_lv
 
         self.sample_map = util.multidict()
         self.loaded_run_ids = set()
@@ -207,7 +220,8 @@ class RunInfo(object):
                 run_value, prev_value, delta=None,
                 pct_delta = None, stddev = stddev, MAD = MAD,
                 cur_failed = run_failed, prev_failed = prev_failed,
-                samples = run_values)
+                samples = run_values, prev_samples = prev_values,
+                confidence_lv = self.confidence_lv)
 
         # Compute the comparison status for the test value.
         delta = run_value - prev_value
@@ -241,8 +255,9 @@ class RunInfo(object):
 
         return ComparisonResult(run_value, prev_value, delta,
                                 pct_delta, stddev, MAD,
-                                run_failed, prev_failed, run_values,
-                                stddev_mean, stddev_is_estimated)
+                                run_failed, prev_failed, run_values, prev_values,
+                                stddev_mean, stddev_is_estimated,
+                                self.confidence_lv)
 
     def _load_samples_for_runs(self, run_ids):
         # Find the set of new runs to load.

Modified: lnt/trunk/lnt/server/reporting/runs.py
URL: http://llvm.org/viewvc/llvm-project/lnt/trunk/lnt/server/reporting/runs.py?rev=208450&r1=208449&r2=208450&view=diff
==============================================================================
--- lnt/trunk/lnt/server/reporting/runs.py (original)
+++ lnt/trunk/lnt/server/reporting/runs.py Fri May  9 17:09:29 2014
@@ -14,7 +14,8 @@ import lnt.util.stats
 def generate_run_report(run, baseurl, only_html_body = False,
                         num_comparison_runs = 10, result = None,
                         compare_to = None, baseline = None,
-                        comparison_window = None, aggregation_fn = min):
+                        comparison_window = None, aggregation_fn = min,
+                        confidence_lv = .05):
     """
     generate_run_report(...) -> (str: subject, str: text_report,
                                  str: html_report)
@@ -67,7 +68,7 @@ def generate_run_report(run, baseurl, on
     if baseline:
         runs_to_load.add(baseline.id)
     sri = lnt.server.reporting.analysis.RunInfo(
-        ts, runs_to_load, aggregation_fn)
+        ts, runs_to_load, aggregation_fn, confidence_lv)
 
     # Get the test names, primary fields and total test counts.
     test_names = ts.query(ts.Test.name, ts.Test.id).\

Modified: lnt/trunk/lnt/server/ui/templates/v4_run.html
URL: http://llvm.org/viewvc/llvm-project/lnt/trunk/lnt/server/ui/templates/v4_run.html?rev=208450&r1=208449&r2=208450&view=diff
==============================================================================
--- lnt/trunk/lnt/server/ui/templates/v4_run.html (original)
+++ lnt/trunk/lnt/server/ui/templates/v4_run.html Fri May  9 17:09:29 2014
@@ -76,6 +76,7 @@
   {% if options.show_sample_counts %}
     <td>{{cr.get_samples()|length}}</td>
   {% endif %}
+
 {% endmacro %}
 
 {% block sidebar %}
@@ -243,6 +244,17 @@
         </select>
       </td>
     </tr>
+    <tr> {# Level selector; submitted as the MW_confidence_lv query argument. #}
+      <td>Mann-Whitney test confidence level:</td>
+      <td>
+        <select name="MW_confidence_lv">
+          <option value="0.05" {{ "selected" if options.MW_confidence_lv == 0.05 else ""}}>
+            5%</option>
+          <option value="0.01" {{ "selected" if options.MW_confidence_lv == 0.01 else ""}}>
+            1%</option>
+        </select>
+      </td>
+    </tr>
     <tr>
       <td colspan="2">
         {% if compare_to %}

Modified: lnt/trunk/lnt/server/ui/views.py
URL: http://llvm.org/viewvc/llvm-project/lnt/trunk/lnt/server/ui/views.py?rev=208450&r1=208449&r2=208450&view=diff
==============================================================================
--- lnt/trunk/lnt/server/ui/views.py (original)
+++ lnt/trunk/lnt/server/ui/views.py Fri May  9 17:09:29 2014
@@ -191,6 +191,13 @@ class V4RequestInfo(object):
                                 'median' : lnt.util.stats.median }.get(
             aggregation_fn_name, min)
 
+        # Get the MW confidence level.
+        try:
+            confidence_lv = float(request.args.get('MW_confidence_lv'))
+        except (TypeError, ValueError):
+            confidence_lv = .05
+        self.confidence_lv = confidence_lv
+
         # Find the neighboring runs, by order.
         prev_runs = list(ts.get_previous_runs_on_machine(run, N = 3))
         next_runs = list(ts.get_next_runs_on_machine(run, N = 3))
@@ -246,7 +253,7 @@ class V4RequestInfo(object):
             only_html_body=only_html_body, result=None,
             compare_to=self.compare_to, baseline=self.baseline,
             comparison_window=self.comparison_window,
-            aggregation_fn=self.aggregation_fn)
+            aggregation_fn=self.aggregation_fn, confidence_lv=confidence_lv)
         _, self.text_report, self.html_report, self.sri = reports
 
 @v4_route("/<int:id>/report")
@@ -294,7 +301,12 @@ Unable to find a v0.4 run for this ID. P
 
 @v4_route("/<int:id>")
 def v4_run(id):
-    info = V4RequestInfo(id)
+    try:
+        info = V4RequestInfo(id)
+    except ImportError:
+        return render_template("error.html",
+            message="SciPy is not installed on server and sample size is too large.")
+
     ts = info.ts
     run = info.run
 
@@ -314,6 +326,7 @@ def v4_run(id):
     options['num_comparison_runs'] = info.num_comparison_runs
     options['test_filter'] = test_filter_str = request.args.get(
         'test_filter', '')
+    options['MW_confidence_lv'] = info.confidence_lv
     if test_filter_str:
         test_filter_re = re.compile(test_filter_str)
     else:

Modified: lnt/trunk/lnt/util/stats.py
URL: http://llvm.org/viewvc/llvm-project/lnt/trunk/lnt/util/stats.py?rev=208450&r1=208449&r2=208450&view=diff
==============================================================================
--- lnt/trunk/lnt/util/stats.py (original)
+++ lnt/trunk/lnt/util/stats.py Fri May  9 17:09:29 2014
@@ -19,3 +19,100 @@ def standard_deviation(l):
     means_sqrd = sum([(v - m)**2 for v in l]) / len(l)
     rms = math.sqrt(means_sqrd)
     return rms
+
+def mannwhitneyu(a, b, sigLevel = .05):
+    """
+    Return True if samples a and b look the same at significance sigLevel.
+    Raises ImportError if either sample exceeds 20 values and SciPy is absent.
+    """
+    if len(a) <= 20 and len(b) <= 20:
+        return mannwhitneyu_small(a, b, sigLevel)
+    # Imported lazily so SciPy is only required for large samples.
+    from scipy.stats import mannwhitneyu as mannwhitneyu_large
+    try:
+        # SciPy returns (U, p-value); only the p-value is compared.
+        return mannwhitneyu_large(a, b, False)[1] >= sigLevel
+    except ValueError:  # presumably SciPy rejecting degenerate input - confirm
+        return True
+
+def mannwhitneyu_small(a, b, sigLevel):
+    """
+    Return True if samples a and b are the same at significance sigLevel.
+    Each sample must hold at most 20 values; sigLevel must be a SIGN_TABLES
+    key, otherwise ValueError is raised.
+    """
+    assert len(a) <= 20, "Sample size must be less than 20."
+    assert len(b) <= 20, "Sample size must be less than 20."
+
+    if sigLevel not in SIGN_TABLES:
+        raise ValueError("Do not have according significance table.")
+
+    # An empty sample gives no evidence of change (and would index the
+    # table with -1), so report "same".
+    if len(a) == 0 or len(b) == 0:
+        return True
+
+    # Ux counts pairs where a's value is below b's; ties count one half.
+    Ux = 0.
+    for xe in a:
+        for ye in b:
+            if xe < ye:
+                Ux += 1
+            elif xe == ye:
+                Ux += .5
+    Uy = len(a) * len(b) - Ux
+
+    # The tables list the largest min(Ux, Uy) at which the null hypothesis
+    # is rejected, so the samples are the same only above that value.
+    critical = SIGN_TABLES[sigLevel][len(a) - 1][len(b) - 1]
+    return min(Ux, Uy) > critical
+
+# Table for .05 significance level, indexed as [len(a) - 1][len(b) - 1].
+TABLE_0_05 = [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2],
+        [0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8],
+        [0, 0, 0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13],
+        [0, 0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20],
+        [0, 0, 1, 2, 3, 5, 6, 8, 10, 11, 13, 14, 16, 17, 19, 21, 22, 24, 25, 27],
+        [0, 0, 1, 3, 5, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34],
+        [0, 0, 2, 4, 6, 8, 10, 13, 15, 17, 19, 22, 24, 26, 29, 31, 34, 36, 38, 41],
+        [0, 0, 2, 4, 7, 10, 12, 15, 17, 20, 23, 26, 28, 31, 34, 37, 39, 42, 45, 48],
+        [0, 0, 3, 5, 8, 11, 14, 17, 20, 23, 26, 29, 33, 36, 39, 42, 45, 48, 52, 55],
+        [0, 0, 3, 6, 9, 13, 16, 19, 23, 26, 30, 33, 37, 40, 44, 47, 51, 55, 58, 62],
+        [0, 1, 4, 7, 11, 14, 18, 22, 26, 29, 33, 37, 41, 45, 49, 53, 57, 61, 65, 69],
+        [0, 1, 4, 8, 12, 16, 20, 24, 28, 33, 37, 41, 45, 50, 54, 59, 63, 67, 72, 76],
+        [0, 1, 5, 9, 13, 17, 22, 26, 31, 36, 40, 45, 50, 55, 59, 64, 67, 74, 78, 83],  # NOTE(review): 67 (17th entry) is out of step with 64 and 74 - verify
+        [0, 1, 5, 10, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 70, 75, 80, 85, 90],
+        [0, 1, 6, 11, 15, 21, 26, 31, 37, 42, 47, 53, 59, 64, 70, 75, 81, 86, 92, 98],
+        [0, 2, 6, 11, 17, 22, 28, 34, 39, 45, 51, 57, 63, 67, 75, 81, 87, 93, 99, 105],  # NOTE(review): 67 (14th entry) is out of step with 63 and 75 - verify
+        [0, 2, 7, 12, 18, 24, 30, 36, 42, 48, 55, 61, 67, 74, 80, 86, 93, 99, 106, 112],
+        [0, 2, 7, 13, 19, 25, 32, 38, 45, 52, 58, 65, 72, 78, 85, 92, 99, 106, 113, 119],
+        [0, 2, 8, 13, 20, 27, 34, 41, 48, 55, 62, 69, 76, 83, 90, 98, 105, 112, 119, 127]
+        ]
+
+# Table for .01 significance level, indexed as [len(a) - 1][len(b) - 1].
+TABLE_0_01 = [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3],
+        [0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8],
+        [0, 0, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13],
+        [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15, 16, 17, 18],
+        [0, 0, 0, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 16, 18, 19, 21, 22, 24],
+        [0, 0, 0, 1, 2, 4, 6, 7, 9, 11, 13, 15, 17, 18, 20, 22, 24, 26, 28, 30],
+        [0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 16, 18, 20, 22, 24, 27, 29, 31, 33, 36],
+        [0, 0, 0, 2, 4, 6, 9, 11, 13, 16, 18, 21, 24, 26, 29, 31, 34, 37, 39, 42],
+        [0, 0, 0, 2, 5, 7, 10, 13, 16, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 46],  # NOTE(review): final 46 is out of step with the +3 trend - verify
+        [0, 0, 1, 3, 6, 9, 12, 15, 18, 21, 24, 27, 31, 34, 37, 41, 44, 47, 51, 54],
+        [0, 0, 1, 3, 7, 10, 13, 17, 20, 24, 27, 31, 34, 38, 42, 45, 49, 53, 56, 60],
+        [0, 0, 1, 4, 7, 11, 15, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 63, 67],
+        [0, 0, 2, 5, 8, 12, 16, 20, 24, 29, 33, 37, 42, 46, 51, 55, 60, 64, 69, 73],
+        [0, 0, 2, 5, 9, 13, 18, 22, 27, 31, 36, 41, 45, 50, 55, 60, 65, 70, 74, 79],
+        [0, 0, 2, 6, 10, 15, 19, 24, 29, 34, 39, 44, 49, 54, 60, 65, 70, 75, 81, 86],
+        [0, 0, 2, 6, 11, 16, 21, 26, 31, 37, 42, 47, 53, 58, 64, 70, 75, 81, 87, 92],
+        [0, 0, 3, 7, 12, 17, 22, 28, 33, 39, 45, 51, 56, 63, 69, 74, 81, 87, 93, 99],
+        [0, 0, 3, 8, 13, 18, 24, 30, 36, 42, 46, 54, 60, 67, 73, 79, 86, 92, 99, 105]  # NOTE(review): 46 (11th entry) is out of step with 42 and 54 - verify
+        ]
+
+SIGN_TABLES = {.05: TABLE_0_05, .01: TABLE_0_01}  # keyed by significance level; float keys must match exactly