[llvm] update_test_checks: keep meta variables stable by default (PR #76748)

Jannik Silvanus via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 6 03:25:54 PST 2024


Nicolai Hähnle <nicolai.haehnle at amd.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/76748 at github.com>


================
@@ -1187,20 +1233,317 @@ def may_clash_with_default_check_prefix_name(check_prefix, var):
     )
 
 
+def find_diff_matching(lhs: List[str], rhs: List[str]) -> List[Tuple[int, int]]:
+    """
+    Find a large ordered matching between strings in lhs and rhs.
+
+    Think of this as finding the *unchanged* lines in a diff, where the entries
+    of lhs and rhs are lines of the files being diffed.
+
+    Returns a list of matched (lhs_idx, rhs_idx) pairs.
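+
+    For illustration (hypothetical inputs, not from a real test):
+        find_diff_matching(["a", "b", "c"], ["a", "x", "c"])
+    returns [(0, 0), (2, 2)]: "a" and "c" are matched, "b" and "x" are not.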
+    """
+
+    # Collect matches in reverse order.
+    matches = []
+
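+    # Match lines within lhs[lhs_start:lhs_end] and rhs[rhs_start:rhs_end],
+    # appending matched (lhs_idx, rhs_idx) pairs to `matches` in reverse order.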
+    def recurse(lhs_start, lhs_end, rhs_start, rhs_end):
+        if lhs_start == lhs_end or rhs_start == rhs_end:
+            return
+
+        # First, collect a set of candidate matching edges. We limit this to a
+        # constant multiple of the input size to avoid quadratic runtime.
+        patterns = collections.defaultdict(lambda: ([], []))
+
+        for idx in range(lhs_start, lhs_end):
+            patterns[lhs[idx]][0].append(idx)
+        for idx in range(rhs_start, rhs_end):
+            patterns[rhs[idx]][1].append(idx)
+
+        multiple_patterns = []
+
+        candidates = []
+        for pattern in patterns.values():
+            if not pattern[0] or not pattern[1]:
+                continue
+
+            if len(pattern[0]) == len(pattern[1]) == 1:
+                candidates.append((pattern[0][0], pattern[1][0]))
+            else:
+                multiple_patterns.append(pattern)
+
+        multiple_patterns.sort(key=lambda pattern: len(pattern[0]) * len(pattern[1]))
+
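+        # Taking the cheapest patterns first packs candidate edges from as
+        # many distinct patterns as possible under the size budget below.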
+        for pattern in multiple_patterns:
+            if len(candidates) + len(pattern[0]) * len(pattern[1]) > 2 * (
+                len(lhs) + len(rhs)
+            ):
+                break
+            for lhs_idx in pattern[0]:
+                for rhs_idx in pattern[1]:
+                    candidates.append((lhs_idx, rhs_idx))
+
+        if not candidates:
+            # The LHS and RHS either share nothing in common, or the lines
+            # they do share repeat too often to fit within the candidate
+            # budget. In that case, give up and don't match anything.
+            return
+
+        # Compute a maximal crossing-free matching via an algorithm that is
+        # inspired by a mixture of dynamic programming and line-sweeping in
+        # discrete geometry.
+        #
+        # I would be surprised if this algorithm didn't exist somewhere in the
+        # literature, but I found it without consciously recalling any
+        # references, so you'll have to make do with the explanation below.
+        # Sorry.
+        #
+        # The underlying graph is bipartite:
+        #  - nodes on the LHS represent lines in the original check
+        #  - nodes on the RHS represent lines in the new (updated) check
+        #
+        # Nodes are implicitly sorted by the corresponding line number.
+        # Edges (candidates) are sorted by the line number on the LHS.
+        #
+        # Here's the geometric intuition for the algorithm.
+        #
+        #  * Plot the edges as points in the plane, with the original line
+        #    number on the X axis and the updated line number on the Y axis.
+        #  * The goal is to find a longest "chain" of points where each point
+        #    is strictly above and to the right of the previous point.
+        #  * The algorithm proceeds by sweeping a vertical line from left to
+        #    right.
+        #  * The algorithm maintains a table where `table[N]` answers the
+        #    question "What is currently the 'best' way to build a chain of N+1
+        #    points to the left of the vertical line". Here, 'best' means
+        #    that the last point of the chain is as low as possible (minimal
+        #    Y coordinate).
+        #   * `table[N]` is `(y, point_idx)` where `point_idx` is the index of
+        #     the last point in the chain and `y` is its Y coordinate
+        #   * A key invariant is that the Y values in the table are
+        #     strictly increasing
+        #  * Thanks to these properties, the table can be used to answer the
+        #    question "What is the longest chain that can be built to the left
+        #    of the vertical line using only points below a certain Y value",
+        #    using a binary search over the table.
+        #  * The algorithm also builds a backlink structure in which every point
+        #    links back to the previous point on a best (longest) chain ending
+        #    at that point
+        #
+        # The core loop of the algorithm sweeps the line and updates the table
+        # and backlink structure for every point that we cross during the sweep.
+        # Therefore, the algorithm is trivially O(M log M) in the number of
+        # candidate edges. Since the number of candidates is bounded by a
+        # constant multiple of the input size, it is log-linear in the
+        # problem size.
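+        #
+        # A small worked example (hypothetical, for illustration): for the
+        # points (0,2), (1,0), (2,1), (3,3), the table of (y, point_idx)
+        # entries evolves as
+        #   after (0,2): [(2, 0)]
+        #   after (1,0): [(0, 1)]            (a lower chain end replaces (2, 0))
+        #   after (2,1): [(0, 1), (1, 2)]
+        #   after (3,3): [(0, 1), (1, 2), (3, 3)]
+        # and walking the backlinks from the last entry recovers the longest
+        # chain (1,0) -> (2,1) -> (3,3).
+        #
+        # Note that candidates sharing the same LHS index are visited with the
+        # larger RHS index first, so no two of them can end up in one chain.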
+        candidates.sort(key=lambda candidate: (candidate[0], -candidate[1]))
+
+        backlinks = []
+        table = []
+        for _, rhs_idx in candidates:
+            candidate_idx = len(backlinks)
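+            # Find the leftmost table entry whose chain-end Y value is >= rhs_idx.
+            # (bisect's key= parameter requires Python 3.10 or newer.)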
+            ti = bisect.bisect_left(table, rhs_idx, key=lambda entry: entry[0])
+            if ti < len(table):
+                table[ti] = (rhs_idx, candidate_idx)
+            else:
+                table.append((rhs_idx, candidate_idx))
+            if ti > 0:
+                backlinks.append(table[ti - 1][1])
+            else:
+                backlinks.append(None)
+
+        # Commit to the matches by walking the backlinks. Recursively
+        # attempt to fill in more matches in between.
+        previous = (lhs_end, rhs_end)
+        match_idx = table[-1][1]
+        while match_idx is not None:
+            current = candidates[match_idx]
+            recurse(current[0] + 1, previous[0], current[1] + 1, previous[1])
+            matches.append(current)
+            previous = current
+            match_idx = backlinks[match_idx]
+        recurse(lhs_start, previous[0], rhs_start, previous[1])
+
+    recurse(0, len(lhs), 0, len(rhs))
+
+    matches.reverse()
+    return matches
+
+
+VARIABLE_TAG = "[[@@]]"
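+# Matches a FileCheck variable use or definition, e.g. "[[TMP0]]" or
+# "[[TMP0:%.*]]", capturing the variable name ("TMP0").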
+METAVAR_RE = re.compile(r"\[\[([A-Z0-9_]+)(?::[^]]+)?\]\]")
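+# Matches the (possibly empty) run of digits at the end of a variable name,
+# e.g. the "12" in "TMP12".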
+NUMERIC_SUFFIX_RE = re.compile(r"[0-9]*$")
+
+
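+# Describes a single FileCheck variable occurrence in a check line: the kind
+# of value it stands for, its variable name, and (presumably) the name's
+# prefix without any trailing numeric suffix.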
+class CheckValueInfo:
+    def __init__(
+        self,
+        nameless_value: NamelessValue,
+        var: str,
+        prefix: str,
+    ):
+        self.nameless_value = nameless_value
+        self.var = var
+        self.prefix = prefix
+
+
+# Represent a check line in a way that allows us to compare check lines while
+# ignoring some or all of the FileCheck variable names.
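+#
+# For example (illustrative), a check line matching "store i32 [[A]], ptr
+# [[P]]" is stored with line = "store i32 [[@@]], ptr [[@@]]" and values
+# describing the variables A and P, in order of occurrence.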
+class CheckLineInfo:
+    def __init__(self, line, values):
+        # Line with all FileCheck variable name occurrences replaced by VARIABLE_TAG
+        self.line: str = line
+
+        # Information on each FileCheck variable name occurrence in the line
+        self.values: List[CheckValueInfo] = values
+
+    def __repr__(self):
+        return f"CheckLineInfo(line={self.line}, self.values={self.values})"
+
+
+def remap_metavar_names(
+    old_line_infos: List[CheckLineInfo],
+    new_line_infos: List[CheckLineInfo],
+    committed_names: Set[str],
+) -> Mapping[str, str]:
+    """
+    Map all FileCheck variable names that appear in new_line_infos to new
+    FileCheck variable names in an attempt to reduce the diff from old_line_infos
+    to new_line_infos.
+
+    This is done by:
+    * Matching old check lines and new check lines using a diffing algorithm
+      applied after replacing names with wildcards.
+    * Committing to variable names such that the matched lines become equal
+      (without wildcards), if possible.
+    * Recursing into the still-unmatched regions in between, to handle cases
+      where many lines are equal after wildcard replacement.
+    """
+    # Initialize uncommitted identity mappings
+    new_mapping = {}
+    for line in new_line_infos:
+        for value in line.values:
+            new_mapping[value.var] = value.var
+
+    # Recursively commit to the identity mapping or find a better one
+    def recurse(old_begin, old_end, new_begin, new_end):
+        if old_begin == old_end or new_begin == new_end:
+            return
+
+        # Find a matching of lines where uncommitted names are replaced
+        # with a placeholder.
+        def diffify_line(line, mapper):
+            values = []
+            for value in line.values:
+                mapped = mapper(value.var)
+                values.append(mapped if mapped in committed_names else "?")
+            return line.line.strip() + " @@@ " + " @ ".join(values)
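+        # For example (illustrative), a line whose values are A (already
+        # committed) and TMP0 (not yet committed) diffifies to
+        #   "store i32 [[@@]], ptr [[@@]] @@@ A @ ?"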
+
+        lhs_lines = [
+            diffify_line(line, lambda x: x)
+            for line in old_line_infos[old_begin:old_end]
+        ]
+        rhs_lines = [
+            diffify_line(line, lambda x: new_mapping[x])
+            for line in new_line_infos[new_begin:new_end]
+        ]
+
+        candidate_matches = find_diff_matching(lhs_lines, rhs_lines)
+
+        # Apply commits greedily on a match-by-match basis
+        matches = [(-1, -1)]
+        committed_anything = False
+        for lhs_idx, rhs_idx in candidate_matches:
+            lhs_line = old_line_infos[lhs_idx]
+            rhs_line = new_line_infos[rhs_idx]
+
+            local_commits = {}
+
+            for lhs_value, rhs_value in zip(lhs_line.values, rhs_line.values):
+                if new_mapping[rhs_value.var] in committed_names:
+                    # The new value has already been committed. If it was mapped
+                    # to the same name as the original value, we can consider
+                    # # committing other values from this line. Otherwise, we
----------------
jasilvanus wrote:

Nit: double `#`

https://github.com/llvm/llvm-project/pull/76748


More information about the llvm-commits mailing list