[llvm] update_test_checks: keep meta variables stable by default (PR #76748)

Wed Mar 6 11:43:25 PST 2024

================
@@ -1187,20 +1233,317 @@ def may_clash_with_default_check_prefix_name(check_prefix, var):
     )
 
 
+def find_diff_matching(lhs: List[str], rhs: List[str]) -> List[int]:
+    """
+    Find a large ordered matching between strings in lhs and rhs.
+
+    Think of this as finding the *unchanged* lines in a diff, where the entries
+    of lhs and rhs are lines of the files being diffed.
+
+    Returns a list of matched (lhs_idx, rhs_idx) pairs.
+    """
+
+    # Collect matches in reverse order.
+    matches = []
+
+    def recurse(lhs_start, lhs_end, rhs_start, rhs_end):
+        if lhs_start == lhs_end or rhs_start == rhs_end:
+            return
+
+        # First, collect a set of candidate matching edges. We limit this to a
+        # constant multiple of the input size to avoid quadratic runtime.
+        patterns = collections.defaultdict(lambda: ([], []))
+
+        for idx in range(lhs_start, lhs_end):
+            patterns[lhs[idx]][0].append(idx)
+        for idx in range(rhs_start, rhs_end):
+            patterns[rhs[idx]][1].append(idx)
+
+        multiple_patterns = []
+
+        candidates = []
+        for pattern in patterns.values():
+            if not pattern[0] or not pattern[1]:
+                continue
+
+            if len(pattern[0]) == len(pattern[1]) == 1:
+                candidates.append((pattern[0][0], pattern[1][0]))
+            else:
+                multiple_patterns.append(pattern)
+
+        multiple_patterns.sort(key=lambda pattern: len(pattern[0]) * len(pattern[1]))
+
+        for pattern in multiple_patterns:
+            if len(candidates) + len(pattern[0]) * len(pattern[1]) > 2 * (len(lhs) + len(rhs)):
+                break
+            for lhs_idx in pattern[0]:
+                for rhs_idx in pattern[1]:
+                    candidates.append((lhs_idx, rhs_idx))
+
+        if not candidates:
+            # The LHS and RHS either share nothing in common, or lines are just too
+            # identical. In that case, let's give up and not match anything.
+            return
+
+        # Compute a maximal crossing-free matching via an algorithm that is
+        # inspired by a mixture of dynamic programming and line-sweeping in
+        # discrete geometry.
+        #
+        # I would be surprised if this algorithm didn't exist somewhere in the
+        # literature, but I found it without consciously recalling any
+        # references, so you'll have to make do with the explanation below.
+        # Sorry.
+        #
+        # The underlying graph is bipartite:
+        #  - nodes on the LHS represent lines in the original check
+        #  - nodes on the RHS represent lines in the new (updated) check
+        #
+        # Nodes are implicitly sorted by the corresponding line number.
+        # Edges (unique_matches) are sorted by the line number on the LHS.
+        #
+        # Here's the geometric intuition for the algorithm.
+        #
+        #  * Plot the edges as points in the plane, with the original line
+        #    number on the X axis and the updated line number on the Y axis.
+        #  * The goal is to find a longest "chain" of points where each point
+        #    is strictly above and to the right of the previous point.
+        #  * The algorithm proceeds by sweeping a vertical line from left to
+        #    right.
+        #  * The algorithm maintains a table where `table[N]` answers the
+        #    question "What is currently the 'best' way to build a chain of N+1
+        #    points to the left of the vertical line". Here, 'best' means
+        #    that the last point of the chain is as low as possible (minimal
+        #    Y coordinate).
+        #   * `table[N]` is `(y, point_idx)` where `point_idx` is the index of
+        #     the last point in the chain and `y` is its Y coordinate
+        #   * A key invariant is that the Y values in the table are
+        #     monotonically increasing
+        #  * Thanks to these properties, the table can be used to answer the
+        #    question "What is the longest chain that can be built to the left
+        #    of the vertical line using only points below a certain Y value",
+        #    using a binary search over the table.
+        #  * The algorithm also builds a backlink structure in which every point
+        #    links back to the previous point on a best (longest) chain ending
+        #    at that point
+        #
+        # The core loop of the algorithm sweeps the line and updates the table
+        # and backlink structure for every point that we cross during the sweep.
+        # Therefore, the algorithm is trivially O(M log M) in the number of
+        # points. Since we only consider lines that are unique, it is log-linear
+        # in the problem size.
+        candidates.sort(key=lambda candidate: (candidate[0], -candidate[1]))
+
+        backlinks = []
+        table = []
+        for _, rhs_idx in candidates:
+            candidate_idx = len(backlinks)
+            ti = bisect.bisect_left(table, rhs_idx, key=lambda entry: entry[0])
+            if ti < len(table):
+                table[ti] = (rhs_idx, candidate_idx)
+            else:
+                table.append((rhs_idx, candidate_idx))
----------------
nhaehnle wrote:

There is actually a subtlety here that I'm recording in a comment, and I ended up not quite restating what you wrote here. I hope that's okay :)

https://github.com/llvm/llvm-project/pull/76748