[llvm] [utils][filecheck-lint]: speedup filecheck_lint (PR #94191)

Wed Jun 5 01:56:04 PDT 2024

https://github.com/klensy updated https://github.com/llvm/llvm-project/pull/94191

>From faeba1e67bfb7822fd30f0cab6e129d28c985485 Mon Sep 17 00:00:00 2001
From: klensy <nightouser at gmail.com>
Date: Mon, 3 Jun 2024 11:08:52 +0300
Subject: [PATCH 1/2] [utils][filecheck-lint]: speedup filecheck_lint by
 caching edit_distance calculations

---
 llvm/utils/filecheck_lint/filecheck_lint.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/filecheck_lint/filecheck_lint.py b/llvm/utils/filecheck_lint/filecheck_lint.py
index 837846db83321..d238277c9e01b 100755
--- a/llvm/utils/filecheck_lint/filecheck_lint.py
+++ b/llvm/utils/filecheck_lint/filecheck_lint.py
@@ -228,7 +228,8 @@ def find_best_match(typo):
         )
 
     potential_directives = find_potential_directives(content)
-
+    # cache score and best_match to skip recalculating
+    checked_potential_directives = dict()
     for filerange, potential_directive in potential_directives:
         # TODO(bchetioui): match count directives more finely. We skip directives
         # starting with 'CHECK-COUNT-' for the moment as they require more complex
@@ -244,7 +245,11 @@ def find_best_match(typo):
         if len(potential_directive) > max(map(len, all_directives)) + threshold:
             continue
 
-        score, best_match = find_best_match(potential_directive)
+        if potential_directive not in checked_potential_directives:
+            score, best_match = find_best_match(potential_directive)
+            checked_potential_directives[potential_directive] = (score, best_match)
+        else:
+            score, best_match = checked_potential_directives[potential_directive]
         if score == 0:  # This is an actual directive, ignore.
             continue
         elif score <= threshold and best_match not in _ignore:

>From fef05e955f7af8f443ca56aaa2dfa5602fe7bfd0 Mon Sep 17 00:00:00 2001
From: klensy <nightouser at gmail.com>
Date: Mon, 3 Jun 2024 17:44:50 +0300
Subject: [PATCH 2/2] build FileRange string lazily

---
 llvm/utils/filecheck_lint/filecheck_lint.py   | 54 ++++++++++++-------
 .../filecheck_lint/filecheck_lint_test.py     | 18 ++-----
 2 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/llvm/utils/filecheck_lint/filecheck_lint.py b/llvm/utils/filecheck_lint/filecheck_lint.py
index d238277c9e01b..12f8299b83610 100755
--- a/llvm/utils/filecheck_lint/filecheck_lint.py
+++ b/llvm/utils/filecheck_lint/filecheck_lint.py
@@ -81,29 +81,40 @@ class FileRange:
     """Stores the coordinates of a span on a single line within a file.
 
     Attributes:
-      line:         the line number
-      start_column: the (inclusive) column where the span starts
-      end_column:   the (inclusive) column where the span ends
+      content:    the full text that contains the span
+      start_byte: the (inclusive) offset in `content` where the span starts
+      end_byte:   the (exclusive) offset in `content` where the span ends
     """
 
-    line: int
-    start_column: int
-    end_column: int
+    content: str
+    start_byte: int
+    end_byte: int
 
     def __init__(
         self, content: str, start_byte: int, end_byte: int
     ):  # pylint: disable=g-doc-args
-        """Derives a span's coordinates based on a string and start/end bytes.
+        """
+        Stores the coordinates of a span based on a string and start/end bytes.
 
         `start_byte` and `end_byte` are assumed to be on the same line.
         """
-        content_before_span = content[:start_byte]
-        self.line = content_before_span.count("\n") + 1
-        self.start_column = start_byte - content_before_span.rfind("\n")
-        self.end_column = self.start_column + (end_byte - start_byte - 1)
+        self.content = content
+        self.start_byte = start_byte
+        self.end_byte = end_byte
 
-    def __str__(self) -> str:
-        return f"{self.line}:{self.start_column}-{self.end_column}"
+    def as_str(self):
+        """
+        Derives the "line:start_column-end_column" string from content and offsets.
+
+        start_column: the (inclusive) column where the span starts
+        end_column:   the (inclusive) column where the span ends
+        """
+        content_before_span = self.content[: self.start_byte]
+        line = content_before_span.count("\n") + 1
+        start_column = self.start_byte - content_before_span.rfind("\n")
+        end_column = start_column + (self.end_byte - self.start_byte - 1)
+
+        return f"{line}:{start_column}-{end_column}"
 
 
 class Diagnostic:
@@ -134,7 +145,7 @@ def __init__(
         self.fix = fix
 
     def __str__(self) -> str:
-        return f"{self.filepath}:" + str(self.filerange) + f": {self.summary()}"
+        return f"{self.filepath}:" + self.filerange.as_str() + f": {self.summary()}"
 
     def summary(self) -> str:
         return (
@@ -228,8 +239,8 @@ def find_best_match(typo):
         )
 
     potential_directives = find_potential_directives(content)
-    # cache score and best_match to skip recalculating
-    checked_potential_directives = dict()
+    # Cache score and best_match to skip recalculating.
+    score_and_best_match_for_potential_directive = dict()
     for filerange, potential_directive in potential_directives:
         # TODO(bchetioui): match count directives more finely. We skip directives
         # starting with 'CHECK-COUNT-' for the moment as they require more complex
@@ -245,11 +256,16 @@ def find_best_match(typo):
         if len(potential_directive) > max(map(len, all_directives)) + threshold:
             continue
 
-        if potential_directive not in checked_potential_directives:
+        if potential_directive not in score_and_best_match_for_potential_directive:
             score, best_match = find_best_match(potential_directive)
-            checked_potential_directives[potential_directive] = (score, best_match)
+            score_and_best_match_for_potential_directive[potential_directive] = (
+                score,
+                best_match,
+            )
         else:
-            score, best_match = checked_potential_directives[potential_directive]
+            score, best_match = score_and_best_match_for_potential_directive[
+                potential_directive
+            ]
         if score == 0:  # This is an actual directive, ignore.
             continue
         elif score <= threshold and best_match not in _ignore:
diff --git a/llvm/utils/filecheck_lint/filecheck_lint_test.py b/llvm/utils/filecheck_lint/filecheck_lint_test.py
index 16f381d5b0455..6edcf0abd25a9 100644
--- a/llvm/utils/filecheck_lint/filecheck_lint_test.py
+++ b/llvm/utils/filecheck_lint/filecheck_lint_test.py
@@ -49,27 +49,15 @@ def test_find_potential_directives_comment_prefix(self):
         results = list(fcl.find_potential_directives(content))
         assert len(results) == 3
         pos, match = results[0]
-        assert (
-            pos.line == 1
-            and pos.start_column == len("junk; ") + 1
-            and pos.end_column == len(lines[0]) - 1
-        )
+        assert pos.as_str() == "1:7-11"
         assert match == "CHCK1"
 
         pos, match = results[1]
-        assert (
-            pos.line == 2
-            and pos.start_column == len("junk// ") + 1
-            and pos.end_column == len(lines[1]) - 1
-        )
+        assert pos.as_str() == "2:8-12"
         assert match == "CHCK2"
 
         pos, match = results[2]
-        assert (
-            pos.line == 3
-            and pos.start_column == 1
-            and pos.end_column == len(lines[2]) - 1
-        )
+        assert pos.as_str() == "3:1-10"
         assert match == "SOME CHCK3"
 
     def test_levenshtein(self):