[llvm] f702c75 - [utils][filecheck-lint] Add filecheck-lint, a linter for FileCheck directives.

Thu Jan 12 05:20:19 PST 2023

Author: Benjamin Chetioui
Date: 2023-01-12T13:20:02Z
New Revision: f702c75995db9ea3978d39e3f3885b2113356f68

URL: https://github.com/llvm/llvm-project/commit/f702c75995db9ea3978d39e3f3885b2113356f68
DIFF: https://github.com/llvm/llvm-project/commit/f702c75995db9ea3978d39e3f3885b2113356f68.diff

LOG: [utils][filecheck-lint] Add filecheck-lint, a linter for FileCheck directives.

FileCheck is a versatile tool for testing compiler output regardless of syntax.
FileCheck achieves this versatility by processing only input lines in which it
detects a valid CHECK directive, and ignoring everything else.

This comes at a price: if a directive is not typed properly, it is not
processed by FileCheck, and the test code it precedes is effectively disabled.
This results in the illusion that the code is tested, while the test may have
actually never run.

This scenario is not hypothetical, see the fixes introduced in, e.g.
https://github.com/tensorflow/tensorflow/commit/48cacf049f3d6ed3f289ccc48ec50491b6d8d9a8,
https://reviews.llvm.org/D139698, https://reviews.llvm.org/D139636,
https://github.com/iree-org/iree/pull/11693.

The findings corrected in the above changes originate in filecheck-lint.
In a given test file, filecheck-lint uses the edit distance between anything
that looks like a FileCheck directive and the set of locally valid directives
to detect likely misspelled directives.

The tool is not yet feature complete---e.g. it does not parse custom comment
prefixes to exclude them from reporting---but it already yields useful results,
as demonstrated above.

Differential Revision: https://reviews.llvm.org/D141508

Added: 
    llvm/utils/filecheck_lint/filecheck_lint.py
    llvm/utils/filecheck_lint/filecheck_lint_test.py

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/utils/filecheck_lint/filecheck_lint.py b/llvm/utils/filecheck_lint/filecheck_lint.py
new file mode 100644
index 0000000000000..cae4b3396b3d2

--- /dev/null
+++ b/llvm/utils/filecheck_lint/filecheck_lint.py
@@ -0,0 +1,251 @@
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+"""A linter that detects potential typos in FileCheck directive names.
+
+Consider a broken test foo.cpp:
+
+// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
+// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
+auto x = 42;
+// NEWW: auto is a c++11 extension
+// ODL-NOT: auto is a c++11 extension
+
+We first detect the locally valid FileCheck directive prefixes by parsing the
+--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
+{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
+
+Then we look for lines that look like directives. These are of the form 'FOO:',
+usually at the beginning of a line or a comment. If any of these are a
+"near-miss" for a directive name, then we suspect this is a typo and report it.
+
+Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
+"""
+
+import itertools
+import logging
+import pathlib
+import re
+import sys
+from typing import Generator, Sequence, Tuple
+
+_distance_threshold = 3
+_prefixes = {'CHECK'}
+_suffixes = {'-DAG', '-COUNT', '-EMPTY', '-LABEL', '-NEXT', '-NOT', '-SAME'}
+# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
+# if encountered and to generate noise as a result. We filter them out also to
+# avoid this.
+_lit_directives = {
+    'RUN',
+    'REQUIRES',
+    'UNSUPPORTED',
+    'XFAIL',
+    'DEFINE',
+    'REDEFINE',
+}
+# 'COM' and 'RUN' are default comment prefixes for FileCheck.
+_comment_prefixes = {'COM', 'RUN'}
+_ignore = _lit_directives.union(_comment_prefixes).union({'NOTE', 'TODO'})
+
+
+def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
+  """Computes the edit distance between two strings.
+
+  Additions, deletions, and substitutions all count as a single operation.
+  """
+  if not s1:
+    return len(s2)
+  if not s2:
+    return len(s1)
+
+  distances = range(len(s2) + 1)
+  for i in range(len(s1)):
+    new_distances = [i + 1]
+    for j in range(len(s2)):
+      cost = min(distances[j] + int(s1[i] != s2[j]), distances[j + 1] + 1,
+                 new_distances[-1] + 1)
+      new_distances.append(cost)
+    distances = new_distances
+  return distances[-1]
+
+
+class FileRange:
+  """Stores the coordinates of a span on a single line within a file.
+
+  Attributes:
+    line:         the line number
+    start_column: the (inclusive) column where the span starts
+    end_column:   the (inclusive) column where the span ends
+  """
+  line: int
+  start_column: int
+  end_column: int
+
+  def __init__(self, content: str, start_byte: int, end_byte: int):  # pylint: disable=g-doc-args
+    """Derives a span's coordinates based on a string and start/end bytes.
+
+    `start_byte` and `end_byte` are assumed to be on the same line.
+    """
+    content_before_span = content[:start_byte]
+    self.line = content_before_span.count('\n') + 1
+    self.start_column = start_byte - content_before_span.rfind('\n')
+    self.end_column = self.start_column + (end_byte - start_byte - 1)
+
+  def __str__(self) -> str:
+    return f'{self.line}:{self.start_column}-{self.end_column}'
+
+
+class Diagnostic:
+  """Stores information about one typo and a suggested fix.
+
+  Attributes:
+    filepath:   the path to the file in which the typo was found
+    filerange:  the position at which the typo was found in the file
+    typo:       the typo
+    fix:        a suggested fix
+  """
+
+  filepath: pathlib.Path
+  filerange: FileRange
+  typo: str
+  fix: str
+
+  def __init__(
+      self,
+      filepath: pathlib.Path,
+      filerange: FileRange,
+      typo: str,
+      fix: str  # pylint: disable=redefined-outer-name
+  ):
+    self.filepath = filepath
+    self.filerange = filerange
+    self.typo = typo
+    self.fix = fix
+
+  def __str__(self) -> str:
+    return f'{self.filepath}:' + str(self.filerange) + f': {self.summary()}'
+
+  def summary(self) -> str:
+    return (
+        f'Found potentially misspelled directive "{self.typo}". Did you mean '
+        f'"{self.fix}"?')
+
+
+def find_potential_directives(
+    content: str,) -> Generator[Tuple[FileRange, str], None, None]:
+  """Extracts all the potential FileCheck directives from a string.
+
+  What constitutes a potential directive is loosely defined---we err on the side
+  of capturing more strings than is necessary, rather than missing any.
+
+  Args:
+    content: the string in which to look for directives
+
+  Yields:
+    Tuples (p, d) where p is the span where the potential directive occurs
+    within the string and d is the potential directive.
+  """
+  directive_pattern = re.compile(
+      r'(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):', re.MULTILINE)
+  for match in re.finditer(directive_pattern, content):
+    potential_directive, span = match.group(1), match.span(1)
+    yield (FileRange(content, span[0], span[1]), potential_directive)
+
+
+# TODO(bchetioui): also parse comment prefixes to ignore.
+def parse_custom_prefixes(content: str) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
+  """Parses custom prefixes defined in the string provided.
+
+  For example, given the following file content:
+    RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
+    RUN: something_else | FileCheck %s -check-prefix 'CHECK3'
+
+  the custom prefixes are CHECK1, CHECK2, and CHECK3.
+  """
+  param_re = r'|'.join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
+  for m in re.finditer(r'-check-prefix(?:es)?(?:\s+|=)({})'.format(param_re),
+                       content):
+    prefixes = m.group(1)
+    if prefixes.startswith('\'') or prefixes.startswith('"'):
+      prefixes = prefixes[1:-1]
+    for prefix in prefixes.split(','):
+      yield prefix
+
+
+def find_directive_typos(
+    content: str,
+    filepath: pathlib.Path,
+    threshold: int = 3,
+) -> Generator[Diagnostic, None, None]:
+  """Detects potential typos in FileCheck directives.
+
+  Args:
+    content: the content of the file
+    filepath: the path to the file to check for typos in directives
+    threshold: the (inclusive) maximum edit distance between a potential
+      directive and an actual directive, such that the potential directive is
+      classified as a typo
+
+  Yields:
+    Diagnostics, in order from the top of the file.
+  """
+  all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
+  all_directives = ([
+      f'{prefix}{suffix}'
+      for prefix, suffix in itertools.product(all_prefixes, _suffixes)
+  ] + list(_ignore) + list(all_prefixes))
+
+  def find_best_match(typo):
+    return min(
+        [(threshold + 1, typo)] + [(levenshtein(typo, d), d)
+                                   for d in all_directives
+                                   if abs(len(d) - len(typo)) <= threshold],
+        key=lambda tup: tup[0],
+    )
+
+  potential_directives = find_potential_directives(content)
+
+  for filerange, potential_directive in potential_directives:
+    # TODO(bchetioui): match count directives more finely. We skip directives
+    # starting with 'CHECK-COUNT-' for the moment as they require more complex
+    # logic to be handled correctly.
+    if any(
+        potential_directive.startswith(f'{prefix}-COUNT-')
+        for prefix in all_prefixes):
+      continue
+
+    # Ignoring potential typos that will not be matched later due to a too low
+    # threshold, in order to avoid potentially long computation times.
+    if len(potential_directive) > max(map(len, all_directives)) + threshold:
+      continue
+
+    score, best_match = find_best_match(potential_directive)
+    if score == 0:  # This is an actual directive, ignore.
+      continue
+    elif score <= threshold and best_match not in _ignore:
+      yield Diagnostic(filepath, filerange, potential_directive, best_match)
+
+
+def main(argv: Sequence[str]):
+  if len(argv) < 2:
+    print(f'Usage: {argv[0]} path/to/file/1 ... path/to/file/n')
+    exit(1)
+
+  for filepath in argv[1:]:
+    logging.info('Checking %s', filepath)
+    with open(filepath, 'rt') as f:
+      content = f.read()
+    for diagnostic in find_directive_typos(
+        content,
+        pathlib.Path(filepath),
+        threshold=_distance_threshold,
+    ):
+      print(diagnostic)
+
+
+if __name__ == '__main__':
+  main(sys.argv)

diff  --git a/llvm/utils/filecheck_lint/filecheck_lint_test.py b/llvm/utils/filecheck_lint/filecheck_lint_test.py
new file mode 100644
index 0000000000000..ddb2c0768c4f8
--- /dev/null
+++ b/llvm/utils/filecheck_lint/filecheck_lint_test.py
@@ -0,0 +1,78 @@
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+import unittest
+
+import filecheck_lint as fcl
+
+
+class TestParser(unittest.TestCase):
+
+  def test_parse_all_additional_prefixes(self):
+
+    def run(content, expected_prefixes):
+      prefixes = set(fcl.parse_custom_prefixes(content))
+      for prefix in expected_prefixes:
+        self.assertIn(prefix, prefixes)
+
+    for content, expected_prefixes in [
+        ('-check-prefix=PREFIX', {'PREFIX'}),
+        ('-check-prefix=\'PREFIX\'', {'PREFIX'}),
+        ('-check-prefix="PREFIX"', {'PREFIX'}),
+        ('-check-prefix PREFIX', {'PREFIX'}),
+        ('-check-prefix      PREFIX', {'PREFIX'}),
+        ('-check-prefixes=PREFIX1,PREFIX2', {'PREFIX1', 'PREFIX2'}),
+        ('-check-prefixes PREFIX1,PREFIX2', {'PREFIX1', 'PREFIX2'}),
+        (
+            """-check-prefix=PREFIX1 -check-prefix PREFIX2
+            -check-prefixes=PREFIX3,PREFIX4 -check-prefix=PREFIX5
+            -check-prefixes PREFIX6,PREFIX7 -check-prefixes=PREFIX8',
+         """,  # pylint: disable=bad-continuation
+            {f'PREFIX{i}' for i in range(1, 9)}),
+    ]:
+      run(content, expected_prefixes)
+
+  def test_additional_prefixes_uniquely(self):
+    lines = ['--check-prefix=SOME-PREFIX', '--check-prefix=SOME-PREFIX']
+    prefixes = set(fcl.parse_custom_prefixes('\n'.join(lines)))
+    assert len(prefixes) == 1
+
+
+class TestTypoDetection(unittest.TestCase):
+
+  def test_find_potential_directives_comment_prefix(self):
+    lines = ['junk; CHCK1:', 'junk// CHCK2:', 'SOME CHCK3:']
+    content = '\n'.join(lines)
+
+    results = list(fcl.find_potential_directives(content))
+    assert len(results) == 3
+    pos, match = results[0]
+    assert (pos.line == 1 and
+            pos.start_column == len('junk; ') + 1 and
+            pos.end_column == len(lines[0]) - 1)
+    assert match == 'CHCK1'
+
+    pos, match = results[1]
+    assert (pos.line == 2 and
+            pos.start_column == len('junk// ') + 1 and
+            pos.end_column == len(lines[1]) - 1)
+    assert match == 'CHCK2'
+
+    pos, match = results[2]
+    assert (pos.line == 3 and
+            pos.start_column == 1 and
+            pos.end_column == len(lines[2]) - 1)
+    assert match == 'SOME CHCK3'
+
+  def test_levenshtein(self):
+    for s1, s2, distance in [
+        ('Levenshtein', 'Levenstin', 2),  # 2 insertions
+        ('Levenshtein', 'Levenstherin', 3),  # 1 insertion, 2 deletions
+        ('Levenshtein', 'Lenvinshtein', 2),  # 1 deletion, 1 substitution
+        ('Levenshtein', 'Levenshtein', 0),  # identical strings
+    ]:
+      assert fcl.levenshtein(s1, s2) == distance