[llvm] Add a script to check for and fix trailing whitespace (PR #123496)

Jeremy Rifkin via llvm-commits llvm-commits at lists.llvm.org
Sat Jan 18 22:43:36 PST 2025


https://github.com/jeremy-rifkin updated https://github.com/llvm/llvm-project/pull/123496

>From 0eb98cf85a4318915e4bd14970ee9cdc5e99dc45 Mon Sep 17 00:00:00 2001
From: Jeremy Rifkin <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 18 Jan 2025 21:44:47 -0600
Subject: [PATCH 1/3] Add a script to check for and fix trailing whitespace

---
 llvm/utils/git/code-format-helper.py |  49 ++++++-
 llvm/utils/trailing-whitespace.py    | 194 +++++++++++++++++++++++++++
 2 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 llvm/utils/trailing-whitespace.py

diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index 48a338aca9c8e6..fc384450d739bc 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -423,7 +423,54 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str
         return report
 
 
-ALL_FORMATTERS = (DarkerFormatHelper(), ClangFormatHelper(), UndefGetFormatHelper())
+class TrailingWhitespaceFormatter(FormatHelper):
+    name = "trailing-whitespace"
+    friendly_name = "Trailing whitespace formatter"
+
+    @property
+    def instructions(self) -> str:
+        return f"python3 {self.trailing_whitespace_path} --exclude build " + " ".join(self.files)
+
+    @property
+    def trailing_whitespace_path(self) -> str:
+        if "TRAILING_WHITESPACE" in os.environ:
+            return os.environ["TRAILING_WHITESPACE"]
+        return "llvm/utils/trailing-whitespace.py"
+
+    def has_tool(self) -> bool:
+        return True
+
+    def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str]:
+        tw_cmd = [
+            self.trailing_whitespace_path,
+            "--exclude",
+            "build",
+        ]
+        if args.start_rev and args.end_rev:
+            tw_cmd += ["--diff", args.start_rev, args.end_rev]
+        tw_cmd += changed_files
+        if args.verbose:
+            print(f"Running: {sys.executable} {' '.join(tw_cmd)}")
+        self.files = changed_files
+        proc = subprocess.run(
+            [sys.executable] + tw_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        if args.verbose:
+            sys.stdout.write(proc.stderr.decode("utf-8"))
+
+        if proc.returncode != 0:
+            # formatting needed, or the command otherwise failed
+            if args.verbose:
+                print(f"error: {self.name} exited with code {proc.returncode}")
+                # Print the diff in the log so that it is viewable there
+                print(proc.stdout.decode("utf-8"))
+            return proc.stdout.decode("utf-8")
+        else:
+            sys.stdout.write(proc.stdout.decode("utf-8"))
+            return None
+
+
+ALL_FORMATTERS = (DarkerFormatHelper(), ClangFormatHelper(), UndefGetFormatHelper(), TrailingWhitespaceFormatter())
 
 
 def hook_main():
diff --git a/llvm/utils/trailing-whitespace.py b/llvm/utils/trailing-whitespace.py
new file mode 100644
index 00000000000000..a1e604fabfeb89
--- /dev/null
+++ b/llvm/utils/trailing-whitespace.py
@@ -0,0 +1,194 @@
+import argparse
+import os
+import re
+import subprocess
+import sys
+
+def is_text(file):
+    text_file_extensions = {
+        ".apinotes",
+        ".asm",
+        ".bazel",
+        ".c",
+        ".cc",
+        ".cfg",
+        ".cl",
+        ".clcpp",
+        ".cmake",
+        ".cmd",
+        ".cpp",
+        ".cppm",
+        ".css",
+        ".csv",
+        ".cu",
+        ".d",
+        ".def",
+        ".dot",
+        ".expected",
+        ".f",
+        ".f90",
+        ".fir",
+        ".gn",
+        ".gni",
+        ".h",
+        ".hip",
+        ".hlsl",
+        ".hpp",
+        ".html",
+        ".i",
+        ".in",
+        ".inc",
+        ".jscop",
+        ".json",
+        ".ll",
+        ".m",
+        ".map",
+        ".md",
+        ".mir",
+        ".mlir",
+        ".mm",
+        ".modulemap",
+        ".plist",
+        ".py",
+        ".rc",
+        ".result",
+        ".rsp",
+        ".rst",
+        ".s",
+        ".script",
+        ".sh",
+        ".st",
+        ".tbd",
+        ".td",
+        ".template",
+        ".test",
+        ".transformed",
+        ".txt",
+        ".xml",
+        ".yml",
+        ".yaml",
+    }
+    _, ext = os.path.splitext(file)
+    return ext.lower() in text_file_extensions
+
+def check_file(path, fix):
+    try:
+        trailing = False
+        with open(path, "r", encoding="utf-8", errors="ignore") as f:
+            for line_number, line in enumerate(f, 1):
+                if line.rstrip("\n").endswith(" "):
+                    print(f"{path}:{line_number}: Trailing whitespace found")
+                    trailing = True
+        if trailing and fix:
+            with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                lines = [line.rstrip(" \t") for line in f]
+            with open(path, "w", encoding="utf-8", errors="ignore") as f:
+                f.writelines(lines)
+        return trailing
+    except UnicodeDecodeError:
+        print(f"Warning: Encoding error encountered for {path}")
+    except FileNotFoundError:
+        print(f"Warning: Could not open {path}")
+    return False
+
+def check_paths(paths, exclude, fix):
+    exclude = [os.path.abspath(d) for d in exclude]
+    seen = set()
+    found_trailing = False
+    for path in paths:
+        for root, dirs, files in os.walk(path):
+            dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(root, d)) not in exclude]
+            for file in files:
+                file_path = os.path.join(root, file)
+                if not is_text(file):
+                    continue
+                if file_path in seen:
+                    continue
+                seen.add(file_path)
+                if check_file(file_path, fix):
+                    found_trailing = True
+    return found_trailing
+
+HUNK_HEADER_REGEX = re.compile(r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@")
+DIFF_HEADER_REGEX = re.compile(r"^diff --git a/(.+) b/(.+)")
+
+def parse_diffs(diff_text):
+    diffs: dict[str, list[tuple[int, str]]] = dict()
+    file = None
+    current_line = None
+    for line in diff_text.splitlines():
+        match = DIFF_HEADER_REGEX.match(line)
+        if match:
+            file = match.groups()[1]
+            if file not in diffs:
+                diffs[file] = []
+            current_line = None
+            continue
+        match = HUNK_HEADER_REGEX.match(line)
+        if match:
+            current_line = int(match.groups()[2])
+            continue
+        if not current_line: # haven't seen the hunk header yet, continue
+            continue
+        if line.startswith("+"):
+            line = line[1:]
+            diffs[file].append((current_line, line))
+            current_line += 1
+    return diffs
+
+def check_paths_diff(paths, exclude, rev_start, rev_end):
+    exclude = [os.path.abspath(d) for d in exclude]
+    cmd = ["git", "diff", "-U0", rev_start, rev_end, *paths]
+    proc = subprocess.run(cmd, stdout=subprocess.PIPE, encoding="utf-8")
+    all_diffs = parse_diffs(proc.stdout)
+    found_trailing = False
+    for file, diffs in all_diffs.items():
+        if any([os.path.abspath(file).startswith(path) for path in exclude]):
+            continue
+        # kind of redundant, diffs are text, but just to be consistent
+        if not is_text(file):
+            continue
+        for num, line in diffs:
+            if line.endswith(" "):
+                print(f"{file}:{num}: Trailing whitespace found")
+                found_trailing = True
+    return found_trailing
+
+if __name__ == "__main__":
+    script_path = os.path.abspath(__file__)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fix", action=argparse.BooleanOptionalAction, default=False, help="Automatically apply fixes"
+    )
+    parser.add_argument(
+        "--exclude",
+        action="append",
+        default=[],
+        help="Paths to exclude. Can be used multiple times."
+    )
+    parser.add_argument(
+        "--diff",
+        help="Compute only based on changed lines (format: rev_start..rev_end)",
+    )
+    parser.add_argument("paths", nargs="*", type=str, help="Paths to check")
+
+    args = parser.parse_args()
+    paths = set(args.paths)
+
+    if len(paths) == 0:
+        print("Error: Must specify paths to check", file=sys.stderr)
+        sys.exit(1)
+
+    if args.diff and args.fix:
+        print("Error: Diff mode doesn't support --fix currently", file=sys.stderr)
+        sys.exit(1)
+
+    if args.diff:
+        rev_start, rev_end = args.diff.split("..")
+        found_trailing = check_paths_diff(paths, args.exclude, rev_start, rev_end)
+    else:
+        found_trailing = check_paths(paths, args.exclude, args.fix)
+
+    if found_trailing:
+        sys.exit(1)

>From 420c01355d65ba2ad8b82928c583cff9bc38b16d Mon Sep 17 00:00:00 2001
From: Jeremy Rifkin <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 18 Jan 2025 22:16:45 -0600
Subject: [PATCH 2/3] Fix issue with --exclude

---
 llvm/utils/trailing-whitespace.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/trailing-whitespace.py b/llvm/utils/trailing-whitespace.py
index a1e604fabfeb89..8e02d74440e426 100644
--- a/llvm/utils/trailing-whitespace.py
+++ b/llvm/utils/trailing-whitespace.py
@@ -96,6 +96,8 @@ def check_paths(paths, exclude, fix):
     seen = set()
     found_trailing = False
     for path in paths:
+        if os.path.abspath(path) in exclude:
+            continue
         for root, dirs, files in os.walk(path):
             dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(root, d)) not in exclude]
             for file in files:

>From 41956f3ec4b60f4005c3197706fd2690cb284414 Mon Sep 17 00:00:00 2001
From: Jeremy Rifkin <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sun, 19 Jan 2025 00:43:23 -0600
Subject: [PATCH 3/3] Add some tests and rename script

---
 llvm/utils/git/code-format-helper.py          |  4 +-
 ...g-whitespace.py => trailing_whitespace.py} |  0
 llvm/utils/trailing_whitespace_test.py        | 59 +++++++++++++++++++
 3 files changed, 61 insertions(+), 2 deletions(-)
 rename llvm/utils/{trailing-whitespace.py => trailing_whitespace.py} (100%)
 create mode 100644 llvm/utils/trailing_whitespace_test.py

diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index fc384450d739bc..fc42422b38faf0 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -424,7 +424,7 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str
 
 
 class TrailingWhitespaceFormatter(FormatHelper):
-    name = "trailing-whitespace"
+    name = "trailing_whitespace"
     friendly_name = "Trailing whitespace formatter"
 
     @property
@@ -435,7 +435,7 @@ def instructions(self) -> str:
     def trailing_whitespace_path(self) -> str:
         if "TRAILING_WHITESPACE" in os.environ:
             return os.environ["TRAILING_WHITESPACE"]
-        return "llvm/utils/trailing-whitespace.py"
+        return "llvm/utils/trailing_whitespace.py"
 
     def has_tool(self) -> bool:
         return True
diff --git a/llvm/utils/trailing-whitespace.py b/llvm/utils/trailing_whitespace.py
similarity index 100%
rename from llvm/utils/trailing-whitespace.py
rename to llvm/utils/trailing_whitespace.py
diff --git a/llvm/utils/trailing_whitespace_test.py b/llvm/utils/trailing_whitespace_test.py
new file mode 100644
index 00000000000000..643e7b274b980b
--- /dev/null
+++ b/llvm/utils/trailing_whitespace_test.py
@@ -0,0 +1,59 @@
+import unittest
+
+import trailing_whitespace
+
+class Test(unittest.TestCase):
+    """Tests for trailing_whitespace."""
+
+    def test_is_text(self) -> None:
+        self.assertTrue(trailing_whitespace.is_text("foo.cpp"))
+        self.assertTrue(trailing_whitespace.is_text("foo/bar.cpp"))
+        self.assertFalse(trailing_whitespace.is_text("foo/bar"))
+        self.assertFalse(trailing_whitespace.is_text("foo/bar.o"))
+
+    def test_diff_parsing(self) -> None:
+        basic_diff = """diff --git a/clang/test/CodeGen/memalign-libcall.c b/clang/test/CodeGen/memalign-libcall.c
+index 2070eebdbf84..4fe1a838d15f 100644
+--- a/clang/test/CodeGen/memalign-libcall.c
++++ b/clang/test/CodeGen/memalign-libcall.c
+@@ -12 +12,2 @@ void *test(size_t alignment, size_t size) {
+-// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" } 
+\\ No newline at end of file
++// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }"""
+        self.assertEqual(trailing_whitespace.parse_diffs(basic_diff), {
+            'clang/test/CodeGen/memalign-libcall.c': [
+                (12, "// CHECK: attributes #2 = { nobuiltin \"no-builtin-memalign\" }")
+            ]
+        })
+        multiple_added_lines = """diff --git a/clang/test/CodeGen/memalign-libcall.c b/clang/test/CodeGen/memalign-libcall.c
+index 2070eebdbf84..4fe1a838d15f 100644
+--- a/clang/test/CodeGen/memalign-libcall.c
++++ b/clang/test/CodeGen/memalign-libcall.c
+@@ -12 +12,2 @@ void *test(size_t alignment, size_t size) {
+-// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" } 
+\\ No newline at end of file
++// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }
++foobar"""
+        self.assertEqual(trailing_whitespace.parse_diffs(multiple_added_lines), {
+            'clang/test/CodeGen/memalign-libcall.c': [
+                (12, "// CHECK: attributes #2 = { nobuiltin \"no-builtin-memalign\" }"),
+                (13, "foobar"),
+            ]
+        })
+        multiple_deleted = """diff --git a/clang/test/CodeGen/memalign-libcall.c b/clang/test/CodeGen/memalign-libcall.c
+index 2070eebdbf84..4fe1a838d15f 100644
+--- a/clang/test/CodeGen/memalign-libcall.c
++++ b/clang/test/CodeGen/memalign-libcall.c
+@@ -12 +12,2 @@ void *test(size_t alignment, size_t size) {
+-foobar
+-// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" } 
+\\ No newline at end of file
++// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }"""
+        self.assertEqual(trailing_whitespace.parse_diffs(multiple_deleted), {
+            'clang/test/CodeGen/memalign-libcall.c': [
+                (12, "// CHECK: attributes #2 = { nobuiltin \"no-builtin-memalign\" }"),
+            ]
+        })
+
+if __name__ == "__main__":
+    unittest.main()



More information about the llvm-commits mailing list