[llvm] Add a script to check for and fix trailing whitespace (PR #123496)
Jeremy Rifkin via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 18 22:43:36 PST 2025
https://github.com/jeremy-rifkin updated https://github.com/llvm/llvm-project/pull/123496
>From 0eb98cf85a4318915e4bd14970ee9cdc5e99dc45 Mon Sep 17 00:00:00 2001
From: Jeremy Rifkin <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 18 Jan 2025 21:44:47 -0600
Subject: [PATCH 1/3] Add a script to check for and fix trailing whitespace
---
llvm/utils/git/code-format-helper.py | 49 ++++++-
llvm/utils/trailing-whitespace.py | 194 +++++++++++++++++++++++++++
2 files changed, 242 insertions(+), 1 deletion(-)
create mode 100644 llvm/utils/trailing-whitespace.py
diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index 48a338aca9c8e6..fc384450d739bc 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -423,7 +423,54 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str
return report
-ALL_FORMATTERS = (DarkerFormatHelper(), ClangFormatHelper(), UndefGetFormatHelper())
+class TrailingWhitespaceFormatter(FormatHelper):
+ name = "trailing-whitespace"
+ friendly_name = "Trailing whitespace formatter"
+
+ @property
+ def instructions(self) -> str:
+ return f"python3 {self.trailing_whitespace_path} --exclude build " + " ".join(self.files)
+
+ @property
+ def trailing_whitespace_path(self) -> str:
+ if "TRAILING_WHITESPACE" in os.environ:
+ return os.environ["TRAILING_WHITESPACE"]
+ return "llvm/utils/trailing-whitespace.py"
+
+ def has_tool(self) -> bool:
+ return True
+
+ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str]:
+ tw_cmd = [
+ self.trailing_whitespace_path,
+ "--exclude",
+ "build",
+ ]
+ if args.start_rev and args.end_rev:
+ tw_cmd += ["--diff", args.start_rev, args.end_rev]
+ tw_cmd += changed_files
+ if args.verbose:
+ print(f"Running: {sys.executable} {' '.join(tw_cmd)}")
+ self.files = changed_files
+ proc = subprocess.run(
+ [sys.executable] + tw_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
+ if args.verbose:
+ sys.stdout.write(proc.stderr.decode("utf-8"))
+
+ if proc.returncode != 0:
+ # formatting needed, or the command otherwise failed
+ if args.verbose:
+ print(f"error: {self.name} exited with code {proc.returncode}")
+ # Print the diff in the log so that it is viewable there
+ print(proc.stdout.decode("utf-8"))
+ return proc.stdout.decode("utf-8")
+ else:
+ sys.stdout.write(proc.stdout.decode("utf-8"))
+ return None
+
+
+ALL_FORMATTERS = (DarkerFormatHelper(), ClangFormatHelper(), UndefGetFormatHelper(), TrailingWhitespaceFormatter())
def hook_main():
diff --git a/llvm/utils/trailing-whitespace.py b/llvm/utils/trailing-whitespace.py
new file mode 100644
index 00000000000000..a1e604fabfeb89
--- /dev/null
+++ b/llvm/utils/trailing-whitespace.py
@@ -0,0 +1,194 @@
+import argparse
+import os
+import re
+import subprocess
+import sys
+
+def is_text(file):
+ text_file_extensions = {
+ ".apinotes",
+ ".asm",
+ ".bazel",
+ ".c",
+ ".cc",
+ ".cfg",
+ ".cl",
+ ".clcpp",
+ ".cmake",
+ ".cmd",
+ ".cpp",
+ ".cppm",
+ ".css",
+ ".csv",
+ ".cu",
+ ".d",
+ ".def",
+ ".dot",
+ ".expected",
+ ".f",
+ ".f90",
+ ".fir",
+ ".gn",
+ ".gni",
+ ".h",
+ ".hip",
+ ".hlsl",
+ ".hpp",
+ ".html",
+ ".i",
+ ".in",
+ ".inc",
+ ".jscop",
+ ".json",
+ ".ll",
+ ".m",
+ ".map",
+ ".md",
+ ".mir",
+ ".mlir",
+ ".mm",
+ ".modulemap",
+ ".plist",
+ ".py",
+ ".rc",
+ ".result",
+ ".rsp",
+ ".rst",
+ ".s",
+ ".script",
+ ".sh",
+ ".st",
+ ".tbd",
+ ".td",
+ ".template",
+ ".test",
+ ".transformed",
+ ".txt",
+ ".xml",
+ ".yml",
+ ".yaml",
+ }
+ _, ext = os.path.splitext(file)
+ return ext.lower() in text_file_extensions
+
+def check_file(path, fix):
+ try:
+ trailing = False
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
+ for line_number, line in enumerate(f, 1):
+ if line.rstrip("\n").endswith(" "):
+ print(f"{path}:{line_number}: Trailing whitespace found")
+ trailing = True
+ if trailing and fix:
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
+ lines = [line.rstrip(" \t") for line in f]
+ with open(path, "w", encoding="utf-8", errors="ignore") as f:
+ f.writelines(lines)
+ return trailing
+ except UnicodeDecodeError:
+ print(f"Warning: Encoding error encountered for {path}")
+ except FileNotFoundError:
+ print(f"Warning: Could not open {path}")
+ return False
+
+def check_paths(paths, exclude, fix):
+ exclude = [os.path.abspath(d) for d in exclude]
+ seen = set()
+ found_trailing = False
+ for path in paths:
+ for root, dirs, files in os.walk(path):
+ dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(root, d)) not in exclude]
+ for file in files:
+ file_path = os.path.join(root, file)
+ if not is_text(file):
+ continue
+ if file_path in seen:
+ continue
+ seen.add(file_path)
+ if check_file(file_path, fix):
+ found_trailing = True
+ return found_trailing
+
+HUNK_HEADER_REGEX = re.compile(r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@")
+DIFF_HEADER_REGEX = re.compile(r"^diff --git a/(.+) b/(.+)")
+
+def parse_diffs(diff_text):
+ diffs: dict[str, list[tuple[int, str]]] = dict()
+ file = None
+ current_line = None
+ for line in diff_text.splitlines():
+ match = DIFF_HEADER_REGEX.match(line)
+ if match:
+ file = match.groups()[1]
+ if file not in diffs:
+ diffs[file] = []
+ current_line = None
+ continue
+ match = HUNK_HEADER_REGEX.match(line)
+ if match:
+ current_line = int(match.groups()[2])
+ continue
+ if not current_line: # haven't seen the hunk header yet, continue
+ continue
+ if line.startswith("+"):
+ line = line[1:]
+ diffs[file].append((current_line, line))
+ current_line += 1
+ return diffs
+
+def check_paths_diff(paths, exclude, rev_start, rev_end):
+ exclude = [os.path.abspath(d) for d in exclude]
+ cmd = ["git", "diff", "-U0", rev_start, rev_end, *paths]
+ proc = subprocess.run(cmd, stdout=subprocess.PIPE, encoding="utf-8")
+ all_diffs = parse_diffs(proc.stdout)
+ found_trailing = False
+ for file, diffs in all_diffs.items():
+ if any([os.path.abspath(file).startswith(path) for path in exclude]):
+ continue
+ # kind of redundant, diffs are text, but just to be consistent
+ if not is_text(file):
+ continue
+ for num, line in diffs:
+ if line.endswith(" "):
+ print(f"{file}:{num}: Trailing whitespace found")
+ found_trailing = True
+ return found_trailing
+
+if __name__ == "__main__":
+ script_path = os.path.abspath(__file__)
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--fix", action=argparse.BooleanOptionalAction, default=False, help="Automatically apply fixes"
+ )
+ parser.add_argument(
+ "--exclude",
+ action="append",
+ default=[],
+ help="Paths to exclude. Can be used multiple times."
+ )
+ parser.add_argument(
+ "--diff",
+ help="Compute only based on changed lines (format: rev_start..rev_end)",
+ )
+ parser.add_argument("paths", nargs="*", type=str, help="Paths to check")
+
+ args = parser.parse_args()
+ paths = set(args.paths)
+
+ if len(paths) == 0:
+ print("Error: Must specify paths to check", file=sys.stderr)
+ sys.exit(1)
+
+ if args.diff and args.fix:
+ print("Error: Diff mode doesn't support --fix currently", file=sys.stderr)
+ sys.exit(1)
+
+ if args.diff:
+ rev_start, rev_end = args.diff.split("..")
+ found_trailing = check_paths_diff(paths, args.exclude, rev_start, rev_end)
+ else:
+ found_trailing = check_paths(paths, args.exclude, args.fix)
+
+ if found_trailing:
+ sys.exit(1)
>From 420c01355d65ba2ad8b82928c583cff9bc38b16d Mon Sep 17 00:00:00 2001
From: Jeremy Rifkin <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 18 Jan 2025 22:16:45 -0600
Subject: [PATCH 2/3] Fix issue with --exclude
---
llvm/utils/trailing-whitespace.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/utils/trailing-whitespace.py b/llvm/utils/trailing-whitespace.py
index a1e604fabfeb89..8e02d74440e426 100644
--- a/llvm/utils/trailing-whitespace.py
+++ b/llvm/utils/trailing-whitespace.py
@@ -96,6 +96,8 @@ def check_paths(paths, exclude, fix):
seen = set()
found_trailing = False
for path in paths:
+ if os.path.abspath(path) in exclude:
+ continue
for root, dirs, files in os.walk(path):
dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(root, d)) not in exclude]
for file in files:
>From 41956f3ec4b60f4005c3197706fd2690cb284414 Mon Sep 17 00:00:00 2001
From: Jeremy Rifkin <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sun, 19 Jan 2025 00:43:23 -0600
Subject: [PATCH 3/3] Add some tests and rename script
---
llvm/utils/git/code-format-helper.py | 4 +-
...g-whitespace.py => trailing_whitespace.py} | 0
llvm/utils/trailing_whitespace_test.py | 59 +++++++++++++++++++
3 files changed, 61 insertions(+), 2 deletions(-)
rename llvm/utils/{trailing-whitespace.py => trailing_whitespace.py} (100%)
create mode 100644 llvm/utils/trailing_whitespace_test.py
diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index fc384450d739bc..fc42422b38faf0 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -424,7 +424,7 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str
class TrailingWhitespaceFormatter(FormatHelper):
- name = "trailing-whitespace"
+ name = "trailing_whitespace"
friendly_name = "Trailing whitespace formatter"
@property
@@ -435,7 +435,7 @@ def instructions(self) -> str:
def trailing_whitespace_path(self) -> str:
if "TRAILING_WHITESPACE" in os.environ:
return os.environ["TRAILING_WHITESPACE"]
- return "llvm/utils/trailing-whitespace.py"
+ return "llvm/utils/trailing_whitespace.py"
def has_tool(self) -> bool:
return True
diff --git a/llvm/utils/trailing-whitespace.py b/llvm/utils/trailing_whitespace.py
similarity index 100%
rename from llvm/utils/trailing-whitespace.py
rename to llvm/utils/trailing_whitespace.py
diff --git a/llvm/utils/trailing_whitespace_test.py b/llvm/utils/trailing_whitespace_test.py
new file mode 100644
index 00000000000000..643e7b274b980b
--- /dev/null
+++ b/llvm/utils/trailing_whitespace_test.py
@@ -0,0 +1,59 @@
+import unittest
+
+import trailing_whitespace
+
+class Test(unittest.TestCase):
+ """Tests for trailing_whitespace."""
+
+ def test_is_text(self) -> None:
+ self.assertTrue(trailing_whitespace.is_text("foo.cpp"))
+ self.assertTrue(trailing_whitespace.is_text("foo/bar.cpp"))
+ self.assertFalse(trailing_whitespace.is_text("foo/bar"))
+ self.assertFalse(trailing_whitespace.is_text("foo/bar.o"))
+
+ def test_diff_parsing(self) -> None:
+ basic_diff = """diff --git a/clang/test/CodeGen/memalign-libcall.c b/clang/test/CodeGen/memalign-libcall.c
+index 2070eebdbf84..4fe1a838d15f 100644
+--- a/clang/test/CodeGen/memalign-libcall.c
++++ b/clang/test/CodeGen/memalign-libcall.c
+@@ -12 +12,2 @@ void *test(size_t alignment, size_t size) {
+-// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }
+\\ No newline at end of file
++// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }"""
+ self.assertEqual(trailing_whitespace.parse_diffs(basic_diff), {
+ 'clang/test/CodeGen/memalign-libcall.c': [
+ (12, "// CHECK: attributes #2 = { nobuiltin \"no-builtin-memalign\" }")
+ ]
+ })
+ multiple_added_lines = """diff --git a/clang/test/CodeGen/memalign-libcall.c b/clang/test/CodeGen/memalign-libcall.c
+index 2070eebdbf84..4fe1a838d15f 100644
+--- a/clang/test/CodeGen/memalign-libcall.c
++++ b/clang/test/CodeGen/memalign-libcall.c
+@@ -12 +12,2 @@ void *test(size_t alignment, size_t size) {
+-// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }
+\\ No newline at end of file
++// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }
++foobar"""
+ self.assertEqual(trailing_whitespace.parse_diffs(multiple_added_lines), {
+ 'clang/test/CodeGen/memalign-libcall.c': [
+ (12, "// CHECK: attributes #2 = { nobuiltin \"no-builtin-memalign\" }"),
+ (13, "foobar"),
+ ]
+ })
+ multiple_deleted = """diff --git a/clang/test/CodeGen/memalign-libcall.c b/clang/test/CodeGen/memalign-libcall.c
+index 2070eebdbf84..4fe1a838d15f 100644
+--- a/clang/test/CodeGen/memalign-libcall.c
++++ b/clang/test/CodeGen/memalign-libcall.c
+@@ -12 +12,2 @@ void *test(size_t alignment, size_t size) {
+-foobar
+-// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }
+\\ No newline at end of file
++// CHECK: attributes #2 = { nobuiltin "no-builtin-memalign" }"""
+ self.assertEqual(trailing_whitespace.parse_diffs(multiple_deleted), {
+ 'clang/test/CodeGen/memalign-libcall.c': [
+ (12, "// CHECK: attributes #2 = { nobuiltin \"no-builtin-memalign\" }"),
+ ]
+ })
+
+if __name__ == "__main__":
+ unittest.main()
More information about the llvm-commits
mailing list