[clang] [NFC][analyzer] Document configuration options (PR #135169)
Donát Nagy via cfe-commits
cfe-commits at lists.llvm.org
Tue Apr 15 04:16:27 PDT 2025
================
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+# A tool to automatically generate documentation for the config options of the
+# clang static analyzer by reading `AnalyzerOptions.def`.
+
+import argparse
+from collections import namedtuple
+from enum import Enum, auto
+import re
+import sys
+import textwrap
+
+
+# The following code implements a trivial parser for the narrow subset of C++
+# which is used in AnalyzerOptions.def. This supports the following features:
+# - ignores preprocessor directives, even if they are continued with \ at EOL
+# - ignores comments: both /* ... */ and // ...
+# - parses string literals (even if they contain \" escapes)
+# - concatenates adjacent string literals
+# - parses numbers even if they contain ' as a thousands separator
+# - recognizes MACRO(arg1, arg2, ..., argN) calls
+
+
+# Kinds of tokens that the tokenizer emits. Each member is the second element
+# of one of the (regex, kind) pairs in TOKENS; the values come from auto().
+class TT(Enum):
+ "Token type enum."
+ number = auto()  # digits and ' separators with an optional leading -
+ ident = auto()  # a run of word characters (\w+)
+ string = auto()  # a double-quoted literal, which may contain \ escapes
+ punct = auto()  # a single '(', ')' or ','
+
+
+# Lexical rules as (compiled regex, token kind) pairs. tokenize() tries them
+# in list order at the current position and takes the first one that matches.
+# A kind of None marks text that is consumed without producing a token
+# (comments, preprocessor directives, whitespace).
+# The number rule must stay ahead of the identifier rule, because `\w+` also
+# matches a run of digits.
+# NOTE(review): the number rule also accepts a run made only of ' characters.
+# NOTE(review): the `//` comment rule and the preprocessor rule both require a
+# terminating "\n", so they do not match on an unterminated last line.
+TOKENS = [
+ (re.compile(r"-?[0-9']+"), TT.number),
+ (re.compile(r"\w+"), TT.ident),
+ (re.compile(r'"([^\\"]|\\.)*"'), TT.string),
+ (re.compile(r"[(),]"), TT.punct),
+ (re.compile(r"/\*((?!\*/).)*\*/", re.S), None), # C-style comment
+ (re.compile(r"//.*\n"), None), # C++ style oneline comment
+ (re.compile(r"#.*(\\\n.*)*(?<!\\)\n"), None), # preprocessor directive
+ (re.compile(r"\s+"), None), # whitespace
+]
+
+# A single token: `kind` is the TT member of the rule that matched and `code`
+# is the exact source text that the rule's regex matched.
+Token = namedtuple("Token", "kind code")
+
+
+# Print a diagnostic to stderr about the character at index `pos` of the input
+# text `s`. The function only reports: it returns nothing and raises no error
+# of its own. The message always names AnalyzerOptions.def as the input.
+def report_unexpected(s, pos):
+ # The appended "X" stands in for the offending character, so the number of
+ # split pieces is the 1-based line number and the length of the last piece
+ # is the 1-based column.
+ lines = (s[:pos] + "X").split("\n")
+ lineno, col = (len(lines), len(lines[-1]))
+ print(
+ "unexpected character %r in AnalyzerOptions.def at line %d column %d"
+ % (s[pos], lineno, col),
+ file=sys.stderr,
+ )
+
+
+# Split the text `s` into a list of Token(kind, code) tuples by repeatedly
+# matching the TOKENS rules at the current position. Text matched by a rule
+# whose kind is None is consumed but produces no token.
+# NOTE(review): indentation is collapsed in this copy of the patch, so the
+# nesting described in the comments below is inferred, not visible.
+def tokenize(s):
+ result = []
+ pos = 0
+ while pos < len(s):
+ # Rules are tried in list order; the first match wins.
+ for regex, kind in TOKENS:
+ if m := regex.match(s, pos):
+ if kind is not None:
+ result.append(Token(kind, m.group(0)))
+ # Presumably runs for every match, including skipped (None) ones;
+ # otherwise the scan could not advance past ignored text.
+ pos = m.end()
+ break
+ # Presumably the `else` of the `for` loop (no rule matched): the
+ # character is reported, then skipped so that scanning continues.
+ else:
+ report_unexpected(s, pos)
+ pos += 1
+ return result
----------------
NagyDonat wrote:
I didn't try to find an external tokenizer library, because `AnalyzerOptions.def` uses some tricky features (apostrophes in numeric literals, preprocessor directives continued with backslash at EOL), so I would need to use a reasonably feature-complete heavyweight C++-specific tokenizer, and interfacing with that wouldn't be much simpler than the current implementation.
Also, I like that this little self-contained tokenizer is very transparent (if it misbehaves, the developer only needs to understand a few dozen lines), and I don't expect significant changes in the structure of `AnalyzerOptions.def`, so I don't think that we need to prepare for future changes by adopting a general-purpose tokenizer.
https://github.com/llvm/llvm-project/pull/135169
More information about the cfe-commits mailing list