[clang] [NFC][analyzer] Document configuration options (PR #135169)
Donát Nagy via cfe-commits
cfe-commits at lists.llvm.org
Tue May 13 07:19:57 PDT 2025
================
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3
+# A tool to automatically generate documentation for the config options of the
+# clang static analyzer by reading `AnalyzerOptions.def`.
+
+import argparse
+from collections import namedtuple
+from enum import Enum, auto
+import re
+import sys
+import textwrap
+
+
+# The following code implements a trivial parser for the narrow subset of C++
+# which is used in AnalyzerOptions.def. This supports the following features:
+# - ignores preprocessor directives, even if they are continued with \ at EOL
+# - ignores comments: both /* ... */ and // ...
+# - parses string literals (even if they contain \" escapes)
+# - concatenates adjacent string literals
+# - parses numbers even if they contain ' as a thousands separator
+# - recognizes MACRO(arg1, arg2, ..., argN) calls
+
+
+class TT(Enum):
+ "Token type enum."
+ number = auto()
+ ident = auto()
+ string = auto()
+ punct = auto()
+
+
+TOKENS = [
+ (re.compile(r"-?[0-9']+"), TT.number),
+ (re.compile(r"\w+"), TT.ident),
+ (re.compile(r'"([^\\"]|\\.)*"'), TT.string),
+ (re.compile(r"[(),]"), TT.punct),
+ (re.compile(r"/\*((?!\*/).)*\*/", re.S), None), # C-style comment
+ (re.compile(r"//.*\n"), None), # C++ style oneline comment
+ (re.compile(r"#.*(\\\n.*)*(?<!\\)\n"), None), # preprocessor directive
+ (re.compile(r"\s+"), None), # whitespace
+]
+
+Token = namedtuple("Token", "kind code")
+
+
+class ErrorHandler:
+ def __init__(self):
+ self.seen_errors = False
+
+ # This script uses some heuristical tweaks to modify the documentation
+ # of some analyzer options. As this code is fragile, we record the use
+ # of these tweaks and report them if they become obsolete:
+ self.unused_tweaks = [
+ "ctu-max-nodes-*",
+ "accepted values",
+ "example file content",
+ ]
+
+ def record_use_of_tweak(self, tweak_name):
+ try:
+ self.unused_tweaks.remove(tweak_name)
+ except ValueError:
+ pass
+
+ def report_error(self, msg):
+ print("Error:", msg, file=sys.stderr)
+ self.seen_errors = True
+
+ def report_unexpected_char(self, s, pos):
+ lines = (s[:pos] + "X").split("\n")
+ lineno, col = (len(lines), len(lines[-1]))
+ self.report_error(
+ "unexpected character %r in AnalyzerOptions.def at line %d column %d"
+ % (s[pos], lineno, col),
+ )
+
+ def report_unused_tweaks(self):
+ if not self.unused_tweaks:
+ return
+ _is = " is" if len(self.unused_tweaks) == 1 else "s are"
+ names = ", ".join(self.unused_tweaks)
+ self.report_error(f"textual tweak{_is} unused in script: {names}")
+
+
+err_handler = ErrorHandler()
+
+
+def tokenize(s):
+ result = []
+ pos = 0
+ while pos < len(s):
+ for regex, kind in TOKENS:
+ if m := regex.match(s, pos):
+ if kind is not None:
+ result.append(Token(kind, m.group(0)))
+ pos = m.end()
+ break
+ else:
+ err_handler.report_unexpected_char(s, pos)
+ pos += 1
+ return result
+
+
+def join_strings(tokens):
+ result = []
+ for tok in tokens:
+ if tok.kind == TT.string and result and result[-1].kind == TT.string:
+ # If this token is a string, and the previous non-ignored token is
+ # also a string, then merge them into a single token. We need to
+ # discard the closing " of the previous string and the opening " of
+ # this string.
+ prev = result.pop()
+ result.append(Token(TT.string, prev.code[:-1] + tok.code[1:]))
+ else:
+ result.append(tok)
+ return result
+
+
+MacroCall = namedtuple("MacroCall", "name args")
+
+
+class State(Enum):
+ "States of the state machine used for parsing the macro calls."
+ init = auto()
+ after_ident = auto()
+ before_arg = auto()
+ after_arg = auto()
+
+
+def get_calls(tokens, macro_names):
+ state = State.init
+ result = []
+ current = None
+ for tok in tokens:
+ if state == State.init and tok.kind == TT.ident and tok.code in macro_names:
+ current = MacroCall(tok.code, [])
+ state = State.after_ident
+ elif state == State.after_ident and tok == Token(TT.punct, "("):
+ state = State.before_arg
+ elif state == State.before_arg:
+ if current is not None:
+ current.args.append(tok)
+ state = State.after_arg
+ elif state == State.after_arg and tok.kind == TT.punct:
+ if tok.code == ")":
+ result.append(current)
+ current = None
+ state = State.init
+ elif tok.code == ",":
+ state = State.before_arg
+ else:
+ current = None
+ state = State.init
+ return result
+
+
+# The information will be extracted from calls to these two macros:
+# #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL)
+# #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC,
+# SHALLOW_VAL, DEEP_VAL)
+
+MACRO_NAMES_ARGCOUNTS = {
+ "ANALYZER_OPTION": 5,
+ "ANALYZER_OPTION_DEPENDS_ON_USER_MODE": 6,
+}
+
+
+def string_value(tok):
+ if tok.kind != TT.string:
+ raise ValueError(f"expected a string token, got {tok.kind.name}")
+ text = tok.code[1:-1] # Remove quotes
+ text = re.sub(r"\\(.)", r"\1", text) # Resolve backslash escapes
+ return text
+
+
+def cmdflag_to_rst_title(cmdflag_tok):
+ text = string_value(cmdflag_tok)
+ underline = "-" * len(text)
+ ref = f".. _analyzer-option-{text}:"
+
+ return f"{ref}\n\n{text}\n{underline}\n\n"
+
+
+def desc_to_rst_paragraphs(tok):
+ base_desc = string_value(tok)
+
+ # Escape a star that would act as inline emphasis within RST.
+ desc = base_desc.replace("ctu-max-nodes-*", r"ctu-max-nodes-\*")
+ if desc != base_desc:
+ err_handler.record_use_of_tweak("ctu-max-nodes-*")
----------------
NagyDonat wrote:
Yes, this is about escaping, and I'll follow your suggestion.
https://github.com/llvm/llvm-project/pull/135169
More information about the cfe-commits
mailing list