[clang] [NFC][analyzer] Document configuration options (PR #135169)

Mon Apr 14 09:40:37 PDT 2025

================
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+# A tool to automatically generate documentation for the config options of the
+# clang static analyzer by reading `AnalyzerOptions.def`.
+
+import argparse
+from collections import namedtuple
+from enum import Enum, auto
+import re
+import sys
+import textwrap
+
+
+# The following code implements a trivial parser for the narrow subset of C++
+# which is used in AnalyzerOptions.def. This supports the following features:
+# - ignores preprocessor directives, even if they are continued with \ at EOL
+# - ignores comments: both /* ... */ and // ...
+# - parses string literals (even if they contain \" escapes)
+# - concatenates adjacent string literals
+# - parses numbers even if they contain ' as a thousands separator
+# - recognizes MACRO(arg1, arg2, ..., argN) calls
+
+
+class TT(Enum):
+    "Token type enum."
+    number = auto()
+    ident = auto()
+    string = auto()
+    punct = auto()
+
+
+TOKENS = [
+    (re.compile(r"-?[0-9']+"), TT.number),
+    (re.compile(r"\w+"), TT.ident),
+    (re.compile(r'"([^\\"]|\\.)*"'), TT.string),
+    (re.compile(r"[(),]"), TT.punct),
+    (re.compile(r"/\*((?!\*/).)*\*/", re.S), None),  # C-style comment
+    (re.compile(r"//.*\n"), None),  # C++ style oneline comment
+    (re.compile(r"#.*(\\\n.*)*(?<!\\)\n"), None),  # preprocessor directive
+    (re.compile(r"\s+"), None),  # whitespace
+]
+
+Token = namedtuple("Token", "kind code")
+
+
+def report_unexpected(s, pos):
+    lines = (s[:pos] + "X").split("\n")
+    lineno, col = (len(lines), len(lines[-1]))
+    print(
+        "unexpected character %r in AnalyzerOptions.def at line %d column %d"
+        % (s[pos], lineno, col),
+        file=sys.stderr,
+    )
+
+
+def tokenize(s):
+    result = []
+    pos = 0
+    while pos < len(s):
+        for regex, kind in TOKENS:
+            if m := regex.match(s, pos):
+                if kind is not None:
+                    result.append(Token(kind, m.group(0)))
+                pos = m.end()
+                break
+        else:
+            report_unexpected(s, pos)
+            pos += 1
+    return result
+
+
+def join_strings(tokens):
+    result = []
+    for tok in tokens:
+        if tok.kind == TT.string and result and result[-1].kind == TT.string:
+            # If this token is a string, and the previous non-ignored token is
+            # also a string, then merge them into a single token. We need to
+            # discard the closing " of the previous string and the opening " of
+            # this string.
+            prev = result.pop()
+            result.append(Token(TT.string, prev.code[:-1] + tok.code[1:]))
+        else:
+            result.append(tok)
+    return result
+
+
+MacroCall = namedtuple("MacroCall", "name args")
+
+
+class State(Enum):
+    "States of the state machine used for parsing the macro calls."
+    init = auto()
+    after_ident = auto()
+    before_arg = auto()
+    after_arg = auto()
+
+
+def get_calls(tokens, macro_names):
+    state = State.init
+    result = []
+    current = None
+    for tok in tokens:
+        if state == State.init and tok.kind == TT.ident and tok.code in macro_names:
+            current = MacroCall(tok.code, [])
+            state = State.after_ident
+        elif state == State.after_ident and tok == Token(TT.punct, "("):
+            state = State.before_arg
+        elif state == State.before_arg:
+            if current is not None:
+                current.args.append(tok)
+                state = State.after_arg
+        elif state == State.after_arg and tok.kind == TT.punct:
+            if tok.code == ")":
+                result.append(current)
+                current = None
+                state = State.init
+            elif tok.code == ",":
+                state = State.before_arg
+        else:
+            current = None
+            state = State.init
+    return result
+
+
+# The information will be extracted from calls to these two macros:
+# #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL)
+# #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC,
+#                                              SHALLOW_VAL, DEEP_VAL)
+
+MACRO_NAMES_ARGCOUNTS = {
+    "ANALYZER_OPTION": 5,
+    "ANALYZER_OPTION_DEPENDS_ON_USER_MODE": 6,
+}
+
+
+def string_value(tok):
+    if tok.kind != TT.string:
+        raise ValueError(f"expected a string token, got {tok.kind.name}")
+    text = tok.code[1:-1]  # Remove quotes
+    text = re.sub(r"\\(.)", r"\1", text)  # Resolve backslash escapes
+    return text
+
+
+def cmdflag_to_rst_title(cmdflag_tok):
+    text = string_value(cmdflag_tok)
+    underline = "-" * len(text)
+    ref = f".. _analyzer-option-{text}:"
+
+    return f"{ref}\n\n{text}\n{underline}\n\n"
+
+
+def desc_to_rst_paragraphs(tok):
+    desc = string_value(tok)
+
+    # Escape a star that would act as inline emphasis within RST.
+    desc = desc.replace("ctu-max-nodes-*", r"ctu-max-nodes-\*")
+
+    # Many descriptions end with "Value: <list of accepted values>", which is
+    # OK for a terse command line printout, but should be prettified for web
+    # documentation.
+    # Moreover, the option ctu-invocation-list shows some example file content
+    # which is formatted as a preformatted block.
+    paragraphs = [desc]
+    extra = ""
+    if m := re.search(r"(^|\s)Value:", desc):
+        paragraphs = [desc[: m.start()], "Accepted values:" + desc[m.end() :]]
+    elif m := re.search(r"\s*Example file.content:", desc):
+        paragraphs = [desc[: m.start()]]
+        extra = "Example file content::\n\n  " + desc[m.end() :] + "\n\n"
+
+    wrapped = [textwrap.fill(p, width=80) for p in paragraphs if p.strip()]
+
+    return "\n\n".join(wrapped + [""]) + extra
+
+
+def default_to_rst(tok):
+    if tok.kind == TT.string:
+        if tok.code == '""':
+            return "(empty string)"
+        return tok.code
+    if tok.kind == TT.ident:
+        return tok.code
+    if tok.kind == TT.number:
+        return tok.code.replace("'", "")
+    raise ValueError(f"unexpected token as default value: {tok.kind.name}")
+
+
+def defaults_to_rst_paragraph(defaults):
+    strs = [default_to_rst(d) for d in defaults]
+
+    if len(strs) == 1:
+        return f"Default value: {strs[0]}\n\n"
+    if len(strs) == 2:
+        return (
+            f"Default value: {strs[0]} (in shallow mode) / {strs[1]} (in deep mode)\n\n"
+        )
+    raise ValueError("unexpected count of default values: %d" % len(defaults))
+
+
+def macro_call_to_rst_paragraphs(macro_call):
+    if len(macro_call.args) != MACRO_NAMES_ARGCOUNTS[macro_call.name]:
+        return ""
+
+    try:
+        _, _, cmdflag, desc, *defaults = macro_call.args
+
+        return (
+            cmdflag_to_rst_title(cmdflag)
+            + desc_to_rst_paragraphs(desc)
+            + defaults_to_rst_paragraph(defaults)
+        )
+    except ValueError as ve:
+        print(ve.args[0], file=sys.stderr)
+        return ""
+
+
+def get_option_list(input_file):
+    with open(input_file, encoding="utf-8") as f:
+        contents = f.read()
+    tokens = join_strings(tokenize(contents))
+    macro_calls = get_calls(tokens, MACRO_NAMES_ARGCOUNTS)
+
+    result = ""
+    for mc in macro_calls:
+        result += macro_call_to_rst_paragraphs(mc)
+    return result
+
+
+p = argparse.ArgumentParser()
+p.add_argument("-i", "--input", help="path to AnalyzerOptions.def")
+p.add_argument("-t", "--template", help="path of template file")
+p.add_argument("-o", "--output", help="path of output file")
----------------
steakhal wrote:

AH!, so the `-i` wasn't the flag of `python` itself. You could use the verbose names of these flags in CMake then to remove this confusion.

https://github.com/llvm/llvm-project/pull/135169