[llvm] [BOLT][NFC] Script for automatic user guide generation (PR #93822)

Thu May 30 08:33:27 PDT 2024

https://github.com/ElvinaYakubova updated https://github.com/llvm/llvm-project/pull/93822

>From b1ac91ce1f078b79ecf1f2fd96903f10c73d6d0c Mon Sep 17 00:00:00 2001
From: Elvina Yakubova <eyakubova at nvidia.com>
Date: Thu, 30 May 2024 20:06:19 +0530
Subject: [PATCH] [BOLT] Script for automatic user guide generation

---
 bolt/docs/CommandLineArgumentReference.md | 227 +++++++++-------------
 bolt/docs/generate_doc.py                 | 149 ++++++++++++++
 bolt/lib/Rewrite/DWARFRewriter.cpp        |   2 +-
 3 files changed, 239 insertions(+), 139 deletions(-)
 create mode 100644 bolt/docs/generate_doc.py

diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index 1951ad5a2dc5e..8887d1f5d5bd4 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -6,41 +6,37 @@
 
 ## OPTIONS
 
-### Generic options
+### Generic options:
 
 - `-h`
 
-  Alias for `--help`
+  Alias for --help
 
 - `--help`
 
-  Display available options (`--help-hidden` for more).
+  Display available options (--help-hidden for more)
 
 - `--help-hidden`
 
-  Display all available options.
+  Display all available options
 
 - `--help-list`
 
-  Display list of available options (`--help-list-hidden` for more).
+  Display list of available options (--help-list-hidden for more)
 
 - `--help-list-hidden`
 
-  Display list of all available options.
+  Display list of all available options
 
-- `--print-all-options`
-
-  Print all option values after command line parsing.
-
-- `--print-options`
+- `--version`
 
-  Print non-default options after command line parsing.
+  Display the version of this program
 
-- `--version`
+### Output options:
 
-  Display the version of this program.
+- `--bolt-info`
 
-### Output options
+  Write bolt info section in the output binary
 
 - `-o <string>`
 
@@ -50,7 +46,7 @@
 
   Save recorded profile to a file
 
-### BOLT generic options
+### BOLT generic options:
 
 - `--align-text=<uint>`
 
@@ -89,15 +85,20 @@
 
 - `--data=<string>`
 
-  <data file>
+  data file
+
+- `--data2=<string>`
+
+  data file
 
 - `--debug-skeleton-cu`
 
-  Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched.
+  Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched.
 
 - `--deterministic-debuginfo`
 
-  Disables parallel execution of tasks that may produce nondeterministic debug info
+  Disables parallel execution of tasks that may produce nondeterministic debug
+  info
 
 - `--dot-tooltip-code`
 
@@ -113,7 +114,7 @@
 
 - `--dump-dot-all`
 
-  Dump function CFGs to graphviz format after each stage; enable '-print-loops'
+  Dump function CFGs to graphviz format after each stage; enable '-print-loops'
   for color-coded blocks
 
 - `--dump-orc`
@@ -179,8 +180,8 @@
 - `--hot-text`
 
   Generate hot text symbols. Apply this option to a precompiled binary that
-  manually calls into hugify, such that at runtime hugify call will put hot
-  code into 2M pages. This requires relocation.
+  manually calls into hugify, such that at runtime hugify call will put hot code
+  into 2M pages. This requires relocation.
 
 - `--hot-text-move-sections=<sec1,sec2,sec3,...>`
 
@@ -227,15 +228,15 @@
 - `--profile-format=<value>`
 
   Format to dump profile output in aggregation mode, default is fdata
-  - `=fdata`: offset-based plaintext format
-  - `=yaml`: dense YAML representation
+  - `fdata`: offset-based plaintext format
+  - `yaml`: dense YAML representation
 
 - `--r11-availability=<value>`
 
   Determine the availability of r11 before indirect branches
-  - `=never`: r11 not available
-  - `=always`: r11 available before calls and jumps
-  - `=abi`r11 available before calls but not before jumps
+  - `never`: r11 not available
+  - `always`: r11 available before calls and jumps
+  - `abi`: r11 available before calls but not before jumps
 
 - `--relocs`
 
@@ -283,7 +284,8 @@
 
 - `--trap-avx512`
 
-  In relocation mode trap upon entry to any function that uses AVX-512 instructions
+  In relocation mode trap upon entry to any function that uses AVX-512
+  instructions
 
 - `--trap-old-code`
 
@@ -311,7 +313,7 @@
   Output a single dwarf package file (dwp) instead of multiple non-relocatable
   dwarf object files (dwo).
 
-### BOLT optimization options
+### BOLT optimization options:
 
 - `--align-blocks`
 
@@ -357,13 +359,14 @@
 
 - `--cg-use-split-hot-size`
 
-  Use hot/cold data on basic blocks to determine hot sizes for call graph functions
+  Use hot/cold data on basic blocks to determine hot sizes for call graph
+  functions
 
 - `--cold-threshold=<uint>`
 
   Tenths of percents of main entry frequency to use as a threshold when
-  evaluating whether a basic block is cold (0 means it is only considered
-  cold if the block has zero samples). Default: 0
+  evaluating whether a basic block is cold (0 means it is only considered cold
+  if the block has zero samples). Default: 0
 
 - `--elim-link-veneers`
 
@@ -375,8 +378,8 @@
 
 - `--equalize-bb-counts`
 
-  Use same count for BBs that should have equivalent count (used in non-LBR
-  and shrink wrapping)
+  Use same count for BBs that should have equivalent count (used in non-LBR and
+  shrink wrapping)
 
 - `--execution-count-threshold=<uint>`
 
@@ -438,8 +441,8 @@
 
 - `--icp-calls-remaining-percent-threshold=<uint>`
 
-  The percentage threshold against remaining unpromoted indirect call count
-  for the promotion for calls
+  The percentage threshold against remaining unpromoted indirect call count for
+  the promotion for calls
 
 - `--icp-calls-topn`
 
@@ -518,22 +521,18 @@
 
 - `--indirect-call-promotion-jump-tables-topn=<uint>`
 
-  Limit number of targets to consider when doing indirect call promotion on
-  jump tables. 0 = no limit
-
-- `--indirect-call-promotion-mispredict-threshold=<uint>`
-
-  Misprediction threshold for skipping ICP on an indirect call
+  Limit number of targets to consider when doing indirect call promotion on jump
+  tables. 0 = no limit
 
 - `--indirect-call-promotion-topn=<uint>`
 
-  Limit number of targets to consider when doing indirect call promotion.
-  0 = no limit
+  Limit number of targets to consider when doing indirect call promotion. 0 = no
+  limit
 
 - `--indirect-call-promotion-use-mispredicts`
 
   Use misprediction frequency for determining whether or not ICP should be
-  applied at a callsite. The `-indirect-call-promotion-mispredict-threshold`
+  applied at a callsite.  The -indirect-call-promotion-mispredict-threshold
   value will be used by this heuristic
 
 - `--infer-fall-throughs`
@@ -566,11 +565,13 @@
 
 - `--inline-small-functions`
 
-  Inline functions if increase in size is less than defined by `-inline-small-functions-bytes`
+  Inline functions if increase in size is less than defined by
+  -inline-small-functions-bytes
 
 - `--inline-small-functions-bytes=<uint>`
 
-  Max number of bytes for the function to be considered small for inlining purposes
+  Max number of bytes for the function to be considered small for inlining
+  purposes
 
 - `--instrument`
 
@@ -590,7 +591,7 @@
   Make jump tables size smaller at the cost of using more instructions at jump
   sites
 
-- `-jump-tables=<value>`
+- `--jump-tables=<value>`
 
   Jump tables support (default=basic)
   - `none`: do not optimize functions with jump tables
@@ -780,23 +781,22 @@
 - `--split-strategy=<value>`
 
   Strategy used to partition blocks into fragments
-
-  - `profile2`: split each function into a hot and cold fragment using
-  profiling information
+  - `profile2`: split each function into a hot and cold fragment using profiling
+  information
   - `cdsplit`: split each function into a hot, warm, and cold fragment using
   profiling information
   - `random2`: split each function into a hot and cold fragment at a randomly
   chosen split point (ignoring any available profiling information)
-  - `randomN`: split each function into N fragments at randomly chosen split
+  - `randomN`: split each function into N fragments at randomly chosen split
   points (ignoring any available profiling information)
-  - `all`: split all basic blocks of each function into fragments such that
-  each fragment contains exactly a single basic block
+  - `all`: split all basic blocks of each function into fragments such that each
+  fragment contains exactly a single basic block
 
 - `--split-threshold=<uint>`
 
   Split function only if its main size is reduced by more than given amount of
-  bytes. Default value: 0, i.e. split iff the size is reduced. Note that on
-  some architectures the size can increase after splitting.
+  bytes. Default value: 0, i.e. split iff the size is reduced. Note that on some
+  architectures the size can increase after splitting.
 
 - `--stale-matching-max-func-size=<uint>`
 
@@ -817,19 +817,20 @@
 - `--tail-duplication=<value>`
 
   Duplicate unconditional branches that cross a cache line
-
-  - `none` do not apply
-  - `aggressive` aggressive strategy
-  - `moderate` moderate strategy
-  - `cache` cache-aware duplication strategy
+  - `none`: do not apply
+  - `aggressive`: aggressive strategy
+  - `moderate`: moderate strategy
+  - `cache`: cache-aware duplication strategy
 
 - `--tsp-threshold=<uint>`
 
-  Maximum number of hot basic blocks in a function for which to use a precise TSP solution while re-ordering basic blocks
+  Maximum number of hot basic blocks in a function for which to use a precise
+  TSP solution while re-ordering basic blocks
 
 - `--use-aggr-reg-reassign`
 
-  Use register liveness analysis to try to find more opportunities for -reg-reassign optimization
+  Use register liveness analysis to try to find more opportunities for
+  -reg-reassign optimization
 
 - `--use-compact-aligner`
 
@@ -847,21 +848,16 @@
 
   Only apply branch boundary alignment in hot code
 
-- `--x86-strip-redundant-address-size`
+### BOLT options in relocation mode:
 
-  Remove redundant Address-Size override prefix
-
-### BOLT options in relocation mode
-
-- `-align-macro-fusion=<value>`
+- `--align-macro-fusion=<value>`
 
   Fix instruction alignment for macro-fusion (x86 relocation mode)
-
   - `none`: do not insert alignment no-ops for macro-fusion
   - `hot`: only insert alignment no-ops on hot execution paths (default)
   - `all`: always align instructions to allow macro-fusion
 
-### BOLT instrumentation options
+### BOLT instrumentation options:
 
 `llvm-bolt <executable> -instrument [-o outputfile] <instrumented-executable>`
 
@@ -893,72 +889,21 @@
 
 - `--instrumentation-no-counters-clear`
 
-  Don't clear counters across dumps (use with `instrumentation-sleep-time` option)
+  Don't clear counters across dumps (use with instrumentation-sleep-time option)
 
 - `--instrumentation-sleep-time=<uint>`
 
   Interval between profile writes (default: 0 = write only at program end).
   This is useful for service workloads when you want to dump profile every X
-  minutes or if you are killing the program and the profile is not being
-  dumped at the end.
+  minutes or if you are killing the program and the profile is not being dumped
+  at the end.
 
 - `--instrumentation-wait-forks`
 
   Wait until all forks of instrumented process will finish (use with
-  `instrumentation-sleep-time` option)
-
-### Data aggregation options (perf2bolt)
-
-`perf2bolt -p perf.data [-o outputfile] perf.fdata <executable>`
-
-- `--autofdo`
-
-  Generate autofdo textual data instead of bolt data
-
-- `--filter-mem-profile`
-
-  If processing a memory profile, filter out stack or heap accesses that won't
-  be useful for BOLT to reduce profile file size
-
-- `--ignore-build-id`
-
-  Continue even if build-ids in input binary and perf.data mismatch
-
-- `--ignore-interrupt-lbr`
-
-  Ignore kernel interrupt LBR that happens asynchronously
-
-- `--itrace=<string>`
-
-  Generate LBR info with perf itrace argument
+  instrumentation-sleep-time option)
 
-- `--nl`
-
-  Aggregate basic samples (without LBR info)
-
-- `--pa`
-
-  Skip perf and read data from a pre-aggregated file format
-
-- `--perfdata=<string>`
-
-  Data file
-
-- `--pid=<ulong>`
-
-  Only use samples from process with specified PID
-
-- `--time-aggr`
-
-  Time BOLT aggregator
-
-- `--use-event-pc`
-
-  Use event PC in combination with LBR sampling
-
-### BOLT printing options
-
-#### Generic options
+### BOLT printing options:
 
 - `--print-aliases`
 
@@ -1032,10 +977,10 @@
 - `--print-pseudo-probes=<value>`
 
   Print pseudo probe info
-  - `=decode`: decode probes section from binary
-  - `=address_conversion`: update address2ProbesMap with output block address
-  - `=encoded_probes`: display the encoded probes in binary section
-  - `=all`: enable all debugging printout
+  - `decode`: decode probes section from binary
+  - `address_conversion`: update address2ProbesMap with output block address
+  - `encoded_probes`: display the encoded probes in binary section
+  - `all`: enable all debugging printout
 
 - `--print-relocations`
 
@@ -1061,11 +1006,13 @@
 
   Print names of functions with unknown control flow
 
-- `--time-opts`
+- `--time-build`
 
-  Print time spent in each optimization
+  Print time spent constructing binary functions
+
+- `--time-rewrite`
 
-#### Optimization options
+  Print time spent in rewriting passes
 
 - `--print-after-branch-fixup`
 
@@ -1204,10 +1151,14 @@
 
   Print functions after veneer elimination pass
 
-- `--time-build`
+- `--time-opts`
 
-  Print time spent constructing binary functions
+  Print time spent in each optimization
 
-- `--time-rewrite`
+- `--print-all-options`
 
-  Print time spent in rewriting passes
+  Print all option values after command line parsing
+
+- `--print-options`
+
+  Print non-default options after command line parsing
\ No newline at end of file
diff --git a/bolt/docs/generate_doc.py b/bolt/docs/generate_doc.py
new file mode 100644
index 0000000000000..d8829daf677b4
--- /dev/null
+++ b/bolt/docs/generate_doc.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+# A tool to parse the output of `llvm-bolt --help-hidden` and update the
+# documentation in CommandLineArgumentReference.md automatically.
+# Run from the directory in which this file is located to update the docs.
+
+import subprocess
+from textwrap import wrap
+
+LINE_LIMIT = 80
+
+
+def wrap_text(text, indent, limit=LINE_LIMIT):
+    wrapped_lines = wrap(text, width=limit - len(indent))
+    wrapped_text = ("\n" + indent).join(wrapped_lines)
+    return wrapped_text
+
+
+def add_info(sections, section, option, description):
+    indent = "  "
+    wrapped_description = "\n".join(
+        [
+            wrap_text(line, indent) if len(line) > LINE_LIMIT else line
+            for line in description
+        ]
+    )
+    sections[section].append((option, indent + wrapped_description))
+
+
+def parse_bolt_options(output):
+    section_headers = [
+        "Generic options:",
+        "Output options:",
+        "BOLT generic options:",
+        "BOLT optimization options:",
+        "BOLT options in relocation mode:",
+        "BOLT instrumentation options:",
+        "BOLT printing options:",
+    ]
+
+    sections = {key: [] for key in section_headers}
+    current_section, prev_section = None, None
+    option, description = None, []
+
+    for line in output.split("\n"):
+        cleaned_line = line.strip()
+
+        if cleaned_line.casefold() in map(str.casefold, section_headers):
+            if prev_section != None:  # Save last option from prev section
+                add_info(sections, current_section, option, description)
+                option, description = None, []
+
+            cleaned_line = cleaned_line.split()
+            # Apply lowercase to all words except the first one
+            cleaned_line = [cleaned_line[0]] + [
+                word.lower() for word in cleaned_line[1:]
+            ]
+            # Join the words back together into a string
+            cleaned_line = " ".join(cleaned_line)
+
+            current_section = cleaned_line
+            prev_section = current_section
+            continue
+
+        if cleaned_line.startswith("-"):
+            if option and description:
+                # A new option starts on this line: store the previously
+                # collected option and its description before parsing it
+                add_info(sections, current_section, option, description)
+                option, description = None, []
+
+            parts = cleaned_line.split("  ", 1)
+            if len(parts) > 1:
+                option = parts[0].strip()
+                descr = parts[1].strip()
+                descr = descr[2].upper() + descr[3:]
+                description = [descr]
+                if option.startswith("--print") or option.startswith("--time"):
+                    current_section = "BOLT printing options:"
+                elif prev_section != None:
+                    current_section = prev_section
+            continue
+
+        if cleaned_line.startswith("="):
+            parts = cleaned_line.split(maxsplit=1)
+            # Split into two parts: sub-option and description
+            if len(parts) == 2:
+                # Rejoin with a single space
+                cleaned_line = parts[0] + " " + parts[1].rstrip()
+            description.append(cleaned_line)
+        elif cleaned_line:  # Multiline description continuation
+            description.append(cleaned_line)
+
+    add_info(sections, current_section, option, description)
+    return sections
+
+
+def generate_markdown(sections):
+    markdown_lines = [
+        "# BOLT - a post-link optimizer developed to speed up large applications\n",
+        "## SYNOPSIS\n",
+        "`llvm-bolt <executable> [-o outputfile] <executable>.bolt "
+        "[-data=perf.fdata] [options]`\n",
+        "## OPTIONS",
+    ]
+
+    for section, options in sections.items():
+        markdown_lines.append(f"\n### {section}")
+        if section == "BOLT instrumentation options:":
+            markdown_lines.append(
+                f"\n`llvm-bolt <executable> -instrument"
+                " [-o outputfile] <instrumented-executable>`"
+            )
+        for option, desc in options:
+            markdown_lines.append(f"\n- `{option}`\n")
+            # Split description into lines to handle sub-options
+            desc_lines = desc.split("\n")
+            for line in desc_lines:
+                if line.startswith("="):
+                    # Sub-option: correct formatting with bullet
+                    sub_option, sub_desc = line[1:].split(" ", 1)
+                    markdown_lines.append(f"  - `{sub_option}`: {sub_desc[4:]}")
+                else:
+                    # Regular line of description
+                    if line[2:].startswith("<"):
+                        line = line.replace("<", "").replace(">", "")
+                    markdown_lines.append(f"{line}")
+
+    return "\n".join(markdown_lines)
+
+
+def main():
+    try:
+        help_output = subprocess.run(
+            ["llvm-bolt", "--help-hidden"], capture_output=True, text=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as e:
+        print("Failed to execute llvm-bolt --help:")
+        print(e)
+        return
+
+    sections = parse_bolt_options(help_output)
+    markdown = generate_markdown(sections)
+
+    with open("CommandLineArgumentReference.md", "w") as md_file:
+        md_file.write(markdown)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index ab46503621e9a..8814ebbd10aa5 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -352,7 +352,7 @@ static cl::opt<bool> CreateDebugNames(
 
 static cl::opt<bool>
     DebugSkeletonCu("debug-skeleton-cu",
-                    cl::desc("prints out offsetrs for abbrev and debu_info of "
+                    cl::desc("prints out offsets for abbrev and debug_info of "
                              "Skeleton CUs that get patched."),
                     cl::ZeroOrMore, cl::Hidden, cl::init(false),
                     cl::cat(BoltCategory));