[llvm] [AArch64] A simple tool for generating a scheduling model draft from a SWOG (PR #131525)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 23:59:48 PDT 2025
https://github.com/FLZ101 updated https://github.com/llvm/llvm-project/pull/131525
>From 824c6b1586b73ca61406d1de928912e759926553 Mon Sep 17 00:00:00 2001
From: Franklin <zhangfenglei at huawei.com>
Date: Sun, 16 Mar 2025 23:49:22 +0800
Subject: [PATCH 1/2] [AArch64] A simple tool for generating a scheduling model
draft from a SWOG
---
llvm/utils/sched_model_gen.py | 425 ++++++++++++++++++++++++++++++++++
1 file changed, 425 insertions(+)
create mode 100644 llvm/utils/sched_model_gen.py
diff --git a/llvm/utils/sched_model_gen.py b/llvm/utils/sched_model_gen.py
new file mode 100644
index 0000000000000..de9cdb77221f6
--- /dev/null
+++ b/llvm/utils/sched_model_gen.py
@@ -0,0 +1,425 @@
+import argparse
+import json
+import re
+
+from fractions import Fraction
+
+
+# A simple tool for generating a scheduling model draft from a SWOG.
+#
+# It depends on the following python packages:
+#
+# pdf2docx
+#
+# Take "Neoverse N3 Core Software Optimization Guide.pdf" as an example. Below are
+# steps to get a scheduling model draft:
+#
+# 1. Write a JSON file which contains the pipelines N3 has and how symbols in
+# instruction tables refer to them.
+#
+# n3.json:
+#
+# {
+# "name": "N3",
+# "pipelines": [
+# "Branch",
+# "Integer Single-Cycle",
+# "Integer Single/Multi-Cycle",
+# "FP/ASIMD/Vector Store data",
+# "Load/Store",
+# "Load 2",
+# "Integer Store data"
+# ],
+# "symbols": {
+# "B": ["Branch", 0, 1],
+# "S": ["Integer Single-Cycle", 0, 1],
+# "M": ["Integer Single/Multi-Cycle", 0, 1],
+# "I": ["Integer Single-Cycle", 0, 1, "Integer Single/Multi-Cycle", 0, 1],
+# "M0": ["Integer Single/Multi-Cycle", 0],
+# "L01": ["Load/Store", 0, 1],
+# "L": ["Load/Store", 0, 1, "Load 2", 0],
+# "ID": ["Integer Store data", 0, 1],
+# "V": ["FP/ASIMD/Vector Store data", 0, 1],
+# "V0": ["FP/ASIMD/Vector Store data", 0],
+# "V1": ["FP/ASIMD/Vector Store data", 1]
+# }
+# }
+#
+# 2. Generate the scheduling model draft.
+#
+# python3 sched_model_gen.py -o n3.td n3.json "Neoverse N3 Core Software Optimization Guide.pdf"
+#
+# Basic idea:
+#
+# * We generate an InstRW for a row in the instruction tables based on a simple rule:
+# matching the throughput assuming all utilized units are fully utilized.
+#
+# Take the following row in the Neoverse N3 instruction tables as an example:
+#
+# Instruction Group Instructions Latency Throughput Pipelines
+# Branch and link, register BLR 1 2 B, S
+#
+# The throughput is 2, that means 2 instructions are execute in a cycle.
+# The pipeline B has 2 units, that means it can execute 2 B uops in a cycle,
+# so each instruction has 1 B uop. Similarly, each instruction has 1 S uop.
+# And the number of uops in an instruction is 2.
+#
+
+
def extract(pdf_file: str):
    """Extract every table row from *pdf_file*.

    Returns a list of strings, one per table row, with the cells joined
    by tabs. Newlines and tabs inside a cell are collapsed to spaces so
    the tab-separated format stays unambiguous.

    Requires the third-party ``pdf2docx`` package.
    """
    from pdf2docx import Converter

    cv = Converter(pdf_file)
    try:
        # extract_tables() returns a list of tables; each table is a list
        # of rows; each row is a list of cell strings (or None).
        tables = cv.extract_tables()
    finally:
        # Close the underlying PDF document even if extraction fails
        # (the original leaked the Converter on an exception here).
        cv.close()

    def cook(s: str):
        # Cells can be None (empty cell); normalize embedded whitespace.
        if s is None:
            return ""
        return s.replace("\n", " ").replace("\t", " ").strip()

    lines = []
    for table in tables:
        for row in table:
            lines.append("\t".join([cook(_) for _ in row]))
    return lines
+
+
def split(s, sep):
    """Split *s* on *sep*, strip each piece, and drop empty pieces."""
    pieces = []
    for piece in s.split(sep):
        piece = piece.strip()
        if piece:
            pieces.append(piece)
    return pieces
+
+
def join(arr, sep=""):
    """Stringify each element of *arr* and concatenate them with *sep*."""
    return sep.join(map(str, arr))
+
+
def generate(rows: list[dict], o: dict, ofs):
    """Emit a TableGen scheduling-model draft for *rows* to the stream *ofs*.

    rows -- parsed instruction-table rows (keys produced in main()._get_row);
            each row gains a "key" entry naming its SchedWriteRes record
    o    -- the JSON description: {"name": ..., "pipelines": [...], "symbols": {...}}
    ofs  -- writable text stream receiving the generated .td text
    """
    cpu_name: str = o["name"]

    # pipelines maps a pipeline name to the set of unit-index tuples that
    # the symbols refer to within that pipeline.
    pipelines = {}
    for x in o["pipelines"]:
        pipelines[x] = set()

    # symbols maps a symbol name (e.g. "I") to a list of
    # (pipeline name, sorted tuple of unit indices) pairs.
    symbols = {}
    for name, arr in o["symbols"].items():
        # Parse the flat JSON list: a string starts a new pipeline entry,
        # the integers that follow are unit indices of that pipeline.
        desc = {}
        current = None
        for x in arr:
            if isinstance(x, str):
                assert x in pipelines
                assert x not in desc
                current = x
                desc[current] = []
            else:
                assert isinstance(x, int)
                assert current  # an index must be preceded by a pipeline name
                desc[current].append(x)

        symbols[name] = []
        for x in desc:
            desc[x] = tuple(sorted(set(desc[x])))
            pipelines[x].add(desc[x])
            symbols[name].append(tuple([x, desc[x]]))

    def divide(parts: list):
        # Repeatedly split overlapping unit-index tuples until they are
        # pairwise disjoint, so each final tuple can become one
        # ProcResource and every symbol is a union of whole tuples.
        while True:
            parts = [_ for _ in parts if _]

            def try_divide():
                # Find one overlapping pair and split out the overlap as
                # a third part. Returns True if anything changed.
                n = len(parts)
                for i in range(n - 1):
                    for j in range(i + 1, n):
                        pi = set(parts[i])
                        pj = set(parts[j])
                        pk = pi & pj
                        if pk:
                            parts[i] = tuple(sorted(pi - pk))
                            parts[j] = tuple(sorted(pj - pk))
                            parts.append(tuple(sorted(pk)))
                            return True
                return False

            if not try_divide():
                break
        return parts

    # proc_resource_names maps (pipeline name, unit-index tuple) to the
    # short identifier chosen for its ProcResource ("" until assigned).
    proc_resource_names = {}

    for name in pipelines:
        parts = list(pipelines[name])
        pipelines[name] = sorted(divide(parts))

        for x in pipelines[name]:
            proc_resource_names[tuple([name, x])] = ""

    # Prefer naming a resource after a symbol that denotes exactly it.
    for symbol_name, arr in symbols.items():
        if len(arr) == 1:
            t = arr[0]
            if t in proc_resource_names:
                proc_resource_names[t] = symbol_name

    # Otherwise derive a name from the pipeline name (alphanumerics only),
    # appending the unit indices when the pipeline was divided.
    for t in proc_resource_names:
        if not proc_resource_names[t]:
            name = t[0]
            s = "".join(
                [
                    _
                    for _ in name
                    if ord("a") <= ord(_) <= ord("z")
                    or ord("A") <= ord(_) <= ord("Z")
                    or ord("0") <= ord(_) <= ord("9")
                ]
            )
            if len(pipelines[name]) > 1:
                s += join(t[1])
            proc_resource_names[t] = s

    def _unit_name(x):
        # Final TableGen identifier for a resource or resource group.
        return f"{cpu_name}Unit{x}"

    # Write ProcResource
    for name, arr in pipelines.items():
        for x in arr:
            resource_name = proc_resource_names[tuple([name, x])]

            comment = f"""{name} {join(x, '/')}"""
            if name[-1].isdigit():
                # e.g. "Load 2": the trailing digit already identifies the
                # unit, so do not repeat the indices in the comment.
                comment = name

            print(
                f"""def {_unit_name(resource_name)} : ProcResource<{len(x)}>; // {comment}""",
                file=ofs,
            )
    print("", file=ofs)

    def decompose(name, parts):
        # Return the divided tuples of pipeline *name* covered by *parts*.
        res = []
        for x in pipelines[name]:
            if set(x) <= set(parts):
                res.append(x)
        return sorted(res)

    # Write ProcResGroup
    for symbol_name, arr in symbols.items():
        if len(arr) == 1 and arr[0] in proc_resource_names:
            # The symbol is exactly one ProcResource; no group needed.
            continue

        names = []
        for t in arr:
            parts = decompose(t[0], t[1])
            for x in parts:
                names.append(proc_resource_names[tuple([t[0], x])])
        names = [_unit_name(_) for _ in names]
        print(
            f"""def {_unit_name(symbol_name)} : ProcResGroup<[{join(names, ', ')}]>;""",
            file=ofs,
        )

    # writes maps a SchedWriteRes identifier to its TableGen record text;
    # key_to_num_micro_ops remembers each record's uop count for sorting.
    writes = {}
    key_to_num_micro_ops = {}

    def add_write(latency, units, release_at_cycles, num_micro_ops, iterative: bool):
        # Create (or reuse) a SchedWriteRes record and return its name.
        # Iterative rows (latency/throughput given as a range) keep one uop
        # per symbol and model the multi-cycle occupancy via
        # ReleaseAtCycles; otherwise a unit used for N cycles is modeled
        # as N single-cycle uops.
        assert len(units) == len(release_at_cycles)

        if iterative:
            num_micro_ops = len(release_at_cycles)

            postfix = "_".join(["1%s" % _ for _ in units])
            postfix = "%s_%s" % (postfix, join(release_at_cycles, "_"))

        else:
            units_2 = []
            release_at_cycles_2 = []
            for x, y in zip(units, release_at_cycles):
                units_2.extend([x] * y)
                release_at_cycles_2.extend([1] * y)

            # Postfix is built from the original (unexpanded) lists.
            postfix = "_".join(
                ["%s%s" % (_2, _1) for _1, _2 in zip(units, release_at_cycles)]
            )

            units = units_2
            release_at_cycles = release_at_cycles_2

        units = [f"{_unit_name(_)}" for _ in units]

        key = f"{cpu_name}Write_%sc_%s" % (latency, postfix)
        if key in writes:
            # An identical record was already generated; share it.
            return key

        assert key not in key_to_num_micro_ops
        key_to_num_micro_ops[key] = num_micro_ops

        text = f"""
def {key} : SchedWriteRes<[{', '.join(units)}]> {{
  let Latency = {latency};"""

        if num_micro_ops > 1:
            text += f"""
  let NumMicroOps = {num_micro_ops};"""

        if sum(release_at_cycles) != len(release_at_cycles):
            text += f"""
  let ReleaseAtCycles = {release_at_cycles};"""

        text += """
}
"""
        writes[key] = text

        return key

    for row in rows:
        latency = row["latency"]
        throughput = row["throughput"]

        def _get_n_units(name):
            # Total number of units the symbol *name* may issue to.
            res = 0
            for x in symbols[name]:
                res += len(x[1])
            return res

        # Assume all utilized units are fully utilized: with throughput T
        # instructions per cycle and U units, each instruction gets U / T
        # uops (see the module comment's BLR example).
        n_units = sum([_get_n_units(_) for _ in row["units"]])
        num_micro_ops = int(n_units / throughput)

        release_at_cycles = [int(_get_n_units(_) / throughput) for _ in row["units"]]

        if num_micro_ops != sum(release_at_cycles):
            # The per-symbol rounding lost uops; flag the row for manual
            # review instead of silently emitting an inconsistent record.
            print("[UNEXPECTED]", json.dumps(row))

        row["key"] = add_write(
            latency, row["units"], release_at_cycles, num_micro_ops, row["iterative"]
        )

    # Write SchedWriteRes, sorted by uop count.
    for key in sorted(
        key_to_num_micro_ops.keys(), key=lambda _: key_to_num_micro_ops[_]
    ):
        print(writes[key], end="", file=ofs)

    # Write InstRW
    for row in rows:
        if "key" not in row:
            continue

        print(
            f"""
// {row['group']}
def : InstRW<[{row['key']}], (instrs {', '.join(row['instrs'])})>;
""",
            end="",
            file=ofs,
        )
+
+
def main():
    """Command-line entry: parse args, extract the PDF tables, write the draft.

    Usage: sched_model_gen.py -o TD_FILE JSON_FILE PDF_FILE
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", metavar="TD_FILE", required=True)
    parser.add_argument("json_file", metavar="JSON_FILE")
    parser.add_argument("pdf_file", metavar="PDF_FILE")

    args = parser.parse_args()

    td_file = args.o
    json_file = args.json_file
    pdf_file = args.pdf_file

    # One tab-separated string per table row found in the PDF.
    lines = extract(pdf_file)

    # The CPU description: {"name": ..., "pipelines": [...], "symbols": {...}}.
    with open(json_file, encoding="UTF-8") as ifs:
        o = json.load(ifs)

    def _get_row(line: list[str]):
        # Parse one table row (a list of cells) into a row dict; return
        # None when the row is not a usable instruction-table row.
        if len(line) == 5:
            # The trailing notes column may be absent; treat it as empty.
            line.append("")
        if len(line) != 6:
            return None

        latency = line[2]
        throughput = line[3]

        # The pipelines cell must be a comma-separated list of known symbols.
        units = split(line[4], ",")
        if len(units) == 0:
            return None
        for x in units:
            if x not in o["symbols"]:
                return None

        if latency in ["-", ""]:
            # No usable latency; report the row so it can be handled by hand.
            print("[SKIPPED]", line)
            return None

        iterative = False

        if "to" in latency:
            # A latency range like "4 to 7": take the upper bound and
            # mark the row as iterative.
            latency = split(latency, "to")[1]
            iterative = True
        m = re.fullmatch(r"(\d+) ?\(\d+\)", latency)
        if m:
            # A form like "4 (7)": keep the first number.
            latency = m.group(1)
        latency = int(latency)

        if "to" in throughput:
            # A throughput range: take the lower bound, mark iterative.
            throughput = split(throughput, "to")[0]
            iterative = True
        if "/" in throughput:
            # Fractional throughput such as "1/2".
            throughput = Fraction(throughput)
        else:
            throughput = int(throughput)

        row = {
            "group": line[0],
            "instrs": split(line[1], ","),
            "latency": latency,
            "throughput": throughput,
            "units": units,
            "note": line[5],
            "iterative": iterative,
        }
        return row

    def get_row(line):
        # int()/Fraction() raise ValueError on malformed numeric cells;
        # such rows are simply not instruction rows.
        try:
            return _get_row(line)
        except ValueError:
            return None

    def get_rows(lines: list[str]):
        # Scan all extracted rows with a two-state machine: rows are only
        # collected after an instruction-table header ("ROW" state).
        rows = []
        state = "BEGIN"
        for line in lines:
            line = [_.strip() for _ in line.split("\t")]

            def is_head():
                # True when *line* is an instruction-table header row.
                return (
                    len(line) == 6
                    and line[0] == "Instruction Group"
                    and line[2] == "Exec Latency"
                    and line[3] == "Execution Throughput"
                )

            if state == "BEGIN":
                if is_head():
                    state = "ROW"
            else:
                assert state == "ROW"

                row = get_row(line)
                if row:
                    rows.append(row)
                else:
                    # A non-row line ends the current table unless it is
                    # itself the header of the next table.
                    state = "ROW" if is_head() else "BEGIN"
        return rows

    rows = get_rows(lines)

    with open(td_file, "w", encoding="UTF-8") as ofs:
        generate(rows, o, ofs)
+
+
# Run main() only when executed as a script, not on import.
if __name__ == "__main__":
    main()
>From 4e9ab9c8f419019fdef4929a14d5b32ae8e95155 Mon Sep 17 00:00:00 2001
From: Franklin <zhangfenglei at huawei.com>
Date: Tue, 18 Mar 2025 14:59:40 +0800
Subject: [PATCH 2/2] Update comments
---
llvm/utils/sched_model_gen.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/utils/sched_model_gen.py b/llvm/utils/sched_model_gen.py
index de9cdb77221f6..aabdb61ecc943 100644
--- a/llvm/utils/sched_model_gen.py
+++ b/llvm/utils/sched_model_gen.py
@@ -52,17 +52,16 @@
# Basic idea:
#
# * We generate an InstRW for a row in the instruction tables based on a simple rule:
-# matching the throughput assuming all utilized units are fully utilized.
+# match the throughput assuming all utilized units are fully utilized.
#
# Take the following row in the Neoverse N3 instruction tables as an example:
#
# Instruction Group Instructions Latency Throughput Pipelines
# Branch and link, register BLR 1 2 B, S
#
-# The throughput is 2, that means 2 instructions are execute in a cycle.
-# The pipeline B has 2 units, that means it can execute 2 B uops in a cycle,
-# so each instruction has 1 B uop. Similarly, each instruction has 1 S uop.
-# And the number of uops in an instruction is 2.
+# The throughput is 2, that means 2 instructions are executed in a cycle.
+# The pipelines B and S each have 2 units, that means 2 B uops and 2 S uops
+# are executed in a cycle. So each instruction has 1 B uop and 1 S uop.
#
More information about the llvm-commits
mailing list