[clang] [Tooling/Inclusion] Optimize cppreference parser (PR #114832)
Vadim D. via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 4 09:18:15 PST 2024
https://github.com/vvd170501 created https://github.com/llvm/llvm-project/pull/114832
These are the changes that were reverted from #113612. They don't affect clang itself, but they speed up the std symbol mapping generator by ~1.5x.
On a machine with 2 cores, it previously took 3 minutes to generate the mapping; now it takes only 2 minutes.
With 8 cores, the time goes down from ~50s to ~30s.
>From cde239f6168e525d543f90a029ba643bc1b8c797 Mon Sep 17 00:00:00 2001
From: Vadim Dudkin <vvd170501 at gmail.com>
Date: Mon, 4 Nov 2024 19:53:35 +0300
Subject: [PATCH] Don't reparse symbol pages (speedup up to 1.5x)
---
.../include-mapping/cppreference_parser.py | 48 +++++++++++--------
1 file changed, 27 insertions(+), 21 deletions(-)
diff --git a/clang/tools/include-mapping/cppreference_parser.py b/clang/tools/include-mapping/cppreference_parser.py
index 9101f3dbff0f94..d26f4c4395ab92 100644
--- a/clang/tools/include-mapping/cppreference_parser.py
+++ b/clang/tools/include-mapping/cppreference_parser.py
@@ -40,7 +40,7 @@ def _HasClass(tag, *classes):
return False
-def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
+def _ParseSymbolPage(symbol_page_html, symbols):
"""Parse symbol page and retrieve the include header defined in this page.
The symbol page provides header for the symbol, specifically in
"Defined in header <header>" section. An example:
@@ -51,8 +51,12 @@ def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
Returns a list of headers.
"""
- headers = set()
+ headers = collections.defaultdict(set)
all_headers = set()
+ symbol_names = {}
+ for symbol_name, qualified_symbol_name in symbols:
+ symbol_names[symbol_name] = symbol_name
+ symbol_names[qualified_symbol_name] = symbol_name
soup = BeautifulSoup(symbol_page_html, "html.parser")
# Rows in table are like:
@@ -69,11 +73,10 @@ def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
was_decl = True
# Symbols are in the first cell.
found_symbols = row.find("td").stripped_strings
- if not any(
- sym == symbol_name or sym == qual_name for sym in found_symbols
- ):
- continue
- headers.update(current_headers)
+ for found_symbol in found_symbols:
+ symbol_name = symbol_names.get(found_symbol)
+ if symbol_name:
+ headers[symbol_name].update(current_headers)
elif _HasClass(row, "t-dsc-header"):
# If we saw a decl since the last header, this is a new block of headers
# for a new block of decls.
@@ -88,7 +91,10 @@ def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
current_headers.append(header_code.text)
all_headers.add(header_code.text)
# If the symbol was never named, consider all named headers.
- return headers or all_headers
+ return [
+ (symbol_name, headers.get(symbol_name) or all_headers)
+ for symbol_name, _ in symbols
+ ]
def _ParseSymbolVariant(caption):
@@ -138,9 +144,9 @@ def _ParseIndexPage(index_page_html):
return symbols
-def _ReadSymbolPage(path, name, qual_name):
+def _ReadSymbolPage(path, symbols):
with open(path) as f:
- return _ParseSymbolPage(f.read(), name, qual_name)
+ return _ParseSymbolPage(f.read(), symbols)
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
@@ -159,6 +165,7 @@ def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
with open(index_page_path, "r") as f:
# Read each symbol page in parallel.
results = [] # (symbol_name, promise of [header...])
+ symbols_by_page = collections.defaultdict(list)
for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
# Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
# FIXME: use these as a fallback rather than ignoring entirely.
@@ -167,25 +174,24 @@ def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
if variant and variant not in variants_for_symbol:
continue
path = os.path.join(root_dir, symbol_page_path)
- if os.path.isfile(path):
- results.append(
- (
- symbol_name,
- pool.apply_async(
- _ReadSymbolPage, (path, symbol_name, qualified_symbol_name)
- ),
- )
- )
+ if path in symbols_by_page or os.path.isfile(path):
+ symbols_by_page[path].append((symbol_name, qualified_symbol_name))
else:
sys.stderr.write(
"Discarding information for symbol: %s. Page %s does not exist.\n"
% (symbol_name, path)
)
+ for path, symbols in symbols_by_page.items():
+ results.append(
+ pool.apply_async(_ReadSymbolPage, (path, symbols)),
+ )
+
# Build map from symbol name to a set of headers.
symbol_headers = collections.defaultdict(set)
- for symbol_name, lazy_headers in results:
- symbol_headers[symbol_name].update(lazy_headers.get())
+ for lazy_mapping in results:
+ for symbol_name, headers in lazy_mapping.get():
+ symbol_headers[symbol_name].update(headers)
symbols = []
for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0]):
More information about the cfe-commits
mailing list