[PATCH] D142555: Refactor symbol page parsing.

Wed Jan 25 08:24:43 PST 2023

VitaNuo created this revision.
Herald added a project: All.
VitaNuo requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D142555

Files:
  clang/tools/include-mapping/cppreference_parser.py


Index: clang/tools/include-mapping/cppreference_parser.py
===================================================================

--- clang/tools/include-mapping/cppreference_parser.py
+++ clang/tools/include-mapping/cppreference_parser.py
@@ -47,7 +47,7 @@
 
   Returns a list of headers.
   """
-  headers = set()
+  symbol_headers = set()
   all_headers = set()
 
   soup = BeautifulSoup(symbol_page_html, "html.parser")
@@ -58,31 +58,39 @@
   #   Defined in header <baz>      .t-dsc-header
   #   decl2                        .t-dcl
   for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
-    current_headers = []
-    was_decl = False
-    for row in table.select('tr'):
-      if _HasClass(row, 't-dcl', 't-dsc'):
-        was_decl = True
-        # Symbols are in the first cell.
-        found_symbols = row.find('td').stripped_strings
-        if not symbol_name in found_symbols:
-          continue
-        headers.update(current_headers)
-      elif _HasClass(row, 't-dsc-header'):
-        # If we saw a decl since the last header, this is a new block of headers
-        # for a new block of decls.
-        if was_decl:
-          current_headers = []
-        was_decl = False
+    rows = table.select('tr')
+    i = 0
+    while i < len(rows):
+      start = i
+      current_headers = set()
+      while i < len(rows) and _HasClass(rows[i], 't-dsc-header'):
+        row = rows[i]
         # There are also .t-dsc-header for "defined in namespace".
         if not "Defined in header " in row.text:
+          i = i + 1
           continue
         # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
         for header_code in row.find_all("code"):
-          current_headers.append(header_code.text)
           all_headers.add(header_code.text)
-  # If the symbol was never named, consider all named headers.
-  return headers or all_headers
+          current_headers.add(header_code.text)
+        i = i + 1
+      # Some tables have column-title rows (.t-dsc-hitem); skip them.
+      while i < len(rows) and _HasClass(rows[i], 't-dsc-hitem'):
+        i = i + 1
+      while i < len(rows) and (_HasClass(rows[i], 't-dcl', 't-dsc') or not rows[i].has_attr("class")):
+        row = rows[i]
+        # Symbols are in the first cell.
+        found_symbols = row.find('td').stripped_strings
+        if symbol_name in found_symbols:
+          for header in current_headers:
+            symbol_headers.add(header)
+        i = i + 1
+      # No header or symbol rows were consumed; skip this row to ensure progress.
+      if i == start:
+        i = i + 1
+    
+  # If the symbol was never named, consider all named headers.  
+  return symbol_headers or all_headers
 
 
 def _ParseIndexPage(index_page_html):


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D142555.492127.patch
Type: text/x-patch
Size: 2708 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20230125/4e610bfd/attachment.bin>