[llvm] Add DATA flag for non-function export symbols (PR #101951)

Mon Aug 5 03:14:56 PDT 2024

https://github.com/shatyuka created https://github.com/llvm/llvm-project/pull/101951

This patch fixes the wrong symbol type being recorded for data exports in Windows import libraries.

How this patch works:
It explicitly specifies the "DATA" flag for non-function export symbols in the generated .def file.

Without this patch, data exports such as global variables or virtual tables would be imported as functions, so a "jmp" instruction would be read in place of the variable's value.

>From 9069a6e584ab91c59bbb169b82c6b020c1241208 Mon Sep 17 00:00:00 2001
From: shatyuka <shatyuka at qq.com>
Date: Mon, 5 Aug 2024 17:57:05 +0800
Subject: [PATCH] Add DATA flag for non-function export symbols

---
 llvm/utils/extract_symbols.py | 40 ++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/llvm/utils/extract_symbols.py b/llvm/utils/extract_symbols.py
index 10fdf14acd158..e640d789c977b 100755
--- a/llvm/utils/extract_symbols.py
+++ b/llvm/utils/extract_symbols.py
@@ -24,7 +24,7 @@
 import argparse
 import platform
 
-# Define a function which extracts a list of pairs of (symbols, is_def) from a
+# Define a function which extracts a list of pairs of (symbols, is_def, is_data) from a
 # library using llvm-nm becuase it can work both with regular and bitcode files.
 # We use subprocess.Popen and yield a symbol at a time instead of using
 # subprocess.check_output and returning a list as, especially on Windows, waiting
@@ -53,14 +53,14 @@ def nm_get_symbols(tool, lib):
         # The -P flag displays the size field for symbols only when applicable,
         # so the last field is optional. There's no space after the value field,
         # but \s+ match newline also, so \s+\S* will match the optional size field.
-        match = re.match("^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line)
+        match = re.match("^(\S+)\s+([BDGRSTuVW])\s+\S+\s+\S*$", line)
         if match:
-            yield (match.group(1), True)
+            yield (match.group(1), True, match.group(2) != "T")
         # Look for undefined symbols, which have type U and may or may not
         # (depending on which nm is being used) have value and size.
         match = re.match("^(\S+)\s+U\s+(\S+\s+\S*)?$", line)
         if match:
-            yield (match.group(1), False)
+            yield (match.group(1), False, False)
     process.wait()
 
 
@@ -77,6 +77,18 @@ def readobj_is_32bit_windows(tool, lib):
     return False
 
 
+# Define a function which determines if the target is Windows
+def readobj_is_windows(tool, lib):
+    output = subprocess.check_output(
+        [tool, "--file-header", lib], universal_newlines=True
+    )
+    for line in output.splitlines():
+        match = re.match("Format: (\S+)", line)
+        if match:
+            return match.group(1).startswith("COFF-")
+    return False
+
+
 # MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
 # identifier/type mangling we can decide which symbols could possibly be
 # required and which we can discard.
@@ -307,14 +319,17 @@ def extract_symbols(arg):
     llvm_nm_path, should_keep_symbol, calling_convention_decoration, lib = arg
     symbol_defs = dict()
     symbol_refs = set()
-    for (symbol, is_def) in nm_get_symbols(llvm_nm_path, lib):
+    symbol_data = set()
+    for symbol, is_def, is_data in nm_get_symbols(llvm_nm_path, lib):
         symbol = should_keep_symbol(symbol, calling_convention_decoration)
         if symbol:
             if is_def:
                 symbol_defs[symbol] = 1 + symbol_defs.setdefault(symbol, 0)
+                if is_data:
+                    symbol_data.add(symbol)
             else:
                 symbol_refs.add(symbol)
-    return (symbol_defs, symbol_refs)
+    return (symbol_defs, symbol_refs, symbol_data)
 
 
 def get_template_name(sym, mangling):
@@ -426,6 +441,9 @@ def parse_tool_path(parser, tool, val):
     # library in the list
     calling_convention_decoration = readobj_is_32bit_windows(args.readobj, libs[0])
 
+    # Check if we should append "DATA" flag after the symbol name
+    append_data_flag = readobj_is_windows(args.readobj, libs[0])
+
     # Extract symbols from libraries in parallel. This is a huge time saver when
     # doing a debug build, as there are hundreds of thousands of symbols in each
     # library.
@@ -459,11 +477,14 @@ def parse_tool_path(parser, tool, val):
     # Merge everything into a single dict
     symbol_defs = dict()
     symbol_refs = set()
-    for (this_lib_defs, this_lib_refs) in libs_symbols:
+    symbol_data = set()
+    for this_lib_defs, this_lib_refs, this_lib_data in libs_symbols:
         for k, v in list(this_lib_defs.items()):
             symbol_defs[k] = v + symbol_defs.setdefault(k, 0)
         for sym in list(this_lib_refs):
             symbol_refs.add(sym)
+        for sym in list(this_lib_data):
+            symbol_data.add(sym)
 
     # Find which template instantiations are referenced at least once.
     template_instantiation_refs = set()
@@ -485,4 +506,7 @@ def parse_tool_path(parser, tool, val):
     for k, v in list(symbol_defs.items()):
         template = get_template_name(k, args.mangling)
         if v == 1 and (not template or template in template_instantiation_refs):
-            print(k, file=outfile)
+            if append_data_flag and k in symbol_data:
+                print(k, "DATA", file=outfile)
+            else:
+                print(k, file=outfile)