[llvm] Add DATA flag for non-function export symbols (PR #101951)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 5 03:14:56 PDT 2024
https://github.com/shatyuka created https://github.com/llvm/llvm-project/pull/101951
This patch fixes the wrong symbol type in the Windows import library.
How this patch works:
It explicitly specifies the "DATA" flag for non-function exports in the .def file.
Without this patch, data exports like global variables or virtual tables would be imported as functions, causing a "jmp" instruction to be read instead of the variable.
From 9069a6e584ab91c59bbb169b82c6b020c1241208 Mon Sep 17 00:00:00 2001
From: shatyuka <shatyuka at qq.com>
Date: Mon, 5 Aug 2024 17:57:05 +0800
Subject: [PATCH] Add DATA flag for non-function export symbols
---
llvm/utils/extract_symbols.py | 40 ++++++++++++++++++++++++++++-------
1 file changed, 32 insertions(+), 8 deletions(-)
diff --git a/llvm/utils/extract_symbols.py b/llvm/utils/extract_symbols.py
index 10fdf14acd158..e640d789c977b 100755
--- a/llvm/utils/extract_symbols.py
+++ b/llvm/utils/extract_symbols.py
@@ -24,7 +24,7 @@
import argparse
import platform
-# Define a function which extracts a list of pairs of (symbols, is_def) from a
+# Define a function which extracts a list of pairs of (symbols, is_def, is_data) from a
# library using llvm-nm becuase it can work both with regular and bitcode files.
# We use subprocess.Popen and yield a symbol at a time instead of using
# subprocess.check_output and returning a list as, especially on Windows, waiting
@@ -53,14 +53,14 @@ def nm_get_symbols(tool, lib):
# The -P flag displays the size field for symbols only when applicable,
# so the last field is optional. There's no space after the value field,
# but \s+ match newline also, so \s+\S* will match the optional size field.
- match = re.match("^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line)
+ match = re.match("^(\S+)\s+([BDGRSTuVW])\s+\S+\s+\S*$", line)
if match:
- yield (match.group(1), True)
+ yield (match.group(1), True, match.group(2) != "T")
# Look for undefined symbols, which have type U and may or may not
# (depending on which nm is being used) have value and size.
match = re.match("^(\S+)\s+U\s+(\S+\s+\S*)?$", line)
if match:
- yield (match.group(1), False)
+ yield (match.group(1), False, False)
process.wait()
@@ -77,6 +77,18 @@ def readobj_is_32bit_windows(tool, lib):
return False
+# Define a function which determines if the target is Windows
+def readobj_is_windows(tool, lib):
+ output = subprocess.check_output(
+ [tool, "--file-header", lib], universal_newlines=True
+ )
+ for line in output.splitlines():
+ match = re.match("Format: (\S+)", line)
+ if match:
+ return match.group(1).startswith("COFF-")
+ return False
+
+
# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
@@ -307,14 +319,17 @@ def extract_symbols(arg):
llvm_nm_path, should_keep_symbol, calling_convention_decoration, lib = arg
symbol_defs = dict()
symbol_refs = set()
- for (symbol, is_def) in nm_get_symbols(llvm_nm_path, lib):
+ symbol_data = set()
+ for symbol, is_def, is_data in nm_get_symbols(llvm_nm_path, lib):
symbol = should_keep_symbol(symbol, calling_convention_decoration)
if symbol:
if is_def:
symbol_defs[symbol] = 1 + symbol_defs.setdefault(symbol, 0)
+ if is_data:
+ symbol_data.add(symbol)
else:
symbol_refs.add(symbol)
- return (symbol_defs, symbol_refs)
+ return (symbol_defs, symbol_refs, symbol_data)
def get_template_name(sym, mangling):
@@ -426,6 +441,9 @@ def parse_tool_path(parser, tool, val):
# library in the list
calling_convention_decoration = readobj_is_32bit_windows(args.readobj, libs[0])
+ # Check if we should append "DATA" flag after the symbol name
+ append_data_flag = readobj_is_windows(args.readobj, libs[0])
+
# Extract symbols from libraries in parallel. This is a huge time saver when
# doing a debug build, as there are hundreds of thousands of symbols in each
# library.
@@ -459,11 +477,14 @@ def parse_tool_path(parser, tool, val):
# Merge everything into a single dict
symbol_defs = dict()
symbol_refs = set()
- for (this_lib_defs, this_lib_refs) in libs_symbols:
+ symbol_data = set()
+ for this_lib_defs, this_lib_refs, this_lib_data in libs_symbols:
for k, v in list(this_lib_defs.items()):
symbol_defs[k] = v + symbol_defs.setdefault(k, 0)
for sym in list(this_lib_refs):
symbol_refs.add(sym)
+ for sym in list(this_lib_data):
+ symbol_data.add(sym)
# Find which template instantiations are referenced at least once.
template_instantiation_refs = set()
@@ -485,4 +506,7 @@ def parse_tool_path(parser, tool, val):
for k, v in list(symbol_defs.items()):
template = get_template_name(k, args.mangling)
if v == 1 and (not template or template in template_instantiation_refs):
- print(k, file=outfile)
+ if append_data_flag and k in symbol_data:
+ print(k, "DATA", file=outfile)
+ else:
+ print(k, file=outfile)
More information about the llvm-commits
mailing list