[llvm] e7303fe - [Python] Use raw string literals for regexes (#120401)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 18 04:58:24 PST 2024

Author: Oliver Stannard
Date: 2024-12-18T12:58:21Z
New Revision: e7303fe80a0bea124422219356c1c9e845110a77

URL: https://github.com/llvm/llvm-project/commit/e7303fe80a0bea124422219356c1c9e845110a77
DIFF: https://github.com/llvm/llvm-project/commit/e7303fe80a0bea124422219356c1c9e845110a77.diff

LOG: [Python] Use raw string literals for regexes (#120401)

Previously, these backslashes were not followed by a valid escape-sequence
character, so they were treated as literal backslashes, which was the
intended behaviour of the code. However, Python 3.12 has started emitting a
SyntaxWarning for such invalid escape sequences, so we should use raw string
literals for regexes so that backslashes are always interpreted literally.
I've done this for every regex in this file for consistency, including the
ones which do not contain backslashes.




diff --git a/llvm/utils/extract_symbols.py b/llvm/utils/extract_symbols.py
index 684e124c762594..388723421d6602 100755
--- a/llvm/utils/extract_symbols.py
+++ b/llvm/utils/extract_symbols.py
@@ -53,12 +53,12 @@ def nm_get_symbols(tool, lib):
         # The -P flag displays the size field for symbols only when applicable,
         # so the last field is optional. There's no space after the value field,
         # but \s+ match newline also, so \s+\S* will match the optional size field.
-        match = re.match("^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line)
+        match = re.match(r"^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line)
         if match:
             yield (match.group(1), True)
         # Look for undefined symbols, which have type U and may or may not
         # (depending on which nm is being used) have value and size.
-        match = re.match("^(\S+)\s+U\s+(\S+\s+\S*)?$", line)
+        match = re.match(r"^(\S+)\s+U\s+(\S+\s+\S*)?$", line)
         if match:
             yield (match.group(1), False)
@@ -71,7 +71,7 @@ def readobj_is_32bit_windows(tool, lib):
         [tool, "--file-header", lib], universal_newlines=True
     for line in output.splitlines():
-        match = re.match("Format: (\S+)", line)
+        match = re.match(r"Format: (\S+)", line)
         if match:
             return match.group(1) == "COFF-i386"
     return False
@@ -85,7 +85,7 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
     if not "?" in symbol:
         if calling_convention_decoration:
             # Remove calling convention decoration from names
-            match = re.match("[_@]([^@]+)", symbol)
+            match = re.match(r"[_@]([^@]+)", symbol)
             if match:
                 symbol = match.group(1)
         # Discard floating point/SIMD constants.
@@ -100,10 +100,10 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
     # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
     # that mentions an anonymous namespace can be discarded, as the anonymous
     # namespace doesn't exist outside of that translation unit.
-    elif re.search("\?A(0x\w+)?@", symbol):
+    elif re.search(r"\?A(0x\w+)?@", symbol):
         return None
     # Skip X86GenMnemonicTables functions, they are not exposed from llvm/include/.
-    elif re.match("\?is[A-Z0-9]*@X86@llvm", symbol):
+    elif re.match(r"\?is[A-Z0-9]*@X86@llvm", symbol):
         return None
     # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
     # bit of a mess and imprecise, but that avoids having to completely demangle
@@ -123,7 +123,7 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
     #                 ::= .+@ (list of types)
     #                 ::= .*Z (list of types, varargs)
     # <throw-spec> ::= exceptions are not allowed
-    elif re.search("(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
+    elif re.search(r"(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
         return symbol
     return None
@@ -140,7 +140,7 @@ def should_keep_itanium_symbol(symbol, calling_convention_decoration):
     if not symbol.startswith("_") and not symbol.startswith("."):
         return symbol
     # Discard manglings that aren't nested names
-    match = re.match("\.?_Z(T[VTIS])?(N.+)", symbol)
+    match = re.match(r"\.?_Z(T[VTIS])?(N.+)", symbol)
     if not match:
         return None
     # Demangle the name. If the name is too complex then we don't need to keep
@@ -169,19 +169,19 @@ class TooComplexName(Exception):
 # (name, rest of string) pair.
 def parse_itanium_name(arg):
     # Check for a normal name
-    match = re.match("(\d+)(.+)", arg)
+    match = re.match(r"(\d+)(.+)", arg)
     if match:
         n = int(match.group(1))
         name = match.group(1) + match.group(2)[:n]
         rest = match.group(2)[n:]
         return name, rest
     # Check for constructor/destructor names
-    match = re.match("([CD][123])(.+)", arg)
+    match = re.match(r"([CD][123])(.+)", arg)
     if match:
         return match.group(1), match.group(2)
     # Assume that a sequence of characters that doesn't end a nesting is an
     # operator (this is very imprecise, but appears to be good enough)
-    match = re.match("([^E]+)(.+)", arg)
+    match = re.match(r"([^E]+)(.+)", arg)
     if match:
         return match.group(1), match.group(2)
     # Anything else: we can't handle it
@@ -196,13 +196,13 @@ def skip_itanium_template(arg):
     tmp = arg[1:]
     while tmp:
         # Check for names
-        match = re.match("(\d+)(.+)", tmp)
+        match = re.match(r"(\d+)(.+)", tmp)
         if match:
             n = int(match.group(1))
             tmp = match.group(2)[n:]
         # Check for substitutions
-        match = re.match("S[A-Z0-9]*_(.+)", tmp)
+        match = re.match(r"S[A-Z0-9]*_(.+)", tmp)
         if match:
             tmp = match.group(1)
         # Start of a template
@@ -231,14 +231,14 @@ def parse_itanium_nested_name(arg):
     ret = []
     # Skip past the N, and possibly a substitution
-    match = re.match("NS[A-Z0-9]*_(.+)", arg)
+    match = re.match(r"NS[A-Z0-9]*_(.+)", arg)
     if match:
         tmp = match.group(1)
         tmp = arg[1:]
     # Skip past CV-qualifiers and ref qualifiers
-    match = re.match("[rVKRO]*(.+)", tmp)
+    match = re.match(r"[rVKRO]*(.+)", tmp)
     if match:
         tmp = match.group(1)
@@ -280,19 +280,19 @@ def parse_microsoft_mangling(arg):
         if arg.startswith("@"):
             return components
         # Check for a simple name
-        match = re.match("(\w+)@(.+)", arg)
+        match = re.match(r"(\w+)@(.+)", arg)
         if match:
             components.append((match.group(1), False))
             arg = match.group(2)
         # Check for a special function name
-        match = re.match("(\?_?\w)(.+)", arg)
+        match = re.match(r"(\?_?\w)(.+)", arg)
         if match:
             components.append((match.group(1), False))
             arg = match.group(2)
         # Check for a template name
-        match = re.match("\?\$(\w+)@[^@]+@(.+)", arg)
+        match = re.match(r"\?\$(\w+)@[^@]+@(.+)", arg)
         if match:
             components.append((match.group(1), True))
             arg = match.group(2)
@@ -323,7 +323,7 @@ def get_template_name(sym, mangling):
         if mangling == "microsoft":
             names = parse_microsoft_mangling(sym)
-            match = re.match("\.?_Z(T[VTIS])?(N.+)", sym)
+            match = re.match(r"\.?_Z(T[VTIS])?(N.+)", sym)
             if match:
                 names, _ = parse_itanium_nested_name(match.group(2))


More information about the llvm-commits mailing list