[llvm] [Python] Use raw string literals for regexes (PR #120401)

Oliver Stannard via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 18 02:54:17 PST 2024


https://github.com/ostannard created https://github.com/llvm/llvm-project/pull/120401

Previously these backslashes were not followed by a valid escape sequence character, so they were treated as literal backslashes, which was the intended behaviour of the code. However, Python as of 3.12 has started warning about these, so we should use raw string literals for regexes so that backslashes are always interpreted literally. I've done this for every regex in this file for consistency, including the ones which do not contain backslashes.

>From 3887d6940896e59ba0017b84a8df3d82b1f86322 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Wed, 18 Dec 2024 10:47:53 +0000
Subject: [PATCH] [Python] Use raw string literals for regexes

Previously these backslashes were not followed by a valid escape sequence
character so were treated as literal backslashes, which was the intended
behavior of the code. However python as of 3.12 has started warning
about these, so we should use raw string literals for regexes so that
backslashes are always interpreted literally. I've done this for every
regex in this file for consistency, including the ones which do not
contain backslashes.
---
 llvm/utils/extract_symbols.py | 38 +++++++++++++++++------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/utils/extract_symbols.py b/llvm/utils/extract_symbols.py
index 684e124c762594..388723421d6602 100755
--- a/llvm/utils/extract_symbols.py
+++ b/llvm/utils/extract_symbols.py
@@ -53,12 +53,12 @@ def nm_get_symbols(tool, lib):
         # The -P flag displays the size field for symbols only when applicable,
         # so the last field is optional. There's no space after the value field,
         # but \s+ match newline also, so \s+\S* will match the optional size field.
-        match = re.match("^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line)
+        match = re.match(r"^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line)
         if match:
             yield (match.group(1), True)
         # Look for undefined symbols, which have type U and may or may not
         # (depending on which nm is being used) have value and size.
-        match = re.match("^(\S+)\s+U\s+(\S+\s+\S*)?$", line)
+        match = re.match(r"^(\S+)\s+U\s+(\S+\s+\S*)?$", line)
         if match:
             yield (match.group(1), False)
     process.wait()
@@ -71,7 +71,7 @@ def readobj_is_32bit_windows(tool, lib):
         [tool, "--file-header", lib], universal_newlines=True
     )
     for line in output.splitlines():
-        match = re.match("Format: (\S+)", line)
+        match = re.match(r"Format: (\S+)", line)
         if match:
             return match.group(1) == "COFF-i386"
     return False
@@ -85,7 +85,7 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
     if not "?" in symbol:
         if calling_convention_decoration:
             # Remove calling convention decoration from names
-            match = re.match("[_@]([^@]+)", symbol)
+            match = re.match(r"[_@]([^@]+)", symbol)
             if match:
                 symbol = match.group(1)
         # Discard floating point/SIMD constants.
@@ -100,10 +100,10 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
     # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
     # that mentions an anonymous namespace can be discarded, as the anonymous
     # namespace doesn't exist outside of that translation unit.
-    elif re.search("\?A(0x\w+)?@", symbol):
+    elif re.search(r"\?A(0x\w+)?@", symbol):
         return None
     # Skip X86GenMnemonicTables functions, they are not exposed from llvm/include/.
-    elif re.match("\?is[A-Z0-9]*@X86@llvm", symbol):
+    elif re.match(r"\?is[A-Z0-9]*@X86@llvm", symbol):
         return None
     # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
     # bit of a mess and imprecise, but that avoids having to completely demangle
@@ -123,7 +123,7 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
     #                 ::= .+@ (list of types)
     #                 ::= .*Z (list of types, varargs)
     # <throw-spec> ::= exceptions are not allowed
-    elif re.search("(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
+    elif re.search(r"(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
         return symbol
     return None
 
@@ -140,7 +140,7 @@ def should_keep_itanium_symbol(symbol, calling_convention_decoration):
     if not symbol.startswith("_") and not symbol.startswith("."):
         return symbol
     # Discard manglings that aren't nested names
-    match = re.match("\.?_Z(T[VTIS])?(N.+)", symbol)
+    match = re.match(r"\.?_Z(T[VTIS])?(N.+)", symbol)
     if not match:
         return None
     # Demangle the name. If the name is too complex then we don't need to keep
@@ -169,19 +169,19 @@ class TooComplexName(Exception):
 # (name, rest of string) pair.
 def parse_itanium_name(arg):
     # Check for a normal name
-    match = re.match("(\d+)(.+)", arg)
+    match = re.match(r"(\d+)(.+)", arg)
     if match:
         n = int(match.group(1))
         name = match.group(1) + match.group(2)[:n]
         rest = match.group(2)[n:]
         return name, rest
     # Check for constructor/destructor names
-    match = re.match("([CD][123])(.+)", arg)
+    match = re.match(r"([CD][123])(.+)", arg)
     if match:
         return match.group(1), match.group(2)
     # Assume that a sequence of characters that doesn't end a nesting is an
     # operator (this is very imprecise, but appears to be good enough)
-    match = re.match("([^E]+)(.+)", arg)
+    match = re.match(r"([^E]+)(.+)", arg)
     if match:
         return match.group(1), match.group(2)
     # Anything else: we can't handle it
@@ -196,13 +196,13 @@ def skip_itanium_template(arg):
     tmp = arg[1:]
     while tmp:
         # Check for names
-        match = re.match("(\d+)(.+)", tmp)
+        match = re.match(r"(\d+)(.+)", tmp)
         if match:
             n = int(match.group(1))
             tmp = match.group(2)[n:]
             continue
         # Check for substitutions
-        match = re.match("S[A-Z0-9]*_(.+)", tmp)
+        match = re.match(r"S[A-Z0-9]*_(.+)", tmp)
         if match:
             tmp = match.group(1)
         # Start of a template
@@ -231,14 +231,14 @@ def parse_itanium_nested_name(arg):
     ret = []
 
     # Skip past the N, and possibly a substitution
-    match = re.match("NS[A-Z0-9]*_(.+)", arg)
+    match = re.match(r"NS[A-Z0-9]*_(.+)", arg)
     if match:
         tmp = match.group(1)
     else:
         tmp = arg[1:]
 
     # Skip past CV-qualifiers and ref qualifiers
-    match = re.match("[rVKRO]*(.+)", tmp)
+    match = re.match(r"[rVKRO]*(.+)", tmp)
     if match:
         tmp = match.group(1)
 
@@ -280,19 +280,19 @@ def parse_microsoft_mangling(arg):
         if arg.startswith("@"):
             return components
         # Check for a simple name
-        match = re.match("(\w+)@(.+)", arg)
+        match = re.match(r"(\w+)@(.+)", arg)
         if match:
             components.append((match.group(1), False))
             arg = match.group(2)
             continue
         # Check for a special function name
-        match = re.match("(\?_?\w)(.+)", arg)
+        match = re.match(r"(\?_?\w)(.+)", arg)
         if match:
             components.append((match.group(1), False))
             arg = match.group(2)
             continue
         # Check for a template name
-        match = re.match("\?\$(\w+)@[^@]+@(.+)", arg)
+        match = re.match(r"\?\$(\w+)@[^@]+@(.+)", arg)
         if match:
             components.append((match.group(1), True))
             arg = match.group(2)
@@ -323,7 +323,7 @@ def get_template_name(sym, mangling):
         if mangling == "microsoft":
             names = parse_microsoft_mangling(sym)
         else:
-            match = re.match("\.?_Z(T[VTIS])?(N.+)", sym)
+            match = re.match(r"\.?_Z(T[VTIS])?(N.+)", sym)
             if match:
                 names, _ = parse_itanium_nested_name(match.group(2))
             else:



More information about the llvm-commits mailing list