[compiler-rt] a0570e7 - [HWASan] allow symbolizer script to index binaries by build id.

Florian Mayer via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 11 14:12:53 PDT 2022


Author: Florian Mayer
Date: 2022-04-11T14:12:31-07:00
New Revision: a0570e7750fcaa721a18df9c02584dda00a04e74

URL: https://github.com/llvm/llvm-project/commit/a0570e7750fcaa721a18df9c02584dda00a04e74
DIFF: https://github.com/llvm/llvm-project/commit/a0570e7750fcaa721a18df9c02584dda00a04e74.diff

LOG: [HWASan] allow symbolizer script to index binaries by build id.

Tested on an example call stack with misplaced binaries from Android.
Tested the updated regex against a call stack without a Build ID to confirm it still works.

Reviewed By: eugenis

Differential Revision: https://reviews.llvm.org/D123437

Added: 
    

Modified: 
    compiler-rt/lib/hwasan/scripts/hwasan_symbolize

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/hwasan/scripts/hwasan_symbolize b/compiler-rt/lib/hwasan/scripts/hwasan_symbolize
index f73ea1783d758..3aba1875d19a2 100755
--- a/compiler-rt/lib/hwasan/scripts/hwasan_symbolize
+++ b/compiler-rt/lib/hwasan/scripts/hwasan_symbolize
@@ -21,6 +21,9 @@ import sys
 import string
 import subprocess
 import argparse
+import mmap
+import struct
+import os
 
 if sys.version_info.major < 3:
   # Simulate Python 3.x behaviour of defaulting to UTF-8 for print. This is
@@ -31,6 +34,71 @@ if sys.version_info.major < 3:
 last_access_address = None
 last_access_tag = None
 
+# Below is a parser for a subset of ELF. It only supports 64-bit, little-endian
+# files and only parses what is necessary to find the build IDs. It uses a
+# memoryview into an mmap to avoid copying.
+Ehdr_size = 64
+e_shnum_offset = 60
+e_shoff_offset = 40
+
+Shdr_size = 64
+sh_type_offset = 4
+sh_offset_offset = 24
+sh_size_offset = 32
+SHT_NOTE = 7
+
+Nhdr_size = 12
+NT_GNU_BUILD_ID = 3
+
+def align_up(size, alignment):
+  return (size + alignment - 1) & ~(alignment - 1)
+
+def handle_Nhdr(mv, sh_size):
+  offset = 0
+  while offset < sh_size:
+    n_namesz, n_descsz, n_type = struct.unpack_from('<III', buffer=mv,
+                                                    offset=offset)
+    if (n_type == NT_GNU_BUILD_ID and n_namesz == 4 and
+        mv[offset + Nhdr_size: offset + Nhdr_size + 4] == b"GNU\x00"):
+      value = mv[offset + Nhdr_size + 4: offset + Nhdr_size + 4 + n_descsz]
+      return value.hex()
+    offset += Nhdr_size + align_up(n_namesz, 4) + align_up(n_descsz, 4)
+  return None
+
+def handle_Shdr(mv):
+  sh_type, = struct.unpack_from('<I', buffer=mv, offset=sh_type_offset)
+  if sh_type != SHT_NOTE:
+    return None, None
+  sh_offset, = struct.unpack_from('<Q', buffer=mv, offset=sh_offset_offset)
+  sh_size, = struct.unpack_from('<Q', buffer=mv, offset=sh_size_offset)
+  return sh_offset, sh_size
+
+def handle_elf(mv):
+  # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
+  # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we will
+  # have to extend the parsing code.
+  if mv[:6] != b'\x7fELF\x02\x01':
+    return None
+  e_shnum, = struct.unpack_from('<H', buffer=mv, offset=e_shnum_offset)
+  e_shoff, = struct.unpack_from('<Q', buffer=mv, offset=e_shoff_offset)
+  for i in range(0, e_shnum):
+    start = e_shoff + i * Shdr_size
+    sh_offset, sh_size = handle_Shdr(mv[start: start + Shdr_size])
+    if sh_offset is None:
+      continue
+    note_hdr = mv[sh_offset: sh_offset + sh_size]
+    result = handle_Nhdr(note_hdr, sh_size)
+    if result is not None:
+      return result
+
+def get_buildid(filename):
+  with open(filename, "r") as fd:
+    if os.fstat(fd.fileno()).st_size < Ehdr_size:
+      return None
+    with mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ) as m:
+      with memoryview(m) as mv:
+        return handle_elf(mv)
+
 class Symbolizer:
   def __init__(self, path, binary_prefixes, paths_to_cut):
     self.__pipe = None
@@ -39,6 +107,7 @@ class Symbolizer:
     self.__paths_to_cut = paths_to_cut
     self.__log = False
     self.__warnings = set()
+    self.__index = {}
 
   def enable_logging(self, enable):
     self.__log = enable
@@ -77,9 +146,12 @@ class Symbolizer:
     file_name = re.sub(".*crtstuff.c:0", "???:0", file_name)
     return file_name
 
-  def __process_binary_name(self, name):
+  def __process_binary_name(self, name, buildid=None):
     if name.startswith('/'):
       name = name[1:]
+    if buildid is not None and buildid in self.__index:
+      return self.__index[buildid]
+
     for p in self.__binary_prefixes:
       full_path = os.path.join(p, name)
       if os.path.exists(full_path):
@@ -121,10 +193,10 @@ class Symbolizer:
     except Symbolizer.__EOF:
       pass
 
-  def iter_call_stack(self, binary, addr):
+  def iter_call_stack(self, binary, buildid, addr):
     self.__open_pipe()
     p = self.__pipe
-    binary = self.__process_binary_name(binary)
+    binary = self.__process_binary_name(binary, buildid)
     if not binary:
       return
     self.__write("CODE %s %s" % (binary, addr))
@@ -137,15 +209,25 @@ class Symbolizer:
     except Symbolizer.__EOF:
       pass
 
+  def build_index(self):
+    for p in self.__binary_prefixes:
+      for dname, _, fnames in os.walk(p):
+        for fn in fnames:
+          filename = os.path.join(dname, fn)
+          bid = get_buildid(filename)
+          if bid is not None:
+            self.__index[bid] = filename
+
 def symbolize_line(line, symbolizer_path):
   #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
-  match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)', line, re.UNICODE)
+  match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
   if match:
     frameno = match.group(2)
     binary = match.group(5)
     addr = int(match.group(6), 16)
+    buildid = match.group(7)
 
-    frames = list(symbolizer.iter_call_stack(binary, addr))
+    frames = list(symbolizer.iter_call_stack(binary, buildid, addr))
 
     if len(frames) > 0:
       print("%s#%s%s%s in %s" % (match.group(1), match.group(2),
@@ -210,6 +292,7 @@ parser.add_argument('-v', action='store_true')
 parser.add_argument('--ignore-tags', action='store_true')
 parser.add_argument('--symbols', action='append')
 parser.add_argument('--source', action='append')
+parser.add_argument('--index', action='store_true')
 parser.add_argument('--symbolizer')
 parser.add_argument('args', nargs=argparse.REMAINDER)
 args = parser.parse_args()
@@ -297,6 +380,8 @@ if args.v:
 
 symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
 symbolizer.enable_logging(args.d)
+if args.index:
+  symbolizer.build_index()
 
 for line in sys.stdin:
   if sys.version_info.major < 3:


        


More information about the llvm-commits mailing list