[compiler-rt] 9ef451d - [hwasan] Offline symbolization script.

Evgenii Stepanov via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 9 13:46:32 PST 2019


Author: Evgenii Stepanov
Date: 2019-12-09T13:46:11-08:00
New Revision: 9ef451d1fdaa9a1cadf1a99c3540253a0e9c118d

URL: https://github.com/llvm/llvm-project/commit/9ef451d1fdaa9a1cadf1a99c3540253a0e9c118d
DIFF: https://github.com/llvm/llvm-project/commit/9ef451d1fdaa9a1cadf1a99c3540253a0e9c118d.diff

LOG: [hwasan] Offline symbolization script.

Summary:
A script to symbolize hwasan reports after the fact using unstripped
binaries. Supports stack-based reports. Requires llvm-symbolizer
(addr2line is not an option).

Reviewers: pcc, hctim

Subscribers: mgorny, #sanitizers, llvm-commits

Tags: #sanitizers, #llvm

Differential Revision: https://reviews.llvm.org/D71148

Added: 
    compiler-rt/lib/hwasan/scripts/CMakeLists.txt
    compiler-rt/lib/hwasan/scripts/hwasan_symbolize

Modified: 
    compiler-rt/lib/hwasan/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt
index 636392fa8ae2..03863e4be68d 100644
--- a/compiler-rt/lib/hwasan/CMakeLists.txt
+++ b/compiler-rt/lib/hwasan/CMakeLists.txt
@@ -181,6 +181,8 @@ endforeach()
 
 add_compiler_rt_resource_file(hwasan_blacklist hwasan_blacklist.txt hwasan)
 
+add_subdirectory("scripts")
+
 # if(COMPILER_RT_INCLUDE_TESTS)
 #   add_subdirectory(tests)
 # endif()

diff  --git a/compiler-rt/lib/hwasan/scripts/CMakeLists.txt b/compiler-rt/lib/hwasan/scripts/CMakeLists.txt
new file mode 100644
index 000000000000..68c8375c75ed
--- /dev/null
+++ b/compiler-rt/lib/hwasan/scripts/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_compiler_rt_script(hwasan_symbolize)
+add_dependencies(hwasan hwasan_symbolize)

diff  --git a/compiler-rt/lib/hwasan/scripts/hwasan_symbolize b/compiler-rt/lib/hwasan/scripts/hwasan_symbolize
new file mode 100755
index 000000000000..f77e36fbd622
--- /dev/null
+++ b/compiler-rt/lib/hwasan/scripts/hwasan_symbolize
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+#===- lib/hwasan/scripts/hwasan_symbolize ----------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+#
+# HWAddressSanitizer offline symbolization script.
+#
+#===------------------------------------------------------------------------===#
+import glob
+import os
+import re
+import sys
+import string
+import subprocess
+import argparse
+
+last_access_address = None  # Address from the most recent "tag-mismatch on address" line; set by save_access_address().
+last_access_tag = None  # First (ptr) tag from the most recent "tags: X/Y (ptr/mem)" line; set by save_access_address().
+
+class Symbolizer:  # Talks to one lazily started llvm-symbolizer subprocess over stdin/stdout pipes.
+  def __init__(self, path, binary_prefixes, paths_to_cut):  # path: executable to spawn; nothing is started here.
+    self.__pipe = None  # Popen handle, created on first query by __open_pipe().
+    self.__path = path
+    self.__binary_prefixes = binary_prefixes  # Directories searched, in order, by __process_binary_name().
+    self.__paths_to_cut = paths_to_cut  # Patterns removed (with everything before them) from source paths.
+    self.__log = False  # When true, every request/response line is echoed to stderr.
+
+  def enable_logging(self, enable):  # Turn the stderr trace of the pipe traffic on or off.
+    self.__log = enable
+
+  def __open_pipe(self):  # Spawn the subprocess on first use; later calls are no-ops.
+    if not self.__pipe:
+      self.__pipe = subprocess.Popen([self.__path, "-inlining", "-functions"],
+                                     stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+
+  class __EOF:  # Raised by __read() on an empty line; caught by the iter_* generators to end a reply.
+    pass
+
+  def __write(self, s):  # Send one request line to the subprocess (print appends the newline).
+    print >>self.__pipe.stdin, s
+    if self.__log:
+      print >>sys.stderr, ("#>>  |%s|" % (s,))
+
+  def __read(self):  # Return the next reply line, right-stripped; raise __EOF if it is empty.
+    s = self.__pipe.stdout.readline().rstrip()
+    if self.__log:
+      print >>sys.stderr, ("# << |%s|" % (s,))
+    if s == '':  # Also the case when readline() returns '' because the pipe reached end-of-file.
+      raise Symbolizer.__EOF
+    return s
+
+  def __process_source_path(self, file_name):  # Shorten a "file:line" string before it is displayed.
+    for path_to_cut in self.__paths_to_cut:
+      file_name = re.sub(".*" + path_to_cut, "", file_name)  # NOTE(review): path_to_cut is used as a regex, not escaped.
+    file_name = re.sub(".*hwasan_[a-z_]*.(cc|h):[0-9]*", "[hwasan_rtl]", file_name)  # hwasan_* .cc/.h locations -> fixed tag.
+    file_name = re.sub(".*asan_[a-z_]*.(cc|h):[0-9]*", "[asan_rtl]", file_name)  # asan_* .cc/.h locations -> fixed tag.
+    file_name = re.sub(".*crtstuff.c:0", "???:0", file_name)
+    return file_name
+
+  def __process_binary_name(self, name):  # Map a binary path from the report to an existing local file, else None.
+    if name.startswith('/'):
+      name = name[1:]  # Drop the leading '/' so os.path.join() does not discard the prefix.
+    for p in self.__binary_prefixes:
+      full_path = os.path.join(p, name)
+      if os.path.exists(full_path):
+        return full_path
+    # Try stripping extra path components as the last resort.
+    for p in self.__binary_prefixes:
+      full_path = os.path.join(p, os.path.basename(name))
+      if os.path.exists(full_path):
+        return full_path
+    print >>sys.stderr, "Could not find symbols for", name
+    return None
+
+  def iter_locals(self, binary, addr):  # Yield (function, file:line, local, offset, size, tag_offset) per FRAME reply record.
+    self.__open_pipe()
+    p = self.__pipe  # NOTE(review): p is never read.
+    binary = self.__process_binary_name(binary)
+    if not binary:
+      return  # Binary not found locally: the generator yields nothing.
+    self.__write("FRAME %s %s" % (binary, addr))
+    try:
+      while True:
+        function_name = self.__read()  # Records are read four lines at a time until an empty line.
+        local_name = self.__read()
+        file_line = self.__read()
+        extra = self.__read().split()
+
+        file_line = self.__process_source_path(file_line)
+        offset = None if extra[0] == '??' else int(extra[0])  # A '??' field becomes None.
+        size = None if extra[1] == '??' else int(extra[1])
+        tag_offset = None if extra[2] == '??' else int(extra[2])  # NOTE(review): fewer than 3 fields raises IndexError.
+        yield (function_name, file_line, local_name, offset, size, tag_offset)
+    except Symbolizer.__EOF:
+      pass
+
+  def iter_call_stack(self, binary, addr):  # Yield (function, file:line) per frame of the CODE reply.
+    self.__open_pipe()
+    p = self.__pipe  # NOTE(review): p is never read.
+    binary = self.__process_binary_name(binary)
+    if not binary:
+      return  # Binary not found locally: the generator yields nothing.
+    self.__write("CODE %s %s" % (binary, addr))
+    try:
+      while True:
+        function_name = self.__read()  # Frames are read two lines at a time until an empty line.
+        file_line = self.__read()
+        file_line = self.__process_source_path(file_line)
+        yield (function_name, file_line)
+    except Symbolizer.__EOF:
+      pass
+
+def symbolize_line(line, symbolizer_path):  # Print line, with function and file:line added if it is a stack frame.
+  # Example frame line: #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
+  match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)', line, re.UNICODE)
+  if match:
+    frameno = match.group(2)  # NOTE(review): frameno is never read.
+    binary = match.group(5)
+    addr = int(match.group(6), 16)
+
+    frames = list(symbolizer.iter_call_stack(binary, addr))  # NOTE(review): uses the global `symbolizer`; symbolizer_path is unused.
+
+    if len(frames) > 0:
+      print "%s#%s%s%s in %s" % (match.group(1).encode('utf-8'), match.group(2).encode('utf-8'),
+                                 match.group(3).encode('utf-8'), frames[0][0], frames[0][1])
+      for i in range(1, len(frames)):  # Remaining frames are printed as "->" continuation lines.
+        space1 = ' ' * match.end(1)
+        space2 = ' ' * (match.start(4) - match.end(1) - 2)  # Pads so the name starts in the same column as on the first line.
+        print "%s->%s%s in %s" % (space1, space2, frames[i][0], frames[i][1])
+    else:
+      print line.rstrip().encode('utf-8')  # No frames returned: echo the line unchanged.
+  else:
+    print line.rstrip().encode('utf-8')  # Not a frame line: echo it unchanged.
+
+def save_access_address(line):  # Update the two globals below when line carries the access address or tags.
+  global last_access_address, last_access_tag
+  match = re.match(r'^(.*?)HWAddressSanitizer: tag-mismatch on address (0x[0-9a-f]+) ', line, re.UNICODE)
+  if match:
+    last_access_address = int(match.group(2), 16)  # Hex address following "tag-mismatch on address".
+  match = re.match(r'^(.*?) of size [0-9]+ at 0x[0-9a-f]+ tags: ([0-9a-f]+)/[0-9a-f]+ \(ptr/mem\)', line, re.UNICODE)
+  if match:
+    last_access_tag = int(match.group(2), 16)  # First of the two tags, i.e. the "ptr" one.
+
+def process_stack_history(line, symbolizer, ignore_tags=False):  # Handle stack-history lines; a true result means "consumed".
+  if last_access_address is None or last_access_tag is None:
+    return  # No access recorded yet: returns None, which the caller treats as "not consumed".
+  if re.match(r'Previously allocated frames:', line, re.UNICODE):
+    return True  # Section header: consumed without printing anything.
+  pc_mask = (1 << 48) - 1
+  fp_mask = (1 << 20) - 1  # Offsets below are compared modulo 2**20.
+  # Example: record_addr:0x1234ABCD record:0x1234ABCD (/path/to/binary+0x1234ABCD)
+  match = re.match(r'^(.*?)record_addr:(0x[0-9a-f]+) +record:(0x[0-9a-f]+) +\((.*)\+(0x[0-9a-f]+)\)', line, re.UNICODE)
+  if match:
+    record_addr = int(match.group(2), 16)
+    record = int(match.group(3), 16)
+    binary = match.group(4)
+    addr = int(match.group(5), 16)
+    base_tag = (record_addr >> 3) & 0xFF  # Bits 3..10 of record_addr.
+    fp = (record >> 48) << 4  # Bits above 48 of record, multiplied by 16.
+    pc = record & pc_mask  # NOTE(review): pc is never read.
+
+    for local in symbolizer.iter_locals(binary, addr):  # local = (function, file:line, name, offset, size, tag_offset)
+      frame_offset = local[3]
+      size = local[4]
+      if frame_offset is None or size is None:
+        continue
+      obj_offset = (last_access_address - fp - frame_offset) & fp_mask
+      if obj_offset >= size:
+        continue  # The access does not fall inside this local.
+      tag_offset = local[5]
+      if not ignore_tags and (tag_offset is None or base_tag ^ tag_offset != last_access_tag):
+        continue  # Tag check failed or impossible; skipped unless ignore_tags is set.
+      print ''
+      print 'Potentially referenced stack object:'
+      print '  %d bytes inside variable "%s" in stack frame of function "%s"' % (obj_offset, local[2], local[0])
+      print '  at %s' % (local[1],)
+    return True  # Record line: consumed, whether or not any local matched.
+  return False
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-d', action='store_true')  # Passed to Symbolizer.enable_logging() further down.
+parser.add_argument('-v', action='store_true')  # Prints the resolved configuration before processing.
+parser.add_argument('--ignore-tags', action='store_true')  # Forwarded as ignore_tags to process_stack_history().
+parser.add_argument('--symbols', action='append')  # Repeatable; directories holding unstripped binaries.
+parser.add_argument('--source', action='append')  # Repeatable; source path prefixes to strip from output.
+parser.add_argument('--symbolizer')  # Explicit path to the llvm-symbolizer binary.
+parser.add_argument('args', nargs=argparse.REMAINDER)  # NOTE(review): args.args is not read in the code shown here.
+args = parser.parse_args()
+
+# Unstripped binaries location.
+binary_prefixes = args.symbols or []
+if not binary_prefixes:
+  if 'ANDROID_PRODUCT_OUT' in os.environ:  # Default to $ANDROID_PRODUCT_OUT/symbols when no --symbols is given.
+    product_out = os.path.join(os.environ['ANDROID_PRODUCT_OUT'], 'symbols')
+    binary_prefixes.append(product_out)
+
+for p in binary_prefixes:
+  if not os.path.isdir(p):
+    print >>sys.stderr, "Symbols path does not exist or is not a directory:", p
+    sys.exit(1)
+
+# Source location.
+paths_to_cut = args.source or []
+if not paths_to_cut:
+  paths_to_cut.append(os.getcwd() + '/')  # Default: strip the current directory, then $ANDROID_BUILD_TOP if set.
+  if 'ANDROID_BUILD_TOP' in os.environ:
+    paths_to_cut.append(os.environ['ANDROID_BUILD_TOP'] + '/')
+
+# llvm-symbolizer binary.
+# 1. --symbolizer flag
+# 2. environment variable (LLVM_SYMBOLIZER_PATH, then HWASAN_SYMBOLIZER_PATH)
+# 3. unsuffixed binary in the directory of this script (dirname of sys.argv[0])
+# 4. if inside Android platform, prebuilt binary at a known path
+# 5. first "llvm-symbolizer", then "llvm-symbolizer-$VER" with the
+#    highest available version in $PATH
+symbolizer_path = args.symbolizer
+if not symbolizer_path:
+  if 'LLVM_SYMBOLIZER_PATH' in os.environ:
+    symbolizer_path = os.environ['LLVM_SYMBOLIZER_PATH']
+  elif 'HWASAN_SYMBOLIZER_PATH' in os.environ:
+    symbolizer_path = os.environ['HWASAN_SYMBOLIZER_PATH']
+
+if not symbolizer_path:
+  s = os.path.join(os.path.dirname(sys.argv[0]), 'llvm-symbolizer')
+  if os.path.exists(s):
+    symbolizer_path = s
+
+if not symbolizer_path:
+  if 'ANDROID_BUILD_TOP' in os.environ:
+    s = os.path.join(os.environ['ANDROID_BUILD_TOP'], 'prebuilts/clang/host/linux-x86/llvm-binutils-stable/llvm-symbolizer')
+    if os.path.exists(s):
+      symbolizer_path = s
+
+if not symbolizer_path:
+  for path in os.environ["PATH"].split(os.pathsep):  # First $PATH entry containing an unsuffixed llvm-symbolizer wins.
+    p = os.path.join(path, 'llvm-symbolizer')
+    if os.path.exists(p):
+      symbolizer_path = p
+      break
+
+def extract_version(s):  # Sort key: numeric value of the text after the last '-' in s, or 0 if s has no '-'.
+  idx = s.rfind('-')
+  if idx == -1:
+    return 0
+  x = float(s[idx + 1:])  # NOTE(review): raises ValueError for suffixes float() rejects, e.g. "10.0.1".
+  return x
+
+if not symbolizer_path:
+  for path in os.environ["PATH"].split(os.pathsep):
+    candidates = glob.glob(os.path.join(path, 'llvm-symbolizer-*'))
+    if len(candidates) > 0:
+      candidates.sort(key = extract_version, reverse = True)
+      symbolizer_path = candidates[0]
+      break
+
+if not os.path.exists(symbolizer_path):
+  print >>sys.stderr, "Symbolizer path does not exist:", symbolizer_path
+  sys.exit(1)
+
+if args.v:
+  print "Looking for symbols in:"
+  for s in binary_prefixes:
+    print "  %s" % (s,)
+  print "Stripping source path prefixes:"
+  for s in paths_to_cut:
+    print "  %s" % (s,)
+  print "Using llvm-symbolizer binary in:\n  %s" % (symbolizer_path,)
+  print
+
+symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
+symbolizer.enable_logging(args.d)
+
+for line in sys.stdin:
+  line = line.decode('utf-8')
+  save_access_address(line)
+  if process_stack_history(line, symbolizer, ignore_tags=args.ignore_tags):
+    continue
+  symbolize_line(line, symbolizer_path)


        


More information about the llvm-commits mailing list