SERVER-22035 Introduce mongosymb.py stack trace symbolizer.

2024-12-01 09:32:32 +01:00 · 2015-12-18 18:20:18 -05:00 · 2015-12-18 18:20:18 -05:00 · fffc3c12ac
commit fffc3c12ac
parent b45dba8425
1 changed files with 183 additions and 0 deletions
--- a/buildscripts/mongosymb.py
+++ b/buildscripts/mongosymb.py
@ -0,0 +1,183 @@
+#!/usr/bin/env python
+"""Script and library for symbolizing MongoDB stack traces.
+
+To use as a script, paste the JSON object on the line after ----- BEGIN BACKTRACE ----- into the
+standard input of this script. There are numerous caveats. In the default mode, you need
+to pass in the path to the executable being symbolized, and if you want shared library stack
+traces, you must be on the same system.
+
+There is largely untested support for extracting debug information from S3 buckets. This work
+is experimental.
+
+Sample usage:
+
+mongosymb.py --symbolizer-path=/path/to/llvm-symbolizer /path/to/executable </file/with/stacktrace
+
+You can also pass --output-format=json, to get rich json output. It shows some extra information,
+but emits json instead of plain text.
+"""
+
+import json
+import optparse
+import os
+import subprocess
+import sys
+
+def symbolize_frames(trace_doc, dbg_path_resolver, symbolizer_path=None, dsym_hint=None):
+    """Given a trace_doc in MongoDB stack dump format, returns a list of symbolized stack frames.
+    """
+
+    if symbolizer_path is None:
+        symbolizer_path = os.environ.get("MONGOSYMB_SYMBOLIZER_PATH", "llvm-symbolizer")
+    if dsym_hint is None:
+        dsym_hint = []
+
+    def make_base_addr_map(somap_list):
+        """Makes a map from binary load address to description of library from the somap, which is
+        a list of dictionaries describing individual loaded libraries.
+        """
+        base_addr_map = {}
+        for so_entry in somap_list:
+            base_addr_map[so_entry["b"]] = so_entry
+        return base_addr_map
+
+    base_addr_map = make_base_addr_map(trace_doc["processInfo"]["somap"])
+
+    frames = []
+    for frame in trace_doc["backtrace"]:
+        soinfo = base_addr_map.get(frame["b"], {})
+        elf_type = soinfo.get("elfType", 0)
+        if elf_type == 3:
+            addr_base = "0"
+        elif elf_type == 2:
+            addr_base = frame["b"]
+        else:
+            addr_base = soinfo.get("vmaddr", "0")
+        addr = long(addr_base, 16) + long(frame["o"], 16)
+        frames.append(dict(path=dbg_path_resolver.get_dbg_file(soinfo),
+                           buildId=soinfo.get("buildId", None),
+                           offset=frame["o"],
+                           addr=addr,
+                           symbol=frame.get("s", None)))
+
+    symbolizer_args = [symbolizer_path]
+    for dh in dsym_hint:
+        symbolizer_args.append("-dsym-hint=%s" %dh)
+    symbolizer_process = subprocess.Popen(
+        args=symbolizer_args,
+        close_fds=True,
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        stderr=open("/dev/null"))
+
+    def extract_symbols(stdin):
+        """Extracts symbol information from the output of llvm-symbolizer.
+
+        Returns a list of dictionaries, each of which has fn, file, column and line entries.
+
+        The format of llvm-symbolizer output is that for every CODE line of input,
+        it outputs zero or more pairs of lines, and then a blank line. This way, if
+        a CODE line of input maps to several inlined functions, you can use the blank
+        line to find the end of the list of symbols corresponding to the CODE line.
+
+        The first line of each pair contains the function name, and the second contains the file,
+        column and line information.
+        """
+        result = []
+        step = 0
+        while True:
+            line = stdin.readline().decode()
+            if line == "\n":
+                break
+            if step == 0:
+                result.append({"fn" : line.strip()})
+                step = 1
+            else:
+                file_name, line, column = line.strip().rsplit(':', 3)
+                result[-1].update({"file": file_name, "column": int(column), "line": int(line)})
+                step = 0
+        return result
+
+    for frame in frames:
+        if frame["path"] is None:
+            continue
+        symbolizer_process.stdin.write("CODE %(path)s 0x%(addr)X\n" % frame)
+        symbolizer_process.stdin.flush()
+        frame["symbinfo"] = extract_symbols(symbolizer_process.stdout)
+    symbolizer_process.stdin.close()
+    symbolizer_process.wait()
+    return frames
+
+class path_dbg_file_resolver(object):
+    def __init__(self, bin_path_guess):
+        self._bin_path_guess = bin_path_guess
+
+    def get_dbg_file(self, soinfo):
+        return soinfo.get("path", self._bin_path_guess)
+
+class s3_buildid_dbg_file_resolver(object):
+    def __init__(self, cache_dir, s3_bucket):
+        self._cache_dir = cache_dir
+        self._s3_bucket = s3_bucket
+
+    def get_dbg_file(self, soinfo):
+        buildId = soinfo.get("buildId", None)
+        if buildId is None:
+            return None
+        buildId = buildId.lower()
+        buildIdPath = os.path.join(self._cache_dir, buildId + ".debug")
+        if not os.path.exists(buildIdPath):
+            try:
+                self._get_from_s3(buildId)
+            except:
+                ex = sys.exc_info()[0]
+                sys.stderr.write("Failed to find debug symbols for %s in s3: %s\n" %(buildId, ex))
+                return None
+        if not os.path.exists(buildIdPath):
+            return None
+        return buildIdPath
+
+    def _get_from_s3(self, buildId):
+        subprocess.check_call(
+            ['wget', 'https://s3.amazonaws.com/%s/%s.debug.gz' % (self._s3_bucket, buildId)],
+            cwd=self._cache_dir)
+        subprocess.check_call(['gunzip', buildId + ".debug.gz"], cwd=self._cache_dir)
+
+def classic_output(frames, outfile, **kwargs):
+    for frame in frames:
+        symbinfo = frame["symbinfo"]
+        if len(symbinfo) > 0:
+            for sframe in symbinfo:
+                outfile.write(" %(file)s:%(line)s  %(fn)s\n" % sframe)
+        else:
+            outfile.write(" %(path)s!!!\n" % symbinfo)
+
+def main(argv):
+    parser = optparse.OptionParser()
+    parser.add_option("--dsym-hint", action="append", dest="dsym_hint")
+    parser.add_option("--symbolizer-path", dest="symbolizer_path", default=None)
+    parser.add_option("--debug-file-resolver", dest="debug_file_resolver", default="path")
+    parser.add_option("--output-format", dest="output_format", default="classic")
+    (options, args) = parser.parse_args(argv)
+    resolver_constructor = dict(path=path_dbg_file_resolver, s3=s3_buildid_dbg_file_resolver).get(
+        options.debug_file_resolver, None)
+    if resolver_constructor is None:
+        sys.stderr.write("Invalid debug-file-resolver argument: %s\n" % options.debug_file_resolver)
+        sys.exit(1)
+
+    output_fn = dict(json=json.dump, classic=classic_output).get(options.output_format, None)
+    if output_fn is None:
+        sys.stderr.write("Invalid output-format argument: %s\n" % options.output_format)
+        sys.exit(1)
+
+    resolver = resolver_constructor(*args[1:])
+    trace_doc = json.load(sys.stdin)
+    frames = symbolize_frames(trace_doc,
+                              resolver,
+                              symbolizer_path=options.symbolizer_path,
+                              dsym_hint=options.dsym_hint)
+    output_fn(frames, sys.stdout, indent=2)
+
+if __name__ == '__main__':  # Executed as a script rather than imported as a library.
+    main(sys.argv)
+    sys.exit(0)  # Reached when main() returns normally; main() exits with 1 on bad options.