0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 01:21:03 +01:00
mongodb/buildscripts/hang_analyzer.py

549 lines
18 KiB
Python
Executable File

#!/usr/bin/env python
"""Hang Analyzer
A prototype hang analyzer for Evergreen integration to help investigate test timeouts
1. Script supports taking dumps, and/or dumping a summary of useful information about a process
2. Script will iterate through a list of interesting processes,
and run the tools from step 1. The list of processes can be provided as an option.
3. Java processes will be dumped using jstack, if available.
Supports Linux, MacOS X, Solaris, and Windows.
"""
import StringIO
import csv
import glob
import itertools
import os
import platform
import re
import signal
import subprocess
import sys
import tempfile
import threading
import time
from distutils import spawn
from optparse import OptionParser
if sys.platform == "win32":
import win32process
def call(a=None):
    """Echo a command line to stdout, run it, and raise if it fails.

    Args:
        a: Argument list handed to subprocess.call. Defaults to an empty list.

    Raises:
        Exception: if the command exits with a non-zero status. The message
            names the command and its exit code.
    """
    # A None sentinel avoids sharing one mutable list object across calls.
    if a is None:
        a = []

    sys.stdout.write(str(a) + "\n")
    # Push our echo out before the child process starts writing.
    sys.stdout.flush()

    ret = subprocess.call(a)
    if ret != 0:
        sys.stderr.write("Bad exit code %d\n" % (ret))
        # Still a plain Exception so existing callers keep working, but now it
        # says what actually failed.
        raise Exception("Command %s failed with exit code %d" % (a, ret))
# Copied from python 2.7 version of subprocess.py
def check_output(*popenargs, **kwargs):
    r"""Run command with arguments and return its output as a byte string.

    If the exit code was non-zero it raises a CalledProcessError.  The
    CalledProcessError object will have the return code in the returncode
    attribute and output in the output attribute.

    The arguments are the same as for the Popen constructor.  Example:

    >>> check_output(["ls", "-l", "/dev/null"])
    'crw-rw-rw- 1 root root 1, 3 Oct 18  2007 /dev/null\n'

    The stdout argument is not allowed as it is used internally.
    To capture standard error in the result, use stderr=STDOUT.

    >>> check_output(["/bin/sh", "-c",
    ...               "ls -l non_existent_file ; exit 0"],
    ...              stderr=STDOUT)
    'ls: non_existent_file: No such file or directory\n'

    Raises:
        ValueError: if a ``stdout`` keyword argument is supplied.
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    if 'stdout' in kwargs:
        raise ValueError('stdout argument not allowed, it will be overridden.')

    process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
    output, _ = process.communicate()
    retcode = process.poll()
    if retcode:
        cmd = kwargs.get("args")
        if cmd is None:
            cmd = popenargs[0]
        # The exception class lives in the subprocess module; the bare name is
        # not defined in this file.  The output is attached as an attribute
        # instead of a constructor keyword so that interpreters whose
        # CalledProcessError only takes (returncode, cmd) also work.
        error = subprocess.CalledProcessError(retcode, cmd)
        error.output = output
        raise error
    return output
def callo(a = []):
    """Echo the command line to stdout, then return what check_output captures for it."""
    echoed = "%s\n" % (str(a),)
    sys.stdout.write(echoed)
    sys.stdout.flush()
    captured = check_output(a)
    return captured
def find_program(prog, paths):
    """Locate prog via spawn.find_executable, falling back to the given directories.

    Returns the first hit, or None when neither the executable search nor any
    directory in paths yields an existing entry named prog.
    """
    found = spawn.find_executable(prog)
    if found is not None:
        return found

    candidates = (os.path.join(directory, prog) for directory in paths)
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    return None
class WindowsDumper(object):
    """Inspects and dumps processes by driving cdb.exe."""

    def __find_debugger(self):
        """Return the path to cdb.exe, or None when it cannot be located."""
        # We are looking for c:\Program Files (x86)\Windows Kits\8.1\Debuggers\x64
        on_path = spawn.find_executable('cdb.exe')
        if on_path is not None:
            return on_path

        from win32com.shell import shell, shellcon

        # Cygwin via sshd does not expose the normal environment variables
        # Use the shell api to get the variable instead
        program_files_x86 = shell.SHGetFolderPath(0, shellcon.CSIDL_PROGRAM_FILESX86, None, 0)

        # Probe the 8.0 kit first, then 8.1.
        for minor in (0, 1):
            kit_dir = os.path.join(program_files_x86, "Windows Kits", "8." + str(minor), "Debuggers", "x64")
            sys.stdout.write("Checking for debugger in %s\n" % kit_dir)
            if os.path.exists(kit_dir):
                return os.path.join(kit_dir, "cdb.exe")

        return None

    def dump_info(self, pid, process_name, stream, take_dump = False):
        """Attach cdb to pid and print modules and thread stacks; optionally write a dump."""
        debugger = self.__find_debugger()
        if debugger is None:
            stream.write("WARNING: Debugger cdb.exe not found, skipping dumping of %d\n" % (pid))
            return

        stream.write("INFO: Debugger %s, analyzing %d\n" % (debugger, pid))

        # Dump to file, dump_<process name>_<time stamp>_<pid in hex>.<pid>.mdmp
        if take_dump:
            dump_command = ".dump /ma /u dump_" + process_name + "." + str(pid) + "." + self.get_dump_ext()
        else:
            # An empty entry is kept in the command list when no dump is taken.
            dump_command = ""

        debugger_commands = [
            ".symfix",  # Fixup symbol path
            ".reload",  # Reload symbols
            "!peb",  # Dump current exe, & environment variables
            "lm",  # Dump loaded modules
            "~* k 100",  # Dump All Threads
            dump_command,
            ".detach",  # Detach
            "q",  # Quit
        ]
        script = ";".join(debugger_commands)

        call([debugger, '-c', script, '-p', str(pid)])

        stream.write("INFO: Done analyzing process\n")

    def get_dump_ext(self):
        """Return the file extension used for dumps written by this dumper."""
        return "mdmp"

    def dump_core(self, pid, output_file):
        """Take a dump of pid to specified file"""
        debugger = self.__find_debugger()
        if debugger is None:
            sys.stdout.write("WARNING: Debugger cdb.exe not found, skipping dumping of %d to %s\n" % (pid, output_file))
            return

        sys.stdout.write("INFO: Debugger %s, analyzing %d to %s\n" % (debugger, pid, output_file))

        script = ".dump /ma %s;.detach;q" % output_file
        call([debugger, '-c', script, '-p', str(pid)])

        sys.stdout.write("INFO: Done analyzing process\n")
class WindowsProcessList(object):
    """Lists running processes by parsing the CSV output of tasklist.exe."""

    def __find_ps(self):
        """Return the tasklist.exe path under the WINDIR system32 directory."""
        windows_dir = os.environ["WINDIR"]
        return os.path.join(windows_dir, "system32", "tasklist.exe")

    def dump_processes(self):
        """Get list of [Pid, Process Name]"""
        tasklist = self.__find_ps()

        sys.stdout.write("INFO: Getting list of processes using %s\n" % tasklist)

        listing = callo([tasklist, "/FO", "CSV"])

        found = []
        for row in csv.reader(StringIO.StringIO(listing)):
            # The header row carries the literal column title instead of a pid.
            if row[1] == "PID":
                continue
            found.append([int(row[1]), row[0]])

        sys.stdout.write("INFO: Done analyzing process\n")

        return found
# LLDB dumper is for MacOS X
class LLDBDumper(object):
    """Inspects processes by driving lldb from a temporary command file."""

    def __find_debugger(self):
        """Finds the installed debugger"""
        return find_program('lldb', ['/usr/bin'])

    def dump_info(self, pid, process_name, stream, take_dump=False):
        """Attach lldb to pid and print modules and all thread backtraces.

        Args:
            pid: Id of the process to attach to.
            process_name: Used to build the core file name when take_dump is set.
            stream: Writable object that receives the INFO/WARNING messages.
            take_dump: When true, also run "process save-core".
        """
        dbg = self.__find_debugger()

        if dbg is None:
            stream.write("WARNING: Debugger lldb not found, skipping dumping of %d\n" % (pid))
            return

        stream.write("INFO: Debugger %s, analyzing %d\n" % (dbg, pid))

        lldb_version = callo([dbg, "--version"])

        stream.write(lldb_version)

        # Do we have the XCode or LLVM version of lldb?
        # Old versions of lldb do not work well when taking commands via a file
        # XCode (7.2): lldb-340.4.119
        # LLVM - lldb version 3.7.0 ( revision )

        if 'version' not in lldb_version:
            # We have XCode's lldb
            lldb_version = lldb_version[lldb_version.index("lldb-"):]
            lldb_version = lldb_version.replace('lldb-', '')
            lldb_major_version = int(lldb_version[:lldb_version.index('.')])
            if lldb_major_version < 340:
                stream.write("WARNING: Debugger lldb is too old, please upgrade to XCode 7.2\n")
                return

        cmds = [
            "attach -p %d" % pid,
            "target modules list",
            "thread backtrace all",
            "process save-core dump_" + process_name + "." + str(pid) + "." + self.get_dump_ext() if take_dump else "",
            "settings set interpreter.prompt-on-quit false",
            "quit",
        ]

        # The context manager closes (and thereby removes) the command file as
        # soon as lldb is done with it, even when one of the calls raises.
        with tempfile.NamedTemporaryFile() as tf:
            for c in cmds:
                tf.write(c + "\n")

            # Make sure the commands are on disk before cat/lldb read the file.
            tf.flush()

            # Passing every command with '-o' is noted to work only on
            # MacOS 10.9 & later, so a command file is used instead.
            call(['cat', tf.name])
            call([dbg, '--source', tf.name])

        stream.write("INFO: Done analyzing process\n")

    def get_dump_ext(self):
        """Return the file extension used for dumps written by this dumper."""
        return "core"

    def dump_core(self, pid, output_file):
        """Take a dump of pid to specified file"""
        sys.stderr.write("ERROR: lldb does not support dumps, stupid debugger\n")
class DarwinProcessList(object):
    """Lists running processes by parsing the output of ps."""

    def __find_ps(self):
        """Return the path of ps, searching PATH and then /bin."""
        return find_program('ps', ['/bin'])

    def dump_processes(self):
        """Get list of [Pid, Process Name]"""
        ps_path = self.__find_ps()

        sys.stdout.write("INFO: Getting list of processes using %s\n" % ps_path)

        listing = callo([ps_path, "-axco", "pid,comm"])

        rows = csv.reader(
            StringIO.StringIO(listing),
            delimiter=' ',
            quoting=csv.QUOTE_NONE,
            skipinitialspace=True)

        found = []
        for row in rows:
            # Skip the column-title line that ps prints first.
            if row[0] == "PID":
                continue
            found.append([int(row[0]), row[1]])

        sys.stdout.write("INFO: Done analyzing process\n")

        return found
# GDB dumper is for Linux & Solaris
class GDBDumper(object):
    """Inspects processes with gdb and writes core files with gcore."""

    def __find_debugger(self):
        """Return the gdb path, preferring PATH, then the toolchain dir, then /usr/bin."""
        search_dirs = ['/opt/mongodbtoolchain/gdb/bin', '/usr/bin']
        return find_program('gdb', search_dirs)

    def dump_info(self, pid, process_name, stream, take_dump = False):
        """Attach gdb to pid and print shared libraries and all thread backtraces."""
        gdb = self.__find_debugger()
        if gdb is None:
            stream.write("WARNING: Debugger gdb not found, skipping dumping of %d\n" % (pid))
            return

        stream.write("INFO: Debugger %s, analyzing %d\n" % (gdb, pid))

        call([gdb, "--version"])

        if take_dump:
            core_command = "gcore dump_" + process_name + "." + str(pid) + "." + self.get_dump_ext()
        else:
            # An empty entry is kept in the command list when no dump is taken.
            core_command = ""

        gdb_commands = [
            "set pagination off",
            "attach %d" % pid,
            "info sharedlibrary",
            "thread apply all bt",
            core_command,
            "set confirm off",
            "quit",
        ]

        # Every command is handed to gdb as its own '-ex' argument.
        argv = [gdb, "--quiet"]
        for command in gdb_commands:
            argv.extend(['-ex', command])
        call(argv)

        stream.write("INFO: Done analyzing process\n")

    def get_dump_ext(self):
        """Return the file extension used for dumps written by this dumper."""
        return "core"

    def _find_gcore(self):
        """Return /usr/bin/gcore when it exists, otherwise None."""
        gcore_path = "/usr/bin/gcore"
        return gcore_path if os.path.exists(gcore_path) else None

    def dump_core(self, pid, output_file):
        """Take a dump of pid to specified file"""
        gcore = self._find_gcore()
        if gcore is None:
            sys.stdout.write("WARNING: Debugger gcore not found, skipping dumping of %d to %s\n" % (pid, output_file))
            return

        sys.stdout.write("INFO: Debugger %s, analyzing %d to %s\n" % (gcore, pid, output_file))

        call([gcore, "-o", output_file, str(pid)])

        sys.stdout.write("INFO: Done analyzing process\n")

        # GCore appends the pid to the output file name
        return "%s.%s" % (output_file, str(pid))
class LinuxProcessList(object):
    """Lists running processes by parsing the output of ps."""

    def __find_ps(self):
        """Return the path of ps, searching PATH, then /bin and /usr/bin."""
        return find_program('ps', ['/bin', '/usr/bin'])

    def dump_processes(self):
        """Get list of [Pid, Process Name]"""
        ps_path = self.__find_ps()

        sys.stdout.write("INFO: Getting list of processes using %s\n" % ps_path)

        call([ps_path, "--version"])

        listing = callo([ps_path, "-eo", "pid,args"])

        rows = csv.reader(
            StringIO.StringIO(listing),
            delimiter=' ',
            quoting=csv.QUOTE_NONE,
            skipinitialspace=True)

        found = []
        for row in rows:
            # Skip the column-title line that ps prints first.
            if row[0] == "PID":
                continue
            # Only the final path component of the first args field is kept.
            executable = os.path.split(row[1])[1]
            found.append([int(row[0]), executable])

        sys.stdout.write("INFO: Done analyzing process\n")

        return found
class SolarisProcessList(object):
    """Lists running processes by parsing the output of ps."""

    def __find_ps(self):
        """Return the path of ps, searching PATH, then /bin and /usr/bin."""
        fallback_dirs = ['/bin', '/usr/bin']
        return find_program('ps', fallback_dirs)

    def dump_processes(self):
        """Get list of [Pid, Process Name]"""
        ps_path = self.__find_ps()

        sys.stdout.write("INFO: Getting list of processes using %s\n" % ps_path)

        raw_listing = callo([ps_path, "-eo", "pid,args"])
        buf = StringIO.StringIO(raw_listing)
        reader = csv.reader(buf, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)

        def to_entry(row):
            # Pair the numeric pid with the final path component of the command.
            return [int(row[0]), os.path.split(row[1])[1]]

        # The column-title line is dropped before conversion.
        entries = [to_entry(row) for row in reader if row[0] != "PID"]

        sys.stdout.write("INFO: Done analyzing process\n")

        return entries
# jstack is a JDK utility
class JstackDumper(object):
    """Prints Java thread stacks by running jstack against a process."""

    def __find_debugger(self):
        """Return the jstack path, searching PATH and then /usr/bin."""
        return find_program('jstack', ['/usr/bin'])

    def dump_info(self, pid, process_name, stream, take_dump = False):
        """Dump java thread stack traces to the console"""
        jstack_path = self.__find_debugger()

        if jstack_path is not None:
            stream.write("INFO: Debugger %s, analyzing %d\n" % (jstack_path, pid))

            call([jstack_path, "-l", str(pid)])

            stream.write("INFO: Done analyzing process\n")
            return

        stream.write("WARNING: Debugger jstack not found, skipping dumping of %d\n" % (pid))

    def dump_core(self, pid, output_file):
        """Report on stderr that jstack cannot write dumps; nothing is dumped."""
        message = "ERROR: jstack does not support dumps\n"
        sys.stderr.write(message)
# jstack is a JDK utility; this variant only reports that it is unsupported
class JstackWindowsDumper(object):
    """Placeholder dumper that reports jstack as unsupported."""

    def dump_info(self, pid, process_name, stream, take_dump=False):
        """Write a warning that jstack is not supported; no process is inspected.

        Args:
            pid: Id of the process that is being skipped (shown in the warning).
            process_name: Unused.
            stream: Writable object that receives the warning.
            take_dump: Unused. Accepted so this dumper can be called with the
                same arguments as the other dumpers' dump_info methods.
        """
        stream.write("WARNING: Debugger jstack not supported, skipping dumping of %d\n" % (pid))
def get_hang_analyzers():
    """Pick the process lister, native dumper and jstack dumper for this platform.

    Returns a three-element list [ps, dbg, jstack]; every element is None when
    the platform matches none of the supported ones.
    """
    current = sys.platform

    if current.startswith("linux"):
        return [LinuxProcessList(), GDBDumper(), JstackDumper()]

    if current.startswith("sunos"):
        return [SolarisProcessList(), GDBDumper(), JstackDumper()]

    # Native Windows, or a cygwin Python.
    if os.name == 'nt' or (os.name == "posix" and current == "cygwin"):
        return [WindowsProcessList(), WindowsDumper(), JstackWindowsDumper()]

    if current == "darwin":
        return [DarwinProcessList(), LLDBDumper(), JstackDumper()]

    return [None, None, None]
def check_dump_quota(quota, ext):
    """Return True when the files matching "*.<ext>" in the current directory fit in quota.

    The sizes come from os.path.getsize, so quota is compared against a byte count.
    """
    pattern = "*." + ext
    total_bytes = sum(os.path.getsize(path) for path in glob.glob(pattern))
    return total_bytes <= quota
def signal_process(pid):
    """Send SIGUSR1 to pid and pause five seconds; failures are printed, not raised."""
    try:
        # Looking up SIGUSR1 stays inside the try: a platform without it lands
        # in the AttributeError branch below.
        usr1 = signal.SIGUSR1
        os.kill(pid, usr1)

        print("Waiting for python process to report")
        time.sleep(5)
    except OSError as err:
        print("Hit OS error trying to signal python process: " + str(err))
    except AttributeError:
        print("Cannot send signal to python on Windows")
def timeout_protector():
    """Announce that the script timed out and terminate the current process."""
    print("Script timeout has been hit, terminating")

    if sys.platform != "win32":
        os.kill(os.getpid(), signal.SIGKILL)
        return

    # Have the process exit with code 9 when it terminates itself to closely match the exit code
    # of the process when it sends itself a SIGKILL.
    win32process.TerminateProcess(win32process.GetCurrentProcess(), 9)
# Basic procedure
#
# 1. Get a list of interesting processes
# 2. Dump useful information or take dumps
def main():
    """Print environment details, then analyze every interesting running process.

    Command line options:
        -p/--process_names: comma-separated names that replace the default list.
        -s/--max_core_dumps_size: limit, in megabytes, on the total size of
            existing dump files beyond which no further dumps are taken.

    Exits with status 1 when the platform has no process lister or no dumper.
    A 120 second timer terminates the script if the analysis itself hangs.
    """
    print("Python Version: " + sys.version)
    print("OS: " + platform.platform())

    try:
        distro = platform.linux_distribution()
        print("Linux Distribution: " + str(distro))
    except AttributeError:
        print("Cannot determine Linux distro since Python is too old")

    try:
        uid = os.getuid()
        print("Current User: " + str(uid))
        current_login = os.getlogin()
        print("Current Login: " + current_login)
    except OSError:
        print("Cannot determine Unix Current Login")
    except AttributeError:
        print("Cannot determine Unix Current Login, not supported on Windows")

    interesting_processes = ["mongo", "mongod", "mongos", "_test", "dbtest", "python", "java"]

    parser = OptionParser(description=__doc__)
    parser.add_option('-p', '--process_names', dest='process_names', help='List of process names to analyze')
    parser.add_option('-s', '--max_core_dumps_size', dest='max_core_dumps_size', default=10000, help='Maximum total size of core dumps to keep in megabytes')

    (options, args) = parser.parse_args()

    if options.process_names is not None:
        interesting_processes = options.process_names.split(',')

    [ps, dbg, jstack] = get_hang_analyzers()

    if ps is None or (dbg is None and jstack is None):
        sys.stderr.write("hang_analyzer.py: Unsupported platform: %s\n" % (sys.platform))
        # sys.exit rather than the interactive-only exit() helper.
        sys.exit(1)

    # Make sure the script does not hang
    timer = threading.Timer(120, timeout_protector)
    timer.start()

    # Cancel the watchdog on every exit path: if a debugger call raises, the
    # script should end with that error right away instead of waiting for the
    # timer to fire and kill the process.
    try:
        processes_orig = ps.dump_processes()

        # Find all running interesting processes, leaving out this script itself.
        own_pid = os.getpid()
        processes = [a for a in processes_orig
                     if any(a[1].find(ip) >= 0 for ip in interesting_processes) and a[0] != own_pid]

        sys.stdout.write("Found %d interesting processes\n" % len(processes))

        # The option is given in megabytes; file sizes are compared in bytes.
        max_dump_size_bytes = int(options.max_core_dumps_size) * 1024 * 1024

        if not processes:
            for process in processes_orig:
                sys.stdout.write("Ignoring process %d of %s\n" % (process[0], process[1]))
        else:
            # Dump all other processes, except python & java.
            for process in [a for a in processes if not re.match("^(java|python)", a[1])]:
                sys.stdout.write("Dumping process %d of %s\n" % (process[0], process[1]))
                dbg.dump_info(process[0], process[1], sys.stdout, check_dump_quota(max_dump_size_bytes, dbg.get_dump_ext()))

            # Dump java processes using jstack.
            for process in [a for a in processes if a[1].startswith("java")]:
                sys.stdout.write("Dumping process %d of %s\n" % (process[0], process[1]))
                jstack.dump_info(process[0], process[1], sys.stdout)

            # Dump python processes after signalling them.
            for process in [a for a in processes if a[1].startswith("python")]:
                signal_process(process[0])
                dbg.dump_info(process[0], process[1], sys.stdout, check_dump_quota(max_dump_size_bytes, dbg.get_dump_ext()))
    finally:
        # Suspend the timer so we can exit cleanly
        timer.cancel()

    sys.stdout.write("Done analyzing processes for hangs\n")


if __name__ == "__main__":
    main()