0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-11-28 07:59:02 +01:00
mongodb/buildscripts/hang_analyzer.py
Mark Benvenuto 35f827aef4 SERVER-14714: Add stack trace signal handler
SERVER-14181: Dump dbtest & python processes, add timeout
2014-08-05 13:58:33 -04:00

426 lines
13 KiB
Python
Executable File

#!/usr/bin/env python -u
"""Hang Analyzer
A prototype hang analyzer for MCI integration to help investigate test timeouts
1. Script supports taking dumps, and/or dumping a summary of useful information about a process
2. Script will iterate through a list of interesting processes (mongo, mongod, and mongos),
and run the tools from step 1.
Supports Linux, MacOS X, and Windows.
"""
import StringIO
import csv
import itertools
import os
import platform
import signal
import subprocess
import sys
import tempfile
import threading
import time
from distutils import spawn
from optparse import OptionParser
def call(a = []):
sys.stdout.write(str(a) + "\n")
sys.stdout.flush()
ret = subprocess.call(a);
if( ret != 0):
sys.stderr.write("Bad exit code %d\n" % (ret))
raise Exception()
# Copied from python 2.7 version of subprocess.py
def check_output(*popenargs, **kwargs):
r"""Run command with arguments and return its output as a byte string.
If the exit code was non-zero it raises a CalledProcessError. The
CalledProcessError object will have the return code in the returncode
attribute and output in the output attribute.
The arguments are the same as for the Popen constructor. Example:
>>> check_output(["ls", "-l", "/dev/null"])
'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\n'
The stdout argument is not allowed as it is used internally.
To capture standard error in the result, use stderr=STDOUT.
>>> check_output(["/bin/sh", "-c",
... "ls -l non_existent_file ; exit 0"],
... stderr=STDOUT)
'ls: non_existent_file: No such file or directory\n'
"""
if 'stdout' in kwargs:
raise ValueError('stdout argument not allowed, it will be overridden.')
process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
output, unused_err = process.communicate()
retcode = process.poll()
if retcode:
cmd = kwargs.get("args")
if cmd is None:
cmd = popenargs[0]
raise CalledProcessError(retcode, cmd, output=output)
return output
def callo(a = []):
sys.stdout.write(str(a) + "\n")
sys.stdout.flush()
return check_output(a)
def find_program(prog, paths):
"""Finds the specified program in env PATH, or tries a set of paths """
loc = spawn.find_executable(prog)
if(loc != None):
return loc
for loc in paths:
p = os.path.join(loc, prog)
if os.path.exists(p):
return p
return None
class WindowsDumper(object):
def __find_debugger(self):
"""Finds the installed debugger"""
# We are looking for c:\Program Files (x86)\Windows Kits\8.1\Debuggers\x64
cdb = spawn.find_executable('cdb.exe')
if(cdb != None):
return cdb
from win32com.shell import shell, shellcon
# Cygwin via sshd does not expose the normal environment variables
# Use the shell api to get the variable instead
rootDir = shell.SHGetFolderPath(0, shellcon.CSIDL_PROGRAM_FILESX86, None, 0)
for i in range(0,2):
pathToTest = os.path.join(rootDir, "Windows Kits", "8." + str(i), "Debuggers", "x64" );
sys.stdout.write("Checking for debugger in %s\n" % pathToTest)
if(os.path.exists(pathToTest)):
return os.path.join(pathToTest, "cdb.exe")
return None
def dump_info(self, pid, stream):
"""Dump useful information to the console"""
dbg = self.__find_debugger()
if dbg is None:
stream.write("WARNING: Debugger cdb.exe not found, skipping dumping of %d\n" % (pid))
return
stream.write("INFO: Debugger %s, analyzing %d\n" % (dbg, pid))
cmds = [
".symfix", # Fixup symbol path
".reload", # Reload symbols
"!peb", # Dump current exe, & environment variables
"lm", # Dump loaded modoules
"~* k 100", # Dump All Threads
".detach", # Detach
"q" # Quit
]
call([dbg, '-c', ";".join(cmds), '-p', str(pid)])
stream.write("INFO: Done analyzing process\n");
def dump_core(self, pid, output_file):
"""Take a dump of pid to specified file"""
dbg = self.__find_debugger()
if dbg is None:
sys.stdout.write("WARNING: Debugger cdb.exe not found, skipping dumping of %d to %s\n" % (pid, output_file))
return
sys.stdout.write("INFO: Debugger %s, analyzing %d to %s\n" % (dbg, pid, output_file))
call([dbg, '-c', ".dump /ma %s;.detach;q" % output_file, '-p', str(pid)] )
sys.stdout.write("INFO: Done analyzing process\n");
class WindowsProcessList(object):
def __find_ps(self):
"""Finds tasklist """
return os.path.join(os.environ["windir"], "system32", "tasklist.exe")
def dump_processes(self):
"""Get list of [Pid, Process Name]"""
ps = self.__find_ps()
sys.stdout.write("INFO: Getting list of processes using %s\n" % ps)
ret = callo([ps, "/FO", "CSV"])
b = StringIO.StringIO(ret)
csvReader = csv.reader(b)
p = [[int(row[1]), row[0]] for row in csvReader if row[1] != "PID"]
sys.stdout.write("INFO: Done analyzing process\n");
return p
# LLDB dumper is for MacOS X
class LLDBDumper(object):
def __find_debugger(self):
"""Finds the installed debugger"""
return find_program('lldb', ['/usr/bin'])
def dump_info(self, pid, stream):
dbg = self.__find_debugger()
if dbg is None:
stream.write("WARNING: Debugger lldb not found, skipping dumping of %d\n" % (pid))
return
stream.write("INFO: Debugger %s, analyzing %d\n" % (dbg, pid))
call([dbg, "--version"])
cmds = [
"attach -p %d" % pid,
"target modules list",
"thread backtrace all",
"settings set interpreter.prompt-on-quit false",
"quit",
]
tf = tempfile.NamedTemporaryFile()
for c in cmds:
tf.write(c + "\n")
tf.flush()
# Works on in MacOS 10.9 & later
#call([dbg] + list( itertools.chain.from_iterable([['-o', b] for b in cmds])))
call(['cat', tf.name])
call([dbg, '--source', tf.name])
stream.write("INFO: Done analyzing process\n");
def dump_core(self, pid, output_file):
"""Take a dump of pid to specified file"""
sys.stderr.write("ERROR: lldb does not support dumps, stupid debugger\n");
class DarwinProcessList(object):
def __find_ps(self):
"""Finds ps"""
return find_program('ps', ['/bin'])
def dump_processes(self):
"""Get list of [Pid, Process Name]"""
ps = self.__find_ps()
sys.stdout.write("INFO: Getting list of processes using %s\n" % ps)
ret = callo([ps, "-axco", "pid,comm"])
b = StringIO.StringIO(ret)
csvReader = csv.reader(b, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
p = [[int(row[0]), row[1]] for row in csvReader if row[0] != "PID"]
sys.stdout.write("INFO: Done analyzing process\n");
return p
# GDB dumper is for Linux
class GDBDumper(object):
def __find_debugger(self):
"""Finds the installed debugger"""
return find_program('gdb', ['/usr/bin'])
def dump_info(self, pid, stream):
dbg = self.__find_debugger()
if dbg is None:
stream.write("WARNING: Debugger gdb not found, skipping dumping of %d\n" % (pid))
return
stream.write("INFO: Debugger %s, analyzing %d\n" % (dbg, pid))
call([dbg, "--version"])
cmds = [
"attach %d" % pid,
"thread apply all bt",
"set confirm off",
"quit",
]
call([dbg, "--quiet"] + list( itertools.chain.from_iterable([['-ex', b] for b in cmds])))
stream.write("INFO: Done analyzing process\n");
def _find_gcore(self):
"""Finds the installed gcore"""
dbg = "/usr/bin/gcore"
if os.path.exists(dbg):
return dbg
return None
def dump_core(self, pid, output_file):
"""Take a dump of pid to specified file"""
dbg = self._find_gcore()
if dbg is None:
sys.stdout.write("WARNING: Debugger gcore not found, skipping dumping of %d to %s\n" % (pid, output_file))
return
sys.stdout.write("INFO: Debugger %s, analyzing %d to %s\n" % (dbg, pid, output_file))
call([dbg, "-o", output_file, str(pid)]);
sys.stdout.write("INFO: Done analyzing process\n");
# GCore appends the pid to the output file name
return output_file + "." + str(pid)
class LinuxProcessList(object):
def __find_ps(self):
"""Finds ps"""
return find_program('ps', ['/bin', '/usr/bin'])
def dump_processes(self):
"""Get list of [Pid, Process Name]"""
ps = self.__find_ps()
sys.stdout.write("INFO: Getting list of processes using %s\n" % ps)
call([ps, "--version"])
ret = callo([ps, "-eo", "pid,args"])
b = StringIO.StringIO(ret)
csvReader = csv.reader(b, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
p = [[int(row[0]), os.path.split(row[1])[1]] for row in csvReader if row[0] != "PID"]
sys.stdout.write("INFO: Done analyzing process\n");
return p
def get_hang_analyzers():
dbg = None
ps = None
if sys.platform.startswith("linux"):
dbg = GDBDumper()
ps = LinuxProcessList()
elif os.name == 'nt' or (os.name == "posix" and sys.platform == "cygwin"):
dbg = WindowsDumper()
ps = WindowsProcessList()
elif sys.platform == "darwin":
dbg = LLDBDumper()
ps = DarwinProcessList()
return [ps, dbg]
interesting_processes = ["mongo", "mongod", "mongos", "_test", "dbtest", "python"]
def is_interesting_process(p):
for ip in interesting_processes:
if p.find(ip) != -1:
return True
return False
def signal_process(pid):
"""Signal python process with SIGUSR1, N/A on Windows"""
try:
os.kill(pid, signal.SIGUSR1)
print "Waiting for python process to report"
time.sleep(5)
except OSError,e:
print "Hit OS error trying to signal python process: " + str(e)
except AttributeError:
print "Cannot send signal to python on Windows"
def timeout_protector():
print "Script timeout has been hit, terminating"
os.kill(os.getpid(), signal.SIGKILL)
# Basic procedure
#
# 1. Get a list of interesting processes
# 2. Dump useful information or take dumps
def main():
print "Python Version: " + sys.version
print "OS" + platform.platform()
try:
distro = platform.linux_distribution()
print "Linux Distribution: " + str(distro)
except AttributeError:
print "Cannot determine Linux distro since Python is too old"
try:
uid = os.getuid()
print "Current User: " + str(uid)
current_login = os.getlogin()
print "Current Login: " + current_login
except OSError:
print "Cannot determine Unix Current Login"
except AttributeError:
print "Cannot determine Unix Current Login, not supported on Windows"
parser = OptionParser(description=__doc__)
(options, args) = parser.parse_args()
[ps, dbg] = get_hang_analyzers()
if( ps == None or dbg == None):
sys.stderr.write("hang_analyzer.py: Unsupported platform: %s\n" % (sys.platform))
exit(1)
# Make sure the script does not hang
timer = threading.Timer(120, timeout_protector)
timer.start()
processes_orig = ps.dump_processes()
# Pick unit tests which include the phrase "_test"
processes = [a for a in processes_orig
if is_interesting_process(a[1]) and a[0] != os.getpid()]
sys.stdout.write("Found %d interesting processes\n" % len(processes))
if( len(processes) == 0):
for process in processes_orig:
sys.stdout.write("Ignoring process %d of %s\n" % (process[0], process[1]))
else:
# Dump all other processes first since signaling the python script interrupts it
for process in [a for a in processes if a[1] != "python"]:
sys.stdout.write("Dumping process %d of %s\n" % (process[0], process[1]))
dbg.dump_info(process[0], sys.stdout)
for process in [a for a in processes if a[1] == "python"]:
signal_process(process[0])
dbg.dump_info(process[0], sys.stdout)
# Suspend the timer so we can exit cleanly
timer.cancel()
sys.stdout.write("Done analyzing processes for hangs\n")
if __name__ == "__main__":
main()