mirror of
https://github.com/mongodb/mongo.git
synced 2024-11-28 07:59:02 +01:00
35f827aef4
SERVER-14181: Dump dbtest & python processes, add timeout
426 lines
13 KiB
Python
Executable File
426 lines
13 KiB
Python
Executable File
#!/usr/bin/env python -u
|
|
|
|
"""Hang Analyzer
|
|
|
|
A prototype hang analyzer for MCI integration to help investigate test timeouts
|
|
|
|
1. Script supports taking dumps, and/or dumping a summary of useful information about a process
|
|
2. Script will iterate through a list of interesting processes (mongo, mongod, and mongos),
|
|
and run the tools from step 1.
|
|
|
|
Supports Linux, MacOS X, and Windows.
|
|
"""
|
|
|
|
import StringIO
|
|
import csv
|
|
import itertools
|
|
import os
|
|
import platform
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import threading
|
|
import time
|
|
from distutils import spawn
|
|
from optparse import OptionParser
|
|
|
|
def call(a = []):
|
|
sys.stdout.write(str(a) + "\n")
|
|
sys.stdout.flush()
|
|
ret = subprocess.call(a);
|
|
if( ret != 0):
|
|
sys.stderr.write("Bad exit code %d\n" % (ret))
|
|
raise Exception()
|
|
|
|
# Copied from python 2.7 version of subprocess.py
|
|
def check_output(*popenargs, **kwargs):
|
|
r"""Run command with arguments and return its output as a byte string.
|
|
|
|
If the exit code was non-zero it raises a CalledProcessError. The
|
|
CalledProcessError object will have the return code in the returncode
|
|
attribute and output in the output attribute.
|
|
|
|
The arguments are the same as for the Popen constructor. Example:
|
|
|
|
>>> check_output(["ls", "-l", "/dev/null"])
|
|
'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\n'
|
|
|
|
The stdout argument is not allowed as it is used internally.
|
|
To capture standard error in the result, use stderr=STDOUT.
|
|
|
|
>>> check_output(["/bin/sh", "-c",
|
|
... "ls -l non_existent_file ; exit 0"],
|
|
... stderr=STDOUT)
|
|
'ls: non_existent_file: No such file or directory\n'
|
|
"""
|
|
if 'stdout' in kwargs:
|
|
raise ValueError('stdout argument not allowed, it will be overridden.')
|
|
process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
|
|
output, unused_err = process.communicate()
|
|
retcode = process.poll()
|
|
if retcode:
|
|
cmd = kwargs.get("args")
|
|
if cmd is None:
|
|
cmd = popenargs[0]
|
|
raise CalledProcessError(retcode, cmd, output=output)
|
|
return output
|
|
|
|
def callo(a = []):
|
|
sys.stdout.write(str(a) + "\n")
|
|
sys.stdout.flush()
|
|
return check_output(a)
|
|
|
|
def find_program(prog, paths):
|
|
"""Finds the specified program in env PATH, or tries a set of paths """
|
|
loc = spawn.find_executable(prog)
|
|
|
|
if(loc != None):
|
|
return loc
|
|
|
|
for loc in paths:
|
|
p = os.path.join(loc, prog)
|
|
if os.path.exists(p):
|
|
return p
|
|
|
|
return None
|
|
|
|
|
|
class WindowsDumper(object):
|
|
|
|
def __find_debugger(self):
|
|
"""Finds the installed debugger"""
|
|
# We are looking for c:\Program Files (x86)\Windows Kits\8.1\Debuggers\x64
|
|
cdb = spawn.find_executable('cdb.exe')
|
|
if(cdb != None):
|
|
return cdb
|
|
from win32com.shell import shell, shellcon
|
|
|
|
# Cygwin via sshd does not expose the normal environment variables
|
|
# Use the shell api to get the variable instead
|
|
rootDir = shell.SHGetFolderPath(0, shellcon.CSIDL_PROGRAM_FILESX86, None, 0)
|
|
|
|
for i in range(0,2):
|
|
pathToTest = os.path.join(rootDir, "Windows Kits", "8." + str(i), "Debuggers", "x64" );
|
|
sys.stdout.write("Checking for debugger in %s\n" % pathToTest)
|
|
if(os.path.exists(pathToTest)):
|
|
return os.path.join(pathToTest, "cdb.exe")
|
|
return None
|
|
|
|
def dump_info(self, pid, stream):
|
|
"""Dump useful information to the console"""
|
|
dbg = self.__find_debugger()
|
|
|
|
if dbg is None:
|
|
stream.write("WARNING: Debugger cdb.exe not found, skipping dumping of %d\n" % (pid))
|
|
return
|
|
|
|
stream.write("INFO: Debugger %s, analyzing %d\n" % (dbg, pid))
|
|
|
|
cmds = [
|
|
".symfix", # Fixup symbol path
|
|
".reload", # Reload symbols
|
|
"!peb", # Dump current exe, & environment variables
|
|
"lm", # Dump loaded modoules
|
|
"~* k 100", # Dump All Threads
|
|
".detach", # Detach
|
|
"q" # Quit
|
|
]
|
|
|
|
call([dbg, '-c', ";".join(cmds), '-p', str(pid)])
|
|
|
|
stream.write("INFO: Done analyzing process\n");
|
|
|
|
def dump_core(self, pid, output_file):
|
|
"""Take a dump of pid to specified file"""
|
|
dbg = self.__find_debugger()
|
|
|
|
if dbg is None:
|
|
sys.stdout.write("WARNING: Debugger cdb.exe not found, skipping dumping of %d to %s\n" % (pid, output_file))
|
|
return
|
|
|
|
sys.stdout.write("INFO: Debugger %s, analyzing %d to %s\n" % (dbg, pid, output_file))
|
|
|
|
call([dbg, '-c', ".dump /ma %s;.detach;q" % output_file, '-p', str(pid)] )
|
|
|
|
sys.stdout.write("INFO: Done analyzing process\n");
|
|
|
|
class WindowsProcessList(object):
|
|
|
|
def __find_ps(self):
|
|
"""Finds tasklist """
|
|
return os.path.join(os.environ["windir"], "system32", "tasklist.exe")
|
|
|
|
def dump_processes(self):
|
|
"""Get list of [Pid, Process Name]"""
|
|
ps = self.__find_ps()
|
|
|
|
sys.stdout.write("INFO: Getting list of processes using %s\n" % ps)
|
|
|
|
ret = callo([ps, "/FO", "CSV"])
|
|
|
|
b = StringIO.StringIO(ret)
|
|
csvReader = csv.reader(b)
|
|
|
|
p = [[int(row[1]), row[0]] for row in csvReader if row[1] != "PID"]
|
|
|
|
sys.stdout.write("INFO: Done analyzing process\n");
|
|
|
|
return p
|
|
|
|
# LLDB dumper is for MacOS X
|
|
class LLDBDumper(object):
|
|
|
|
def __find_debugger(self):
|
|
"""Finds the installed debugger"""
|
|
return find_program('lldb', ['/usr/bin'])
|
|
|
|
def dump_info(self, pid, stream):
|
|
dbg = self.__find_debugger()
|
|
|
|
if dbg is None:
|
|
stream.write("WARNING: Debugger lldb not found, skipping dumping of %d\n" % (pid))
|
|
return
|
|
|
|
stream.write("INFO: Debugger %s, analyzing %d\n" % (dbg, pid))
|
|
|
|
call([dbg, "--version"])
|
|
|
|
cmds = [
|
|
"attach -p %d" % pid,
|
|
"target modules list",
|
|
"thread backtrace all",
|
|
"settings set interpreter.prompt-on-quit false",
|
|
"quit",
|
|
]
|
|
|
|
tf = tempfile.NamedTemporaryFile()
|
|
|
|
for c in cmds:
|
|
tf.write(c + "\n")
|
|
|
|
tf.flush()
|
|
|
|
# Works on in MacOS 10.9 & later
|
|
#call([dbg] + list( itertools.chain.from_iterable([['-o', b] for b in cmds])))
|
|
call(['cat', tf.name])
|
|
call([dbg, '--source', tf.name])
|
|
|
|
stream.write("INFO: Done analyzing process\n");
|
|
|
|
def dump_core(self, pid, output_file):
|
|
"""Take a dump of pid to specified file"""
|
|
sys.stderr.write("ERROR: lldb does not support dumps, stupid debugger\n");
|
|
|
|
class DarwinProcessList(object):
|
|
|
|
def __find_ps(self):
|
|
"""Finds ps"""
|
|
return find_program('ps', ['/bin'])
|
|
|
|
def dump_processes(self):
|
|
"""Get list of [Pid, Process Name]"""
|
|
ps = self.__find_ps()
|
|
|
|
sys.stdout.write("INFO: Getting list of processes using %s\n" % ps)
|
|
|
|
ret = callo([ps, "-axco", "pid,comm"])
|
|
|
|
b = StringIO.StringIO(ret)
|
|
csvReader = csv.reader(b, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
|
|
|
|
p = [[int(row[0]), row[1]] for row in csvReader if row[0] != "PID"]
|
|
|
|
sys.stdout.write("INFO: Done analyzing process\n");
|
|
|
|
return p
|
|
|
|
# GDB dumper is for Linux
|
|
class GDBDumper(object):
|
|
|
|
def __find_debugger(self):
|
|
"""Finds the installed debugger"""
|
|
return find_program('gdb', ['/usr/bin'])
|
|
|
|
def dump_info(self, pid, stream):
|
|
dbg = self.__find_debugger()
|
|
|
|
if dbg is None:
|
|
stream.write("WARNING: Debugger gdb not found, skipping dumping of %d\n" % (pid))
|
|
return
|
|
|
|
stream.write("INFO: Debugger %s, analyzing %d\n" % (dbg, pid))
|
|
|
|
call([dbg, "--version"])
|
|
|
|
cmds = [
|
|
"attach %d" % pid,
|
|
"thread apply all bt",
|
|
"set confirm off",
|
|
"quit",
|
|
]
|
|
|
|
call([dbg, "--quiet"] + list( itertools.chain.from_iterable([['-ex', b] for b in cmds])))
|
|
|
|
stream.write("INFO: Done analyzing process\n");
|
|
|
|
def _find_gcore(self):
|
|
"""Finds the installed gcore"""
|
|
dbg = "/usr/bin/gcore"
|
|
if os.path.exists(dbg):
|
|
return dbg
|
|
return None
|
|
|
|
def dump_core(self, pid, output_file):
|
|
"""Take a dump of pid to specified file"""
|
|
|
|
dbg = self._find_gcore()
|
|
|
|
if dbg is None:
|
|
sys.stdout.write("WARNING: Debugger gcore not found, skipping dumping of %d to %s\n" % (pid, output_file))
|
|
return
|
|
|
|
sys.stdout.write("INFO: Debugger %s, analyzing %d to %s\n" % (dbg, pid, output_file))
|
|
|
|
call([dbg, "-o", output_file, str(pid)]);
|
|
|
|
sys.stdout.write("INFO: Done analyzing process\n");
|
|
|
|
# GCore appends the pid to the output file name
|
|
return output_file + "." + str(pid)
|
|
|
|
class LinuxProcessList(object):
|
|
def __find_ps(self):
|
|
"""Finds ps"""
|
|
return find_program('ps', ['/bin', '/usr/bin'])
|
|
|
|
def dump_processes(self):
|
|
"""Get list of [Pid, Process Name]"""
|
|
ps = self.__find_ps()
|
|
|
|
sys.stdout.write("INFO: Getting list of processes using %s\n" % ps)
|
|
|
|
call([ps, "--version"])
|
|
|
|
ret = callo([ps, "-eo", "pid,args"])
|
|
|
|
b = StringIO.StringIO(ret)
|
|
csvReader = csv.reader(b, delimiter=' ', quoting=csv.QUOTE_NONE, skipinitialspace=True)
|
|
|
|
p = [[int(row[0]), os.path.split(row[1])[1]] for row in csvReader if row[0] != "PID"]
|
|
|
|
sys.stdout.write("INFO: Done analyzing process\n");
|
|
|
|
return p
|
|
|
|
|
|
def get_hang_analyzers():
|
|
dbg = None
|
|
ps = None
|
|
if sys.platform.startswith("linux"):
|
|
dbg = GDBDumper()
|
|
ps = LinuxProcessList()
|
|
elif os.name == 'nt' or (os.name == "posix" and sys.platform == "cygwin"):
|
|
dbg = WindowsDumper()
|
|
ps = WindowsProcessList()
|
|
elif sys.platform == "darwin":
|
|
dbg = LLDBDumper()
|
|
ps = DarwinProcessList()
|
|
return [ps, dbg]
|
|
|
|
|
|
interesting_processes = ["mongo", "mongod", "mongos", "_test", "dbtest", "python"]
|
|
|
|
def is_interesting_process(p):
|
|
for ip in interesting_processes:
|
|
if p.find(ip) != -1:
|
|
return True
|
|
return False
|
|
|
|
|
|
def signal_process(pid):
|
|
"""Signal python process with SIGUSR1, N/A on Windows"""
|
|
try:
|
|
os.kill(pid, signal.SIGUSR1)
|
|
|
|
print "Waiting for python process to report"
|
|
time.sleep(5)
|
|
except OSError,e:
|
|
print "Hit OS error trying to signal python process: " + str(e)
|
|
|
|
except AttributeError:
|
|
print "Cannot send signal to python on Windows"
|
|
|
|
|
|
def timeout_protector():
|
|
print "Script timeout has been hit, terminating"
|
|
os.kill(os.getpid(), signal.SIGKILL)
|
|
|
|
|
|
# Basic procedure
|
|
#
|
|
# 1. Get a list of interesting processes
|
|
# 2. Dump useful information or take dumps
|
|
def main():
|
|
print "Python Version: " + sys.version
|
|
print "OS" + platform.platform()
|
|
|
|
try:
|
|
distro = platform.linux_distribution()
|
|
print "Linux Distribution: " + str(distro)
|
|
except AttributeError:
|
|
print "Cannot determine Linux distro since Python is too old"
|
|
|
|
try:
|
|
uid = os.getuid()
|
|
print "Current User: " + str(uid)
|
|
current_login = os.getlogin()
|
|
print "Current Login: " + current_login
|
|
except OSError:
|
|
print "Cannot determine Unix Current Login"
|
|
except AttributeError:
|
|
print "Cannot determine Unix Current Login, not supported on Windows"
|
|
|
|
parser = OptionParser(description=__doc__)
|
|
(options, args) = parser.parse_args()
|
|
|
|
[ps, dbg] = get_hang_analyzers()
|
|
|
|
if( ps == None or dbg == None):
|
|
sys.stderr.write("hang_analyzer.py: Unsupported platform: %s\n" % (sys.platform))
|
|
exit(1)
|
|
|
|
# Make sure the script does not hang
|
|
timer = threading.Timer(120, timeout_protector)
|
|
timer.start()
|
|
|
|
processes_orig = ps.dump_processes()
|
|
|
|
# Pick unit tests which include the phrase "_test"
|
|
processes = [a for a in processes_orig
|
|
if is_interesting_process(a[1]) and a[0] != os.getpid()]
|
|
|
|
sys.stdout.write("Found %d interesting processes\n" % len(processes))
|
|
|
|
if( len(processes) == 0):
|
|
for process in processes_orig:
|
|
sys.stdout.write("Ignoring process %d of %s\n" % (process[0], process[1]))
|
|
else:
|
|
# Dump all other processes first since signaling the python script interrupts it
|
|
for process in [a for a in processes if a[1] != "python"]:
|
|
sys.stdout.write("Dumping process %d of %s\n" % (process[0], process[1]))
|
|
dbg.dump_info(process[0], sys.stdout)
|
|
|
|
for process in [a for a in processes if a[1] == "python"]:
|
|
signal_process(process[0])
|
|
|
|
dbg.dump_info(process[0], sys.stdout)
|
|
|
|
# Suspend the timer so we can exit cleanly
|
|
timer.cancel()
|
|
|
|
sys.stdout.write("Done analyzing processes for hangs\n")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|