
SERVER-53693 Improve local powercycle testing experience

Robert Guo 2021-02-16 03:34:40 -05:00 committed by Evergreen Agent
parent 69f865ae18
commit aafa095c4d
12 changed files with 452 additions and 567 deletions

View File

@@ -1 +0,0 @@
"""Empty."""

View File

@@ -1,14 +0,0 @@
"""Command-line entry-point for powercycle_operations."""
from buildscripts.powercycle_setup import parser
def main(argv):
"""
Execute Main function for powercycle_operations.
:param argv: sys.argv
:return: None
"""
subcommand = parser.parse_command_line(argv[1:])
subcommand.execute()

View File

@@ -1,44 +0,0 @@
"""Parser for command line arguments."""
import argparse
import buildscripts.powercycle_setup.plugins as plugins
_PLUGINS = [plugins.PowercyclePlugin()]
def _add_subcommands():
"""Create and return the command line arguments parser."""
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="command")
# Add sub-commands.
for plugin in _PLUGINS:
plugin.add_subcommand(subparsers)
return parser
def parse(sys_args):
"""Parse the CLI args."""
# Split out this function for easier testing.
parser = _add_subcommands()
parsed_args = parser.parse_args(sys_args)
return parser, parsed_args
def parse_command_line(sys_args, **kwargs):
"""Parse the command line arguments passed to powercycle_operations.py and return the subcommand object to execute."""
parser, parsed_args = parse(sys_args)
subcommand = parsed_args.command
for plugin in _PLUGINS:
subcommand_obj = plugin.parse(subcommand, parser, parsed_args, **kwargs)
if subcommand_obj is not None:
return subcommand_obj
raise RuntimeError(
f"Powercycle configuration has invalid subcommand: {subcommand}. Try '--help'")

View File

@@ -1,422 +0,0 @@
"""Set up powercycle remote operations."""
import getpass
import os
import re
import shlex
import subprocess
import sys
import yaml
from buildscripts.powercycle_setup.remote_operations import RemoteOperations, SSHOperation
from buildscripts.resmokelib.plugin import PluginInterface, Subcommand
from buildscripts.resmokelib.powercycle import powercycle_constants
class PowercycleCommand(Subcommand): # pylint: disable=abstract-method, too-many-instance-attributes
"""Base class for remote operations to set up powercycle."""
def __init__(self):
"""Initialize PowercycleCommand."""
self.expansions = yaml.safe_load(open(powercycle_constants.EXPANSIONS_FILE))
self.retries = 0 if "ssh_retries" not in self.expansions else int(
self.expansions["ssh_retries"])
self.ssh_identity = self._get_ssh_identity()
self.ssh_connection_options = self.ssh_identity + " " + self.expansions[
"ssh_connection_options"]
self.sudo = "" if self.is_windows() else "sudo"
# The username on the Windows image that powercycle uses is currently the default user.
self.user = "Administrator" if self.is_windows() else getpass.getuser()
self.user_host = self.user + "@" + self.expansions["private_ip_address"]
self.remote_op = RemoteOperations(
user_host=self.user_host,
ssh_connection_options=self.ssh_connection_options,
)
@staticmethod
def is_windows() -> bool:
""":return: True if running on Windows."""
return sys.platform == "win32" or sys.platform == "cygwin"
@staticmethod
def _call(cmd):
cmd = shlex.split(cmd)
# Use a common pipe for stdout & stderr for logging.
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
buff_stdout, _ = process.communicate()
buff = buff_stdout.decode("utf-8", "replace")
return process.poll(), buff
def _get_posix_workdir(self) -> str:
workdir = self.expansions['workdir']
if self.is_windows():
workdir = workdir.replace("\\", "/")
return workdir
def _get_ssh_identity(self) -> str:
workdir = self._get_posix_workdir()
pem_file = '/'.join([workdir, 'powercycle.pem'])
return f"-i {pem_file}"
class SetUpEC2Instance(PowercycleCommand):
"""Set up EC2 instance."""
COMMAND = "setUpEC2Instance"
def execute(self) -> None: # pylint: disable=too-many-instance-attributes, too-many-locals, too-many-statements
""":return: None."""
# First operation -
# Create remote_dir.
group_cmd = f"id -Gn {self.user}"
_, group = self._call(group_cmd)
group = group.split(" ")[0]
user_group = f"{self.user}:{group}"
remote_dir = powercycle_constants.REMOTE_DIR
db_path = powercycle_constants.DB_PATH
set_permission_stmt = f"chmod -R 777"
if self.is_windows():
set_permission_stmt = f"setfacl -s user::rwx,group::rwx,other::rwx"
cmds = f"{self.sudo} mkdir -p {remote_dir}; {self.sudo} chown -R {user_group} {remote_dir}; {set_permission_stmt} {remote_dir}; ls -ld {remote_dir}"
cmds = f"{cmds}; {self.sudo} mkdir -p {db_path}; {self.sudo} chown -R {user_group} {db_path}; {set_permission_stmt} {db_path}; ls -ld {db_path}"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
# Second operation -
# Copy buildscripts and mongoDB executables to the remote host.
files = ["etc", "buildscripts", "dist-test/bin"]
shared_libs = "dist-test/lib"
if os.path.isdir(shared_libs):
files.append(shared_libs)
self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir)
# Third operation -
# Set up virtualenv on remote.
venv = powercycle_constants.VIRTUALENV_DIR
python = "/opt/mongodbtoolchain/v3/bin/python3" if "python" not in self.expansions else self.expansions[
"python"]
cmds = f"python_loc=$(which {python})"
cmds = f"{cmds}; remote_dir={remote_dir}"
cmds = f"{cmds}; if [ \"Windows_NT\" = \"$OS\" ]; then python_loc=$(cygpath -w $python_loc); remote_dir=$(cygpath -w $remote_dir); fi"
cmds = f"{cmds}; virtualenv --python $python_loc --system-site-packages {venv}"
cmds = f"{cmds}; activate=$(find {venv} -name 'activate')"
cmds = f"{cmds}; . $activate"
cmds = f"{cmds}; pip3 install -r $remote_dir/etc/pip/powercycle-requirements.txt"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
# Fourth operation -
# Enable core dumps on non-Windows remote hosts.
# The core pattern must specify a directory, since mongod --fork will chdir("/")
# and cannot generate a core dump there (see SERVER-21635).
# We need to reboot the host for the core limits to take effect.
if not self.is_windows():
core_pattern = f"{remote_dir}/dump_%e.%p.core"
sysctl_conf = "/etc/sysctl.conf"
cmds = "ulimit -a"
cmds = f"{cmds}; echo \"{self.user} - core unlimited\" | {self.sudo} tee -a /etc/security/limits.conf"
cmds = f"{cmds}; if [ -f {sysctl_conf} ]"
cmds = f"{cmds}; then grep ^kernel.core_pattern {sysctl_conf}"
cmds = f"{cmds}; if [ $? -eq 0 ]"
cmds = f"{cmds}; then {self.sudo} sed -i \"s,kernel.core_pattern=.*,kernel.core_pattern=$core_pattern,\" {sysctl_conf}"
cmds = f"{cmds}; else echo \"kernel.core_pattern={core_pattern}\" | {self.sudo} tee -a {sysctl_conf}"
cmds = f"{cmds}; fi"
cmds = f"{cmds}; else echo Cannot change the core pattern and no core dumps will be generated."
cmds = f"{cmds}; fi"
# The following line for restarting the machine is based on
# https://unix.stackexchange.com/a/349558 in order to ensure the ssh client gets a
# response from the remote machine before it restarts.
cmds = f"{cmds}; nohup {self.sudo} reboot &>/dev/null & exit"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
# Fifth operation -
# Print the ulimit & kernel.core_pattern
if not self.is_windows():
# Always exit successfully, as this is just informational.
cmds = "uptime"
cmds = f"{cmds}; ulimit -a"
cmds = f"{cmds}; if [ -f /sbin/sysctl ]"
cmds = f"{cmds}; then /sbin/sysctl kernel.core_pattern"
cmds = f"{cmds}; fi"
self.remote_op.operation(SSHOperation.SHELL, cmds, None, True)
# Sixth operation -
# Set up curator to collect system & process stats on remote.
variant = "windows-64" if self.is_windows() else "ubuntu1604"
curator_hash = "b0c3c0fc68bce26d9572796d6bed3af4a298e30e"
curator_url = f"https://s3.amazonaws.com/boxes.10gen.com/build/curator/curator-dist-{variant}-{curator_hash}.tar.gz"
cmds = f"curl -s {curator_url} | tar -xzv"
monitor_system_file = powercycle_constants.MONITOR_SYSTEM_FILE
monitor_proc_file = powercycle_constants.MONITOR_PROC_FILE
if self.is_windows():
# Since curator runs as SYSTEM user, ensure the output files can be accessed.
cmds = f"{cmds}; touch {monitor_system_file}; chmod 777 {monitor_system_file}"
cmds = f"{cmds}; cygrunsrv --install curator_sys --path curator --chdir $HOME --args 'stat system --file {monitor_system_file}'"
cmds = f"{cmds}; touch {monitor_proc_file}; chmod 777 {monitor_proc_file}"
cmds = f"{cmds}; cygrunsrv --install curator_proc --path curator --chdir $HOME --args 'stat process-all --file {monitor_proc_file}'"
cmds = f"{cmds}; cygrunsrv --start curator_sys"
cmds = f"{cmds}; cygrunsrv --start curator_proc"
else:
cmds = f"{cmds}; touch {monitor_system_file} {monitor_proc_file}"
cmds = f"{cmds}; cmd=\"@reboot cd $HOME && {self.sudo} ./curator stat system >> {monitor_system_file}\""
cmds = f"{cmds}; (crontab -l ; echo \"$cmd\") | crontab -"
cmds = f"{cmds}; cmd=\"@reboot cd $HOME && $sudo ./curator stat process-all >> {monitor_proc_file}\""
cmds = f"{cmds}; (crontab -l ; echo \"$cmd\") | crontab -"
cmds = f"{cmds}; crontab -l"
cmds = f"{cmds}; {{ {self.sudo} $HOME/curator stat system --file {monitor_system_file} > /dev/null 2>&1 & {self.sudo} $HOME/curator stat process-all --file {monitor_proc_file} > /dev/null 2>&1 & }} & disown"
self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True)
# Seventh operation -
# Install NotMyFault, used to crash Windows.
if self.is_windows():
windows_crash_zip = powercycle_constants.WINDOWS_CRASH_ZIP
windows_crash_dl = powercycle_constants.WINDOWS_CRASH_DL
windows_crash_dir = powercycle_constants.WINDOWS_CRASH_DIR
cmds = f"curl -s -o {windows_crash_zip} {windows_crash_dl}"
cmds = f"{cmds}; unzip -q {windows_crash_zip} -d {windows_crash_dir}"
cmds = f"{cmds}; chmod +x {windows_crash_dir}/*.exe"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
class TarEC2Artifacts(PowercycleCommand):
"""Tar EC2 artifacts."""
COMMAND = "tarEC2Artifacts"
def execute(self) -> None:
""":return: None."""
if "ec2_ssh_failure" in self.expansions:
return
tar_cmd = "tar" if "tar" not in self.expansions else self.expansions["tar"]
ec2_artifacts = powercycle_constants.LOG_PATH
# On test success, we only archive mongod.log.
if self.expansions.get("exit_code", "1") != "0":
ec2_artifacts = f"{ec2_artifacts} {powercycle_constants.DB_PATH}"
ec2_artifacts = f"{ec2_artifacts} {powercycle_constants.BACKUP_ARTIFACTS}"
if self.is_windows():
ec2_artifacts = f"{ec2_artifacts} {powercycle_constants.EVENT_LOGPATH}"
cmd = f"{tar_cmd} czf ec2_artifacts.tgz {ec2_artifacts}"
self.remote_op.operation(SSHOperation.SHELL, cmd, None)
class CopyEC2Artifacts(PowercycleCommand):
"""Copy EC2 artifacts."""
COMMAND = "copyEC2Artifacts"
def execute(self) -> None:
""":return: None."""
if "ec2_ssh_failure" in self.expansions:
return
self.remote_op.operation(SSHOperation.COPY_FROM, "ec2_artifacts.tgz", None)
class GatherRemoteEventLogs(PowercycleCommand):
"""
Gather remote event logs.
The event logs on Windows are a useful diagnostic to have when determining if something bad
happened to the remote machine after it was repeatedly crashed during powercycle testing. For
example, the Application and System event logs have previously revealed that the mongod.exe
process abruptly exited due to not being able to open a file despite the process successfully
being restarted and responding to network requests.
"""
COMMAND = "gatherRemoteEventLogs"
def execute(self) -> None:
""":return: None."""
if not self.is_windows() or self.expansions.get("ec2_ssh_failure", ""):
return
event_logpath = powercycle_constants.EVENT_LOGPATH
cmds = f"mkdir -p {event_logpath}"
cmds = f"{cmds}; wevtutil qe Application /c:10000 /rd:true /f:Text > {event_logpath}/application.log"
cmds = f"{cmds}; wevtutil qe Security /c:10000 /rd:true /f:Text > {event_logpath}/security.log"
cmds = f"{cmds}; wevtutil qe System /c:10000 /rd:true /f:Text > {event_logpath}/system.log"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
class GatherRemoteMongoCoredumps(PowercycleCommand):
"""Gather Remote Mongo Coredumps."""
COMMAND = "gatherRemoteMongoCoredumps"
def execute(self) -> None:
""":return: None."""
if "ec2_ssh_failure" in self.expansions:
return
remote_dir = powercycle_constants.REMOTE_DIR
# Find all core files and move to $remote_dir
cmds = "core_files=$(/usr/bin/find -H . \\( -name '*.core' -o -name '*.mdmp' \\) 2> /dev/null)"
cmds = f"{cmds}; if [ -z \"$core_files\" ]; then exit 0; fi"
cmds = f"{cmds}; echo Found remote core files $core_files, moving to $(pwd)"
cmds = f"{cmds}; for core_file in $core_files"
cmds = f"{cmds}; do base_name=$(echo $core_file | sed 's/.*///')"
cmds = f"{cmds}; if [ ! -f $base_name ]; then mv $core_file .; fi"
cmds = f"{cmds}; done"
self.remote_op.operation(SSHOperation.SHELL, cmds, remote_dir)
class CopyRemoteMongoCoredumps(PowercycleCommand):
"""Copy Remote Mongo Coredumps."""
COMMAND = "copyRemoteMongoCoredumps"
def execute(self) -> None:
""":return: None."""
if self.expansions.get("ec2_ssh_failure", ""):
return
if self.is_windows():
core_suffix = "mdmp"
else:
core_suffix = "core"
remote_dir = powercycle_constants.REMOTE_DIR
# Core file may not exist so we ignore the return code.
self.remote_op.operation(SSHOperation.SHELL, f"{remote_dir}/*.{core_suffix}", None, True)
class CopyEC2MonitorFiles(PowercycleCommand):
"""Copy EC2 monitor files."""
COMMAND = "copyEC2MonitorFiles"
def execute(self) -> None:
""":return: None."""
tar_cmd = "tar" if "tar" not in self.expansions else self.expansions["tar"]
cmd = f"{tar_cmd} czf ec2_monitor_files.tgz {powercycle_constants.EC2_MONITOR_FILES}"
self.remote_op.operation(SSHOperation.SHELL, cmd, None)
self.remote_op.operation(SSHOperation.COPY_FROM, 'ec2_monitor_files.tgz', None)
class RunHangAnalyzerOnRemoteInstance(PowercycleCommand):
"""Run the hang-analyzer on a remote instance."""
COMMAND = "runHangAnalyzerOnRemoteInstance"
def execute(self) -> None: # pylint: disable=too-many-locals
""":return: None."""
if "private_ip_address" not in self.expansions:
return
hang_analyzer_processes = "dbtest,java,mongo,mongod,mongos,python,_test" if "hang_analyzer_processes" not in self.expansions else self.expansions[
"hang_analyzer_processes"]
hang_analyzer_option = f"-o file -o stdout -p {hang_analyzer_processes}"
hang_analyzer_dump_core = True if "hang_analyzer_dump_core" not in self.expansions else self.expansions[
"hang_analyzer_dump_core"]
if hang_analyzer_dump_core:
hang_analyzer_option = f"-c {hang_analyzer_option}"
core_ext = "core"
if self.is_windows():
core_ext = "mdmp"
remote_dir = powercycle_constants.REMOTE_DIR
files = self._call("ls")[1].split("\n")
dbg_regex = re.compile(r"(\.debug$)|(\.dSYM$)|(\.pdb$)")
debug_files = [f for f in files if dbg_regex.match(f)]
file_param = []
for debug_file in debug_files:
file_param.append(debug_file)
if file_param:
self.remote_op.operation(SSHOperation.COPY_TO, file_param, remote_dir)
# Activate virtualenv on remote host. The virtualenv bin_dir is different for Linux and
# Windows.
venv = powercycle_constants.VIRTUALENV_DIR
cmds = f"activate=$(find {venv} -name 'activate')"
cmds = f"{cmds}; . $activate"
# In the 'cmds' variable we pass to remote host, use 'python' instead of '$python' since
# we don't want to evaluate the local python variable, but instead pass the python string
# so the remote host will use the right python when the virtualenv is sourced.
cmds = f"{cmds}; cd {remote_dir}"
cmds = f"{cmds}; PATH=\"/opt/mongodbtoolchain/gdb/bin:$PATH\" python buildscripts/resmoke.py hang-analyzer {hang_analyzer_option}"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
file_param = []
file_param.append(f"{remote_dir}/debugger*.*")
file_param.append(f"{remote_dir}/*.{core_ext}")
self.remote_op.operation(SSHOperation.COPY_FROM, file_param, None)
class NoOp(Subcommand):
"""No op."""
def execute(self) -> None:
""":return: None."""
pass
class PowercyclePlugin(PluginInterface):
"""Interact with powercycle_operations."""
def add_subcommand(self, subparsers):
"""
Add 'powercycle_operations' subcommand.
:param subparsers: argparse parser to add to
:return: None
"""
subparsers.add_parser(SetUpEC2Instance.COMMAND)
subparsers.add_parser(TarEC2Artifacts.COMMAND)
subparsers.add_parser(CopyEC2Artifacts.COMMAND)
subparsers.add_parser(GatherRemoteEventLogs.COMMAND)
subparsers.add_parser(GatherRemoteMongoCoredumps.COMMAND)
subparsers.add_parser(CopyRemoteMongoCoredumps.COMMAND)
subparsers.add_parser(CopyEC2MonitorFiles.COMMAND)
subparsers.add_parser(RunHangAnalyzerOnRemoteInstance.COMMAND)
# Accept arbitrary args like 'powercycle.py undodb foobar', but ignore them.
def parse(self, subcommand, parser, parsed_args, **kwargs): # pylint: disable=too-many-return-statements
"""
Return powercycle_operation if command is one we recognize.
:param subcommand: equivalent to parsed_args.command
:param parser: parser used
:param parsed_args: output of parsing
:param kwargs: additional args
:return: None or a Subcommand
"""
# Only return subcommand if expansion file has been written.
if not os.path.exists(powercycle_constants.EXPANSIONS_FILE):
print(f"Did not find {powercycle_constants.EXPANSIONS_FILE}, skipping {subcommand}.")
return NoOp()
if subcommand == SetUpEC2Instance.COMMAND:
return SetUpEC2Instance()
elif subcommand == TarEC2Artifacts.COMMAND:
return TarEC2Artifacts()
elif subcommand == CopyEC2Artifacts.COMMAND:
return CopyEC2Artifacts()
elif subcommand == GatherRemoteEventLogs.COMMAND:
return GatherRemoteEventLogs()
elif subcommand == GatherRemoteMongoCoredumps.COMMAND:
return GatherRemoteMongoCoredumps()
elif subcommand == CopyRemoteMongoCoredumps.COMMAND:
return CopyRemoteMongoCoredumps()
elif subcommand == CopyEC2MonitorFiles.COMMAND:
return CopyEC2MonitorFiles()
elif subcommand == RunHangAnalyzerOnRemoteInstance.COMMAND:
return RunHangAnalyzerOnRemoteInstance()
else:
return None

View File

@@ -11,6 +11,10 @@ import argparse
from buildscripts.resmokelib.plugin import PluginInterface, Subcommand
from buildscripts.resmokelib.powercycle import powercycle, powercycle_config, powercycle_constants
from buildscripts.resmokelib.powercycle.remote_hang_analyzer import RunHangAnalyzerOnRemoteInstance
from buildscripts.resmokelib.powercycle.save_diagnostics import GatherRemoteEventLogs, TarEC2Artifacts, \
CopyEC2Artifacts, CopyEC2MonitorFiles, GatherRemoteMongoCoredumps, CopyRemoteMongoCoredumps
from buildscripts.resmokelib.powercycle.setup import SetUpEC2Instance
SUBCOMMAND = "powercycle"
@@ -18,6 +22,12 @@ SUBCOMMAND = "powercycle"
class Powercycle(Subcommand):
"""Main class to run powercycle subcommand."""
# Parser command Enum
RUN = 1
HOST_SETUP = 2
SAVE_DIAG = 3
REMOTE_HANG_ANALYZER = 4
def __init__(self, parser_actions, options):
"""Initialize."""
self.parser_actions = parser_actions
@@ -25,8 +35,38 @@ class Powercycle(Subcommand):
def execute(self):
"""Execute powercycle test."""
return {
self.RUN: self._exec_powercycle_main, self.HOST_SETUP: self._exec_powercycle_host_setup,
self.SAVE_DIAG: self._exec_powercycle_save_diagnostics,
self.REMOTE_HANG_ANALYZER: self._exec_powercycle_hang_analyzer
}[self.options.run_option]()
def _exec_powercycle_main(self):
powercycle.main(self.parser_actions, self.options)
@staticmethod
def _exec_powercycle_host_setup():
SetUpEC2Instance().execute()
@staticmethod
def _exec_powercycle_save_diagnostics():
# The event logs on Windows are a useful diagnostic to have when determining if something bad
# happened to the remote machine after it was repeatedly crashed during powercycle testing. For
# example, the Application and System event logs have previously revealed that the mongod.exe
# process abruptly exited due to not being able to open a file despite the process successfully
# being restarted and responding to network requests.
GatherRemoteEventLogs().execute()
TarEC2Artifacts().execute()
CopyEC2Artifacts().execute()
CopyEC2MonitorFiles().execute()
GatherRemoteMongoCoredumps().execute()
CopyRemoteMongoCoredumps().execute()
@staticmethod
def _exec_powercycle_hang_analyzer():
RunHangAnalyzerOnRemoteInstance().execute()
class PowercyclePlugin(PluginInterface):
"""Interface to parsing."""
@@ -35,9 +75,33 @@ class PowercyclePlugin(PluginInterface):
"""Initialize."""
self.parser_actions = None
@staticmethod
def _add_powercycle_commands(parent_parser):
"""Add sub-subcommands for powercycle."""
sub_parsers = parent_parser.add_subparsers(help="powercycle commands")
setup_parser = sub_parsers.add_parser("setup-host")
setup_parser.set_defaults(run_option=Powercycle.HOST_SETUP)
save_parser = sub_parsers.add_parser("save-diagnostics")
save_parser.set_defaults(run_option=Powercycle.SAVE_DIAG)
save_parser = sub_parsers.add_parser("remote-hang-analyzer")
save_parser.set_defaults(run_option=Powercycle.REMOTE_HANG_ANALYZER)
run_parser = sub_parsers.add_parser("run")
run_parser.set_defaults(run_option=Powercycle.RUN)
# Only need to return run_parser for further processing; others don't need additional args.
return run_parser
def add_subcommand(self, subparsers): # pylint: disable=too-many-statements
"""Create and add the parser for the subcommand."""
parser = subparsers.add_parser(SUBCOMMAND, help=__doc__, usage="usage")
intermediate_parser = subparsers.add_parser(
SUBCOMMAND, help=__doc__,
usage="MongoDB Powercycle tests; type one of the subcommands for more information")
parser = self._add_powercycle_commands(intermediate_parser)
test_options = parser.add_argument_group("Test Options")
mongodb_options = parser.add_argument_group("MongoDB Options")
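The sub-subcommands registered above replace the separate powercycle_operations.py entry points with a single resmoke.py interface. As a rough local-usage sketch (the subcommand names and the run flags come from this diff and the Evergreen configuration further below; the host address, key file, and task name are placeholders, and each command still expects the powercycle expansions file to be present):

# Hypothetical local sequence; values are illustrative only.
python buildscripts/resmoke.py powercycle setup-host
python buildscripts/resmoke.py powercycle run \
    --sshUserHost=ec2-user@10.0.0.5 \
    --sshConnection="-i powercycle.pem" \
    --taskName=powercycle_smoke
python buildscripts/resmoke.py powercycle save-diagnostics
python buildscripts/resmoke.py powercycle remote-hang-analyzer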

View File

@@ -0,0 +1,62 @@
"""Library functions for powercycle."""
import getpass
import shlex
import subprocess
import sys
import yaml
from buildscripts.resmokelib.plugin import Subcommand
from buildscripts.resmokelib.powercycle import powercycle_constants
from buildscripts.resmokelib.powercycle.lib.remote_operations import RemoteOperations
class PowercycleCommand(Subcommand): # pylint: disable=abstract-method, too-many-instance-attributes
"""Base class for remote operations to set up powercycle."""
def __init__(self):
"""Initialize PowercycleCommand."""
self.expansions = yaml.safe_load(open(powercycle_constants.EXPANSIONS_FILE))
self.retries = 0 if "ssh_retries" not in self.expansions else int(
self.expansions["ssh_retries"])
self.ssh_identity = self._get_ssh_identity()
self.ssh_connection_options = self.ssh_identity + " " + self.expansions[
"ssh_connection_options"]
self.sudo = "" if self.is_windows() else "sudo"
# The username on the Windows image that powercycle uses is currently the default user.
self.user = "Administrator" if self.is_windows() else getpass.getuser()
self.user_host = self.user + "@" + self.expansions["private_ip_address"]
self.remote_op = RemoteOperations(
user_host=self.user_host,
ssh_connection_options=self.ssh_connection_options,
)
@staticmethod
def is_windows() -> bool:
""":return: True if running on Windows."""
return sys.platform == "win32" or sys.platform == "cygwin"
@staticmethod
def _call(cmd):
cmd = shlex.split(cmd)
# Use a common pipe for stdout & stderr for logging.
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
buff_stdout, _ = process.communicate()
buff = buff_stdout.decode("utf-8", "replace")
return process.poll(), buff
def _get_posix_workdir(self) -> str:
workdir = self.expansions['workdir']
if self.is_windows():
workdir = workdir.replace("\\", "/")
return workdir
def _get_ssh_identity(self) -> str:
workdir = self._get_posix_workdir()
pem_file = '/'.join([workdir, 'powercycle.pem'])
return f"-i {pem_file}"

View File

@@ -31,7 +31,7 @@ import pymongo
import requests
import yaml
from buildscripts.powercycle_setup import remote_operations
from buildscripts.resmokelib.powercycle.lib import remote_operations
from buildscripts.resmokelib.powercycle import powercycle_config, powercycle_constants
# See https://docs.python.org/2/library/sys.html#sys.platform
@@ -1807,7 +1807,7 @@ def main(parser_actions, options): # pylint: disable=too-many-branches,too-many
verify_remote_access(local_ops)
# Pass client_args to the remote script invocation.
client_args = "powercycle"
client_args = "powercycle run"
options_dict = vars(options)
for action in parser_actions:
option_value = options_dict.get(action.dest, None)

View File

@@ -0,0 +1,57 @@
"""Run the hang analyzer on the remote powercycle instance."""
import os
import re
from buildscripts.resmokelib.powercycle import powercycle_constants
from buildscripts.resmokelib.powercycle.lib import PowercycleCommand
from buildscripts.resmokelib.powercycle.lib.remote_operations import SSHOperation
class RunHangAnalyzerOnRemoteInstance(PowercycleCommand):
"""Run the hang-analyzer on a remote instance."""
COMMAND = "runHangAnalyzerOnRemoteInstance"
def execute(self) -> None: # pylint: disable=too-many-locals
""":return: None."""
if "private_ip_address" not in self.expansions:
return
hang_analyzer_processes = "dbtest,java,mongo,mongod,mongos,python,_test" if "hang_analyzer_processes" not in self.expansions else self.expansions[
"hang_analyzer_processes"]
hang_analyzer_option = f"-o file -o stdout -p {hang_analyzer_processes}"
hang_analyzer_dump_core = True if "hang_analyzer_dump_core" not in self.expansions else self.expansions[
"hang_analyzer_dump_core"]
if hang_analyzer_dump_core:
hang_analyzer_option = f"-c {hang_analyzer_option}"
core_ext = "core"
if self.is_windows():
core_ext = "mdmp"
remote_dir = powercycle_constants.REMOTE_DIR
files = self._call("ls")[1].split("\n")
dbg_regex = re.compile(r"(\.debug$)|(\.dSYM$)|(\.pdb$)")
debug_files = [f for f in files if dbg_regex.match(f)]
file_param = []
for debug_file in debug_files:
file_param.append(debug_file)
if file_param:
self.remote_op.operation(SSHOperation.COPY_TO, file_param, remote_dir)
# Activate virtualenv on remote host. The virtualenv bin_dir is different for Linux and
# Windows.
venv = powercycle_constants.VIRTUALENV_DIR
cmds = f"activate=$(find {venv} -name 'activate')"
cmds = f"{cmds}; . $activate"
# In the 'cmds' variable we pass to remote host, use 'python' instead of '$python' since
# we don't want to evaluate the local python variable, but instead pass the python string
# so the remote host will use the right python when the virtualenv is sourced.
cmds = f"{cmds}; cd {remote_dir}"
cmds = f"{cmds}; PATH=\"/opt/mongodbtoolchain/gdb/bin:$PATH\" python buildscripts/resmoke.py hang-analyzer {hang_analyzer_option}"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
file_param = []
file_param.append(f"{remote_dir}/debugger*.*")
file_param.append(f"{remote_dir}/*.{core_ext}")
self.remote_op.operation(SSHOperation.COPY_FROM, file_param, None)

View File

@@ -0,0 +1,125 @@
"""Save various diagnostics info from the remote powercycle instance."""
from buildscripts.resmokelib.powercycle import powercycle_constants
from buildscripts.resmokelib.powercycle.lib import PowercycleCommand
from buildscripts.resmokelib.powercycle.lib.remote_operations import SSHOperation
class TarEC2Artifacts(PowercycleCommand):
"""Tar EC2 artifacts."""
COMMAND = "tarEC2Artifacts"
def execute(self) -> None:
""":return: None."""
if "ec2_ssh_failure" in self.expansions:
return
tar_cmd = "tar" if "tar" not in self.expansions else self.expansions["tar"]
ec2_artifacts = powercycle_constants.LOG_PATH
# On test success, we only archive mongod.log.
if self.expansions.get("exit_code", "1") != "0":
ec2_artifacts = f"{ec2_artifacts} {powercycle_constants.DB_PATH}"
ec2_artifacts = f"{ec2_artifacts} {powercycle_constants.BACKUP_ARTIFACTS}"
if self.is_windows():
ec2_artifacts = f"{ec2_artifacts} {powercycle_constants.EVENT_LOGPATH}"
cmd = f"{tar_cmd} czf ec2_artifacts.tgz {ec2_artifacts}"
self.remote_op.operation(SSHOperation.SHELL, cmd, None)
class CopyEC2Artifacts(PowercycleCommand):
"""Copy EC2 artifacts."""
COMMAND = "copyEC2Artifacts"
def execute(self) -> None:
""":return: None."""
if "ec2_ssh_failure" in self.expansions:
return
self.remote_op.operation(SSHOperation.COPY_FROM, "ec2_artifacts.tgz", None)
class GatherRemoteEventLogs(PowercycleCommand):
"""
Gather remote event logs.
The event logs on Windows are a useful diagnostic to have when determining if something bad
happened to the remote machine after it was repeatedly crashed during powercycle testing. For
example, the Application and System event logs have previously revealed that the mongod.exe
process abruptly exited due to not being able to open a file despite the process successfully
being restarted and responding to network requests.
"""
COMMAND = "gatherRemoteEventLogs"
def execute(self) -> None:
""":return: None."""
if not self.is_windows() or self.expansions.get("ec2_ssh_failure", ""):
return
event_logpath = powercycle_constants.EVENT_LOGPATH
cmds = f"mkdir -p {event_logpath}"
cmds = f"{cmds}; wevtutil qe Application /c:10000 /rd:true /f:Text > {event_logpath}/application.log"
cmds = f"{cmds}; wevtutil qe Security /c:10000 /rd:true /f:Text > {event_logpath}/security.log"
cmds = f"{cmds}; wevtutil qe System /c:10000 /rd:true /f:Text > {event_logpath}/system.log"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
class GatherRemoteMongoCoredumps(PowercycleCommand):
"""Gather Remote Mongo Coredumps."""
COMMAND = "gatherRemoteMongoCoredumps"
def execute(self) -> None:
""":return: None."""
if "ec2_ssh_failure" in self.expansions:
return
remote_dir = powercycle_constants.REMOTE_DIR
# Find all core files and move to $remote_dir
cmds = "core_files=$(/usr/bin/find -H . \\( -name '*.core' -o -name '*.mdmp' \\) 2> /dev/null)"
cmds = f"{cmds}; if [ -z \"$core_files\" ]; then exit 0; fi"
cmds = f"{cmds}; echo Found remote core files $core_files, moving to $(pwd)"
cmds = f"{cmds}; for core_file in $core_files"
cmds = f"{cmds}; do base_name=$(echo $core_file | sed 's/.*///')"
cmds = f"{cmds}; if [ ! -f $base_name ]; then mv $core_file .; fi"
cmds = f"{cmds}; done"
self.remote_op.operation(SSHOperation.SHELL, cmds, remote_dir)
class CopyRemoteMongoCoredumps(PowercycleCommand):
"""Copy Remote Mongo Coredumps."""
COMMAND = "copyRemoteMongoCoredumps"
def execute(self) -> None:
""":return: None."""
if self.expansions.get("ec2_ssh_failure", ""):
return
if self.is_windows():
core_suffix = "mdmp"
else:
core_suffix = "core"
remote_dir = powercycle_constants.REMOTE_DIR
# Core file may not exist so we ignore the return code.
self.remote_op.operation(SSHOperation.SHELL, f"{remote_dir}/*.{core_suffix}", None, True)
class CopyEC2MonitorFiles(PowercycleCommand):
"""Copy EC2 monitor files."""
COMMAND = "copyEC2MonitorFiles"
def execute(self) -> None:
""":return: None."""
tar_cmd = "tar" if "tar" not in self.expansions else self.expansions["tar"]
cmd = f"{tar_cmd} czf ec2_monitor_files.tgz {powercycle_constants.EC2_MONITOR_FILES}"
self.remote_op.operation(SSHOperation.SHELL, cmd, None)
self.remote_op.operation(SSHOperation.COPY_FROM, 'ec2_monitor_files.tgz', None)

View File

@@ -0,0 +1,135 @@
"""setup the remote host for powercycle."""
import os
from buildscripts.resmokelib.powercycle.lib import PowercycleCommand
from buildscripts.resmokelib.powercycle import powercycle_constants
from buildscripts.resmokelib.powercycle.lib.remote_operations import SSHOperation
class SetUpEC2Instance(PowercycleCommand):
"""Set up EC2 instance."""
COMMAND = "setUpEC2Instance"
def execute(self) -> None: # pylint: disable=too-many-instance-attributes, too-many-locals, too-many-statements
""":return: None."""
# First operation -
# Create remote_dir.
group_cmd = f"id -Gn {self.user}"
_, group = self._call(group_cmd)
group = group.split(" ")[0]
user_group = f"{self.user}:{group}"
remote_dir = powercycle_constants.REMOTE_DIR
db_path = powercycle_constants.DB_PATH
set_permission_stmt = f"chmod -R 777"
if self.is_windows():
set_permission_stmt = f"setfacl -s user::rwx,group::rwx,other::rwx"
cmds = f"{self.sudo} mkdir -p {remote_dir}; {self.sudo} chown -R {user_group} {remote_dir}; {set_permission_stmt} {remote_dir}; ls -ld {remote_dir}"
cmds = f"{cmds}; {self.sudo} mkdir -p {db_path}; {self.sudo} chown -R {user_group} {db_path}; {set_permission_stmt} {db_path}; ls -ld {db_path}"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
# Second operation -
# Copy buildscripts and mongoDB executables to the remote host.
files = ["etc", "buildscripts", "dist-test/bin"]
shared_libs = "dist-test/lib"
if os.path.isdir(shared_libs):
files.append(shared_libs)
self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir)
# Third operation -
# Set up virtualenv on remote.
venv = powercycle_constants.VIRTUALENV_DIR
python = "/opt/mongodbtoolchain/v3/bin/python3" if "python" not in self.expansions else self.expansions[
"python"]
cmds = f"python_loc=$(which {python})"
cmds = f"{cmds}; remote_dir={remote_dir}"
cmds = f"{cmds}; if [ \"Windows_NT\" = \"$OS\" ]; then python_loc=$(cygpath -w $python_loc); remote_dir=$(cygpath -w $remote_dir); fi"
cmds = f"{cmds}; virtualenv --python $python_loc --system-site-packages {venv}"
cmds = f"{cmds}; activate=$(find {venv} -name 'activate')"
cmds = f"{cmds}; . $activate"
cmds = f"{cmds}; pip3 install -r $remote_dir/etc/pip/powercycle-requirements.txt"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
# Fourth operation -
# Enable core dumps on non-Windows remote hosts.
# The core pattern must specify a directory, since mongod --fork will chdir("/")
# and cannot generate a core dump there (see SERVER-21635).
# We need to reboot the host for the core limits to take effect.
if not self.is_windows():
core_pattern = f"{remote_dir}/dump_%e.%p.core"
sysctl_conf = "/etc/sysctl.conf"
cmds = "ulimit -a"
cmds = f"{cmds}; echo \"{self.user} - core unlimited\" | {self.sudo} tee -a /etc/security/limits.conf"
cmds = f"{cmds}; if [ -f {sysctl_conf} ]"
cmds = f"{cmds}; then grep ^kernel.core_pattern {sysctl_conf}"
cmds = f"{cmds}; if [ $? -eq 0 ]"
cmds = f"{cmds}; then {self.sudo} sed -i \"s,kernel.core_pattern=.*,kernel.core_pattern=$core_pattern,\" {sysctl_conf}"
cmds = f"{cmds}; else echo \"kernel.core_pattern={core_pattern}\" | {self.sudo} tee -a {sysctl_conf}"
cmds = f"{cmds}; fi"
cmds = f"{cmds}; else echo Cannot change the core pattern and no core dumps will be generated."
cmds = f"{cmds}; fi"
# The following line for restarting the machine is based on
# https://unix.stackexchange.com/a/349558 in order to ensure the ssh client gets a
# response from the remote machine before it restarts.
cmds = f"{cmds}; nohup {self.sudo} reboot &>/dev/null & exit"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)
# Fifth operation -
# Print the ulimit & kernel.core_pattern
if not self.is_windows():
# Always exit successfully, as this is just informational.
cmds = "uptime"
cmds = f"{cmds}; ulimit -a"
cmds = f"{cmds}; if [ -f /sbin/sysctl ]"
cmds = f"{cmds}; then /sbin/sysctl kernel.core_pattern"
cmds = f"{cmds}; fi"
self.remote_op.operation(SSHOperation.SHELL, cmds, None, True)
# Sixth operation -
# Set up curator to collect system & process stats on remote.
variant = "windows-64" if self.is_windows() else "ubuntu1604"
curator_hash = "b0c3c0fc68bce26d9572796d6bed3af4a298e30e"
curator_url = f"https://s3.amazonaws.com/boxes.10gen.com/build/curator/curator-dist-{variant}-{curator_hash}.tar.gz"
cmds = f"curl -s {curator_url} | tar -xzv"
monitor_system_file = powercycle_constants.MONITOR_SYSTEM_FILE
monitor_proc_file = powercycle_constants.MONITOR_PROC_FILE
if self.is_windows():
# Since curator runs as SYSTEM user, ensure the output files can be accessed.
cmds = f"{cmds}; touch {monitor_system_file}; chmod 777 {monitor_system_file}"
cmds = f"{cmds}; cygrunsrv --install curator_sys --path curator --chdir $HOME --args 'stat system --file {monitor_system_file}'"
cmds = f"{cmds}; touch {monitor_proc_file}; chmod 777 {monitor_proc_file}"
cmds = f"{cmds}; cygrunsrv --install curator_proc --path curator --chdir $HOME --args 'stat process-all --file {monitor_proc_file}'"
cmds = f"{cmds}; cygrunsrv --start curator_sys"
cmds = f"{cmds}; cygrunsrv --start curator_proc"
else:
cmds = f"{cmds}; touch {monitor_system_file} {monitor_proc_file}"
cmds = f"{cmds}; cmd=\"@reboot cd $HOME && {self.sudo} ./curator stat system >> {monitor_system_file}\""
cmds = f"{cmds}; (crontab -l ; echo \"$cmd\") | crontab -"
cmds = f"{cmds}; cmd=\"@reboot cd $HOME && $sudo ./curator stat process-all >> {monitor_proc_file}\""
cmds = f"{cmds}; (crontab -l ; echo \"$cmd\") | crontab -"
cmds = f"{cmds}; crontab -l"
cmds = f"{cmds}; {{ {self.sudo} $HOME/curator stat system --file {monitor_system_file} > /dev/null 2>&1 & {self.sudo} $HOME/curator stat process-all --file {monitor_proc_file} > /dev/null 2>&1 & }} & disown"
self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True)
# Seventh operation -
# Install NotMyFault, used to crash Windows.
if self.is_windows():
windows_crash_zip = powercycle_constants.WINDOWS_CRASH_ZIP
windows_crash_dl = powercycle_constants.WINDOWS_CRASH_DL
windows_crash_dir = powercycle_constants.WINDOWS_CRASH_DIR
cmds = f"curl -s -o {windows_crash_zip} {windows_crash_dl}"
cmds = f"{cmds}; unzip -q {windows_crash_zip} -d {windows_crash_dir}"
cmds = f"{cmds}; chmod +x {windows_crash_dir}/*.exe"
self.remote_op.operation(SSHOperation.SHELL, cmds, None)

View File

@@ -2373,7 +2373,7 @@ functions:
# Set an exit trap so we can save the real exit status (see SERVER-34033).
trap 'echo $? > error_exit.txt; exit 0' EXIT
set +o errexit
eval $python -u buildscripts/resmoke.py powercycle \
eval $python -u buildscripts/resmoke.py powercycle run \
"--sshUserHost=$(printf "%s@%s" "$user" "${private_ip_address}") \
--sshConnection=\"-i ${private_key_file} ${ssh_connection_options}\" \
--taskName=${task_name}"
@@ -2631,10 +2631,10 @@ functions:
set -o errexit
${activate_virtualenv}
$python buildscripts/powercycle_operations.py setUpEC2Instance
$python buildscripts/resmoke.py powercycle setup-host
### Process & archive remote EC2 artifacts ###
"tar EC2 artifacts": &tar_ec2_artifacts
"save powercycle artifacts": &save_powercycle_artifacts
command: shell.exec
params:
shell: bash
@@ -2647,79 +2647,7 @@ functions:
fi
${activate_virtualenv}
$python buildscripts/powercycle_operations.py tarEC2Artifacts
"copy EC2 artifacts": &copy_ec2_artifacts
command: shell.exec
params:
shell: bash
working_dir: src
script: |
if [ ! -f powercycle_ip_address.yml ]; then
exit 0
fi
${activate_virtualenv}
$python buildscripts/powercycle_operations.py copyEC2Artifacts
"copy ec2 monitor files": &copy_ec2_monitor_files
command: shell.exec
params:
shell: bash
working_dir: src
script: |
set -o verbose
if [ ! -f powercycle_ip_address.yml ]; then
exit 0
fi
${activate_virtualenv}
$python buildscripts/powercycle_operations.py copyEC2MonitorFiles
# The event logs on Windows are a useful diagnostic to have when determining if something bad
# happened to the remote machine after it was repeatedly crashed during powercycle testing. For
# example, the Application and System event logs have previously revealed that the mongod.exe
# process abruptly exited due to not being able to open a file despite the process successfully
# being restarted and responding to network requests.
"gather remote event logs": &gather_remote_event_logs
command: shell.exec
params:
shell: bash
working_dir: src
script: |
if [ ! -f powercycle_ip_address.yml ]; then
exit 0
fi
${activate_virtualenv}
$python buildscripts/powercycle_operations.py gatherRemoteEventLogs
"gather remote mongo coredumps": &gather_remote_mongo_coredumps
command: shell.exec
params:
shell: bash
working_dir: "src"
script: |
if [ ! -f powercycle_ip_address.yml ]; then
exit 0
fi
${activate_virtualenv}
$python buildscripts/powercycle_operations.py gatherRemoteMongoCoredumps
"copy remote mongo coredumps": &copy_remote_mongo_coredumps
command: shell.exec
params:
shell: bash
working_dir: "src"
script: |
if [ ! -f powercycle_ip_address.yml ]; then
exit 0
fi
${activate_virtualenv}
$python buildscripts/powercycle_operations.py copyRemoteMongoCoredumps
$python buildscripts/resmoke.py powercycle save-diagnostics
"archive remote EC2 artifacts": &archive_remote_ec2_artifacts
command: s3.put
@@ -2748,12 +2676,7 @@ functions:
optional: true
"save ec2 task artifacts":
- *gather_remote_event_logs
- *tar_ec2_artifacts
- *copy_ec2_artifacts
- *copy_ec2_monitor_files
- *gather_remote_mongo_coredumps
- *copy_remote_mongo_coredumps
- *save_powercycle_artifacts
- *archive_remote_ec2_artifacts
- *archive_remote_ec2_monitor_files
@@ -3105,7 +3028,7 @@ functions:
# Call hang analyzer for tasks that are running remote mongo processes
if [ -n "${private_ip_address}" ]; then
$python buildscripts/powercycle_operations.py runHangAnalyzerOnRemoteInstance
$python buildscripts/resmoke.py powercycle remote-hang-analyzer
fi
"wait for resmoke to shutdown":