0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-11-24 00:17:37 +01:00
mongodb/buildscripts/evergreen_resmoke_job_count.py
Ryan Berryhill d3bfe8c0ec SERVER-86578 Enable TSAN all feature flags variant (#25909)
GitOrigin-RevId: 040a4b10d87a85c2cd17119373aa9636eca36d57
2024-08-19 20:38:06 +00:00

232 lines
8.8 KiB
Python

#!/usr/bin/env python3
"""Determine the number of resmoke jobs to run."""
import argparse
import logging
import platform
import re
import sys
from collections import defaultdict
import psutil
import structlog
import yaml
LOGGER = structlog.get_logger(__name__)

# psutil.cpu_count() returns None when the number of CPUs cannot be determined. Fall back to a
# single CPU in that case so arithmetic and comparisons on CPU_COUNT never operate on None.
CPU_COUNT = psutil.cpu_count() or 1
# Machine type and platform identifier of the host this script is running on.
PLATFORM_MACHINE = platform.machine()
SYS_PLATFORM = sys.platform
# The following constants define tasks that should override the resmoke jobs in various
# configurations. The factor value will set the max number of resmoke jobs based on the number
# of CPUs a machine has. For example, if the factor is 0.5 and a machine has 8 CPUs, the max resmoke
# jobs would be 4 (8 * 0.5). If the running task has multiple overrides that apply, the lowest
# value will be used.
#
# The task name is specified as a regex. The task name used will be the task executing the test,
# which means if the task has been split to run in sub-tasks, an extra "_0", "_1", ... will be
# appended to the task name. For this reason, most task names should end with a ".*".
# Apply factor for a task based on the build variant it is running on.
VARIANT_TASK_FACTOR_OVERRIDES = {
    "enterprise-rhel-8-64-bit": [
        {"task": r"logical_session_cache_replication.*", "factor": 0.75},
    ],
    "enterprise-rhel-8-64-bit-inmem": [
        {"task": "secondary_reads_passthrough", "factor": 0.3},
        {"task": "multi_stmt_txn_jscore_passthrough_with_migration", "factor": 0.3},
    ],
    # TSAN variants occasionally run out of memory with the default resmoke_jobs_factor, so
    # these suites get a lower factor to reduce memory pressure. Non-TSAN variants keep a
    # reasonable free memory margin and do not need the adjustment.
    "enterprise-rhel8-debug-tsan": [
        {
            "task": r"fcv_upgrade_downgrade_sharded_collections_jscore_passthrough.*",
            "factor": 0.27,
        },
        {"task": r"shard.*uninitialized_fcv_jscore_passthrough.*", "factor": 0.125},
    ],
    # Same memory-pressure reduction as the TSAN variant above; the all feature flags variant
    # sometimes needs more aggressive reductions than the no feature flags variant.
    "enterprise-rhel8-debug-tsan-all-feature-flags": [
        {
            "task": r"fcv_upgrade_downgrade_sharded_collections_jscore_passthrough.*",
            "factor": 0.125,
        },
        {
            "task": r"fcv_upgrade_downgrade_replica_sets_jscore_passthrough.*",
            "factor": 0.27,
        },
        {"task": r"shard.*uninitialized_fcv_jscore_passthrough.*", "factor": 0.125},
    ],
    "rhel8-debug-aubsan-classic-engine": [
        {"task": r"shard.*uninitialized_fcv_jscore_passthrough.*", "factor": 0.25},
    ],
    "rhel8-debug-aubsan-all-feature-flags": [
        {"task": r"shard.*uninitialized_fcv_jscore_passthrough.*", "factor": 0.25},
    ],
    # Each Windows variant carries its own (separate) list so entries stay independent.
    "enterprise-windows-all-feature-flags-required": [
        {"task": "noPassthrough", "factor": 0.5},
    ],
    "enterprise-windows-all-feature-flags-non-essential": [
        {"task": "noPassthrough", "factor": 0.5},
    ],
    "enterprise-windows": [
        {"task": "noPassthrough", "factor": 0.5},
    ],
    "windows-debug-suggested": [
        {"task": "noPassthrough", "factor": 0.5},
    ],
    "windows": [
        {"task": "noPassthrough", "factor": 0.5},
    ],
}
# Overrides shared by every machine type / platform that references this list below.
TASKS_FACTORS = [
    {"task": r"replica_sets.*", "factor": 0.5},
    {"task": r"sharding.*", "factor": 0.5},
]

# Multiplier applied to the final factor, keyed by distro name.
DISTRO_MULTIPLIERS = {
    "rhel8.8-large": 1.618,
}

# Apply factor for a task based on the machine type it is running on.
MACHINE_TASK_FACTOR_OVERRIDES = {
    "aarch64": TASKS_FACTORS,
    "ppc64le": [
        {
            "task": r"causally_consistent_hedged_reads_jscore_passthrough.*",
            "factor": 0.125,
        },
        {
            "task": r"causally_consistent_read_concern_snapshot_passthrough.*",
            "factor": 0.125,
        },
        {
            "task": r"sharded_causally_consistent_read_concern_snapshot_passthrough.*",
            "factor": 0.125,
        },
    ],
}

# Apply factor for a task based on the platform it is running on.
PLATFORM_TASK_FACTOR_OVERRIDES = {
    "win32": TASKS_FACTORS,
    "cygwin": TASKS_FACTORS,
}
# Apply factor for a task everywhere it is run.
# Maps a task-name regex to the factor used for tasks whose name matches it.
GLOBAL_TASK_FACTOR_OVERRIDES = dict(
    [
        (r"causally_consistent_hedged_reads_jscore_passthrough.*", 0.25),
        (r"logical_session_cache.*_refresh_jscore_passthrough.*", 0.25),
        (r"multi_shard_.*multi_stmt_txn_.*jscore_passthrough.*", 0.125),
        (r"replica_sets_reconfig_jscore_passthrough.*", 0.25),
        (r"replica_sets_reconfig_jscore_stepdown_passthrough.*", 0.25),
        (r"replica_sets_reconfig_kill_primary_jscore_passthrough.*", 0.25),
        (r"sharded_causally_consistent_jscore_passthrough.*", 0.75),
        (r"sharded_collections_jscore_passthrough.*", 0.75),
        (r"shard.*uninitialized_fcv_jscore_passthrough.*", 0.25),
    ]
)
def global_task_factor(task_name, overrides, factor):
    """
    Look up the factor for a task in a regex-keyed override table.

    :param task_name: Name of the task being executed.
    :param overrides: Mapping of task-name regex to the factor to use on a match.
    :param factor: Factor to return when no regex matches.
    :return: Factor of the first regex (in mapping order) that matches the start of
        `task_name`, or `factor` when none match.
    """
    # The generator is lazy, so patterns after the first hit are never evaluated.
    matched_factors = (
        override_factor
        for pattern, override_factor in overrides.items()
        if re.match(pattern, task_name)
    )
    return next(matched_factors, factor)
def get_task_factor(task_name, overrides, override_type, factor):
    """
    Look up the factor for a task within one category of overrides.

    :param task_name: Name of the task being executed.
    :param overrides: Mapping of category key to a list of {"task": regex, "factor": value}.
    :param override_type: Category key to look up in `overrides`.
    :param factor: Factor to return when the key is absent or no entry matches.
    :return: Factor of the first entry whose regex matches the start of `task_name`,
        otherwise `factor`.
    """
    entries = overrides.get(override_type, [])
    for entry in entries:
        if re.match(entry["task"], task_name) is None:
            continue
        return entry["factor"]
    return factor
def determine_final_multiplier(distro):
    """
    Return the multiplier configured for a distro.

    :param distro: Name of the distro the task is running on.
    :return: The distro's entry in DISTRO_MULTIPLIERS, or 1 when it has none.
    """
    return DISTRO_MULTIPLIERS.get(distro, 1)
def determine_factor(task_name, variant, distro, factor):
    """
    Compute the job factor for a task.

    The machine, platform, variant and global override tables are each consulted; the
    smallest resulting factor is scaled by the distro multiplier.

    :param task_name: Name of the task being executed.
    :param variant: Build variant the task is running on.
    :param distro: Distro the task is running on.
    :param factor: Factor used for any table that has no matching override.
    :return: Lowest applicable factor multiplied by the distro multiplier.
    """
    keyed_tables = (
        (MACHINE_TASK_FACTOR_OVERRIDES, PLATFORM_MACHINE),
        (PLATFORM_TASK_FACTOR_OVERRIDES, SYS_PLATFORM),
        (VARIANT_TASK_FACTOR_OVERRIDES, variant),
    )
    candidates = [
        get_task_factor(task_name, table, key, factor) for table, key in keyed_tables
    ]
    candidates.append(global_task_factor(task_name, GLOBAL_TASK_FACTOR_OVERRIDES, factor))

    lowest = min(candidates)
    return lowest * determine_final_multiplier(distro)
def determine_jobs(task_name, variant, distro, jobs_max=0, job_factor=1.0):
    """
    Determine the resmoke jobs.

    :param task_name: Name of the task being executed.
    :param variant: Build variant the task is running on.
    :param distro: Distro the task is running on.
    :param jobs_max: Upper bound on the number of jobs; 0 means no upper bound.
    :param job_factor: Default multiplier applied to the CPU count when no override applies.
    :return: Number of resmoke jobs to use; always at least 1.
    :raises ValueError: If jobs_max is negative or job_factor is not positive.
    """
    if jobs_max < 0:
        raise ValueError("The jobs_max must be >= 0.")
    if job_factor <= 0:
        raise ValueError("The job_factor must be > 0.")

    factor = determine_factor(task_name, variant, distro, job_factor)
    # round() can yield 0 on small machines with aggressive factors (e.g. 4 CPUs * 0.125 = 0.5,
    # which rounds to 0). Always keep at least one job, regardless of whether jobs_max is set.
    jobs_available = max(1, int(round(CPU_COUNT * factor)))
    if jobs_max == 0:
        return jobs_available
    return min(jobs_max, jobs_available)
def output_jobs(jobs, outfile):
    """
    Emit the resmoke jobs configuration as YAML.

    :param jobs: Number of resmoke jobs to record under the "resmoke_jobs" key.
    :param outfile: Path of a file to write the YAML to; a falsy value skips the file.
        The YAML is always written to stdout as well.
    """
    # Serialize once and send the same text to both destinations.
    rendered = yaml.dump({"resmoke_jobs": jobs}, default_flow_style=False)

    if outfile:
        with open(outfile, "w") as out_fh:
            out_fh.write(rendered)

    sys.stdout.write(rendered)
def main():
    """Determine the resmoke jobs value a task should use in Evergreen."""
    # NOTE: the docstring above doubles as the --help description, so it is runtime text.
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument("--taskName", dest="task", required=True, help="Task being executed.")
    parser.add_argument(
        "--buildVariant",
        dest="variant",
        required=True,
        help="Build variant task is being executed on.",
    )
    parser.add_argument(
        "--distro", dest="distro", required=True, help="Distro task is being executed on."
    )
    parser.add_argument(
        "--jobFactor",
        dest="jobs_factor",
        type=float,
        default=1.0,
        help=(
            "Job factor to use as a multiplier with the number of CPUs. Defaults"
            " to %(default)s."
        ),
    )
    parser.add_argument(
        "--jobsMax",
        dest="jobs_max",
        type=int,
        default=0,
        help=(
            "Maximum number of jobs to use. Specify 0 to indicate the number of"
            " jobs is determined by --jobFactor and the number of CPUs. Defaults"
            " to %(default)s."
        ),
    )
    parser.add_argument(
        "--outFile",
        dest="outfile",
        help="File to write configuration to. If unspecified no file is generated.",
    )
    options = parser.parse_args()

    # Route structlog through the stdlib logger so records end up on stdout.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    structlog.configure(logger_factory=structlog.stdlib.LoggerFactory())

    LOGGER.info(
        "Finding job count",
        task=options.task,
        variant=options.variant,
        platform=PLATFORM_MACHINE,
        sys=SYS_PLATFORM,
        cpu_count=CPU_COUNT,
    )

    jobs = determine_jobs(
        options.task, options.variant, options.distro, options.jobs_max, options.jobs_factor
    )
    if jobs < CPU_COUNT:
        print(f"Reducing number of jobs to run from {CPU_COUNT} to {jobs}")

    output_jobs(jobs, options.outfile)


if __name__ == "__main__":
    main()