0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-11-24 00:17:37 +01:00
mongodb/buildscripts/evergreen_task_timeout.py
Juan Gu 855dfadef0 SERVER-94077 Use isort in Ruff configs (#27865)
GitOrigin-RevId: e793d662774ccd3ab6c3f356c2287cf1f7ff9805
2024-10-10 19:33:49 +00:00

516 lines
18 KiB
Python
Executable File

#!/usr/bin/env python3
"""Determine the timeout value a task should use in evergreen."""
from __future__ import annotations
import argparse
import math
import os
import shlex
import sys
from datetime import timedelta
from pathlib import Path
from typing import Dict, List, Optional
import inject
import structlog
import yaml
from pydantic import BaseModel
from buildscripts.ciconfig.evergreen import EvergreenProjectConfig, parse_evergreen_file
from buildscripts.resmoke_proxy.resmoke_proxy import ResmokeProxyService
from buildscripts.timeouts.timeout_service import TimeoutParams, TimeoutService
from buildscripts.util.cmdutils import enable_logging
from buildscripts.util.taskname import determine_task_base_name
from evergreen import EvergreenApi, RetryingEvergreenApi
# Module-wide structured logger.
LOGGER = structlog.get_logger(__name__)

# Default values for the --timeout-overrides, --evg-project-config and --evg-api-config
# command line options (see main()); each path is passed through os.path.expanduser there.
DEFAULT_TIMEOUT_OVERRIDES = "etc/evergreen_timeouts.yml"
DEFAULT_EVERGREEN_CONFIG = "etc/evergreen.yml"
DEFAULT_EVERGREEN_AUTH_CONFIG = "~/.evergreen.yml"

# Alias value that determine_exec_timeout() compares against to apply COMMIT_QUEUE_TIMEOUT.
COMMIT_QUEUE_ALIAS = "__commit_queue"

# Suites for which determine_historic_timeout() skips the historic lookup and returns an
# override with no exec/idle timeout set.
IGNORED_SUITES = {
"integration_tests_replset",
"integration_tests_replset_ssl_auth",
"integration_tests_sharded",
"integration_tests_standalone",
"integration_tests_standalone_audit",
"mongos_test",
"server_selection_json_test",
"sdam_json_test",
}

# NOTE(review): not referenced anywhere in this file as shown; presumably the window of
# history considered when estimating timeouts -- confirm before relying on it.
HISTORY_LOOKBACK = timedelta(weeks=2)

# Exec timeout used for commit-queue runs when no higher-priority rule applies.
COMMIT_QUEUE_TIMEOUT = timedelta(minutes=20)
# Upper bound applied to required build variants whose timeout would otherwise exceed it.
DEFAULT_REQUIRED_BUILD_TIMEOUT = timedelta(hours=1, minutes=20)
# Starting exec timeout when there is no historic timeout to use.
DEFAULT_NON_REQUIRED_BUILD_TIMEOUT = timedelta(hours=2)
class TimeoutOverride(BaseModel):
    """
    Specification for overriding a task timeout.

    * task: Name of task to override.
    * exec_timeout: Value to override exec timeout with, in whole minutes.
    * idle_timeout: Value to override idle timeout with, in whole minutes.
    """

    task: str
    exec_timeout: Optional[int] = None
    idle_timeout: Optional[int] = None

    @classmethod
    def from_seconds(
        cls, task: str, exec_timeout_secs: Optional[float], idle_timeout_secs: Optional[float]
    ) -> TimeoutOverride:
        """
        Create an instance of an override from seconds.

        The model stores whole minutes, so each value is rounded *up* to the next minute.
        Passing the raw quotient would hand a fractional float to an integer field, which
        either shortens the timeout by truncation or fails validation, depending on how the
        installed pydantic version coerces floats.

        :param task: Name of the task the override applies to.
        :param exec_timeout_secs: Exec timeout in seconds; None or 0 leaves it unset.
        :param idle_timeout_secs: Idle timeout in seconds; None or 0 leaves it unset.
        :return: Override holding the timeouts in minutes.
        """
        exec_timeout = math.ceil(exec_timeout_secs / 60) if exec_timeout_secs else None
        idle_timeout = math.ceil(idle_timeout_secs / 60) if idle_timeout_secs else None
        return cls(
            task=task,
            exec_timeout=exec_timeout,
            idle_timeout=idle_timeout,
        )

    def get_exec_timeout(self) -> Optional[timedelta]:
        """
        Get a timedelta of the exec timeout to use.

        :return: Exec timeout, or None when no exec timeout is set.
        """
        if self.exec_timeout is not None:
            return timedelta(minutes=self.exec_timeout)
        return None

    def get_idle_timeout(self) -> Optional[timedelta]:
        """
        Get a timedelta of the idle timeout to use.

        :return: Idle timeout, or None when no idle timeout is set.
        """
        if self.idle_timeout is not None:
            return timedelta(minutes=self.idle_timeout)
        return None
class TimeoutOverrides(BaseModel):
    """Timeout overrides to apply, grouped by build variant."""

    # Build variant name -> overrides declared for tasks on that variant.
    overrides: Dict[str, List[TimeoutOverride]]

    @classmethod
    def from_yaml_file(cls, file_path: Path) -> "TimeoutOverrides":
        """
        Load the timeout overrides stored in a YAML file.

        :param file_path: Location of the YAML file to read.
        :return: Overrides built from the top-level mapping of the file.
        """
        with open(file_path) as yaml_stream:
            raw_config = yaml.safe_load(yaml_stream)
        return cls(**raw_config)

    def _lookup_override(self, build_variant: str, task_name: str) -> Optional[TimeoutOverride]:
        """
        Find the override defined for a task on a build variant, if any.

        :param build_variant: Build Variant to check.
        :param task_name: Task name to check.
        :return: The single matching override, or None when there is no match.
        :raises ValueError: If more than one override matches the task.
        """
        candidates = self.overrides.get(build_variant, [])
        matches = [candidate for candidate in candidates if candidate.task == task_name]

        if not matches:
            return None
        if len(matches) == 1:
            return matches[0]

        # Ambiguous configuration: report every conflicting entry before failing.
        LOGGER.error(
            "Found multiple overrides for the same task",
            build_variant=build_variant,
            task=task_name,
            overrides=[match.dict() for match in matches],
        )
        raise ValueError(f"Found multiple overrides for '{task_name}' on '{build_variant}'")

    def lookup_exec_override(self, build_variant: str, task_name: str) -> Optional[timedelta]:
        """
        Fetch the exec timeout override for a build variant/task pair.

        :param build_variant: Build Variant to check.
        :param task_name: Task name to check.
        :return: Exec timeout override, or None when nothing is configured.
        """
        found = self._lookup_override(build_variant, task_name)
        return None if found is None else found.get_exec_timeout()

    def lookup_idle_override(self, build_variant: str, task_name: str) -> Optional[timedelta]:
        """
        Fetch the idle timeout override for a build variant/task pair.

        :param build_variant: Build Variant to check.
        :param task_name: Task name to check.
        :return: Idle timeout override, or None when nothing is configured.
        """
        found = self._lookup_override(build_variant, task_name)
        return None if found is None else found.get_idle_timeout()
def output_timeout(
    exec_timeout: timedelta, idle_timeout: Optional[timedelta], output_file: Optional[str]
) -> None:
    """
    Emit the timeout configuration as YAML.

    The configuration is always printed to stdout; it is additionally written to
    `output_file` when one is given. Durations are rounded up to whole seconds.

    :param exec_timeout: Exec timeout to output.
    :param idle_timeout: Idle timeout to output; omitted from the output when None.
    :param output_file: Location of output file to write, if any.
    """
    settings = {"exec_timeout_secs": math.ceil(exec_timeout.total_seconds())}
    if idle_timeout is not None:
        settings["timeout_secs"] = math.ceil(idle_timeout.total_seconds())

    # Serialize once; the file and stdout receive the same text.
    rendered = yaml.dump(settings, default_flow_style=False)

    if output_file:
        with open(output_file, "w") as expansions:
            expansions.write(rendered)

    sys.stdout.write(rendered)
class TaskTimeoutOrchestrator:
    """Coordinates the services needed to decide which timeouts a task should run with."""

    @inject.autoparams()
    def __init__(
        self,
        timeout_service: TimeoutService,
        timeout_overrides: TimeoutOverrides,
        evg_project_config: EvergreenProjectConfig,
    ) -> None:
        """
        Store the collaborators used to compute timeouts.

        :param timeout_service: Service for calculating historic timeouts.
        :param timeout_overrides: Timeout overrides for specific tasks.
        :param evg_project_config: Evergreen project configuration.
        """
        self.evg_project_config = evg_project_config
        self.timeout_overrides = timeout_overrides
        self.timeout_service = timeout_service

    def determine_exec_timeout(
        self,
        task_name: str,
        variant: str,
        idle_timeout: Optional[timedelta] = None,
        exec_timeout: Optional[timedelta] = None,
        evg_alias: str = "",
        historic_timeout: Optional[timedelta] = None,
    ) -> timedelta:
        """
        Pick the exec timeout for a task.

        The result is never smaller than a given idle timeout.

        :param task_name: Name of task being run.
        :param variant: Name of build variant being run.
        :param idle_timeout: Idle timeout if specified.
        :param exec_timeout: Exec timeout from the command line; None or zero means unset.
        :param evg_alias: Evergreen alias running the task.
        :param historic_timeout: Timeout determined by looking at previous task executions.
        :return: Exec timeout to use for running task.
        """
        baseline = (
            DEFAULT_NON_REQUIRED_BUILD_TIMEOUT if historic_timeout is None else historic_timeout
        )
        # Looked up unconditionally so conflicting override entries are always reported.
        configured = self.timeout_overrides.lookup_exec_override(variant, task_name)
        chosen = self._select_exec_timeout(variant, exec_timeout, configured, evg_alias, baseline)

        # The timeout needs to be at least as large as the idle timeout.
        if not idle_timeout or chosen.total_seconds() >= idle_timeout.total_seconds():
            return chosen

        LOGGER.info(
            "Making exec timeout as large as idle timeout",
            exec_timeout_secs=idle_timeout.total_seconds(),
        )
        return idle_timeout

    def _select_exec_timeout(
        self,
        variant: str,
        cli_timeout: Optional[timedelta],
        configured: Optional[timedelta],
        evg_alias: str,
        baseline: timedelta,
    ) -> timedelta:
        """
        Apply the exec timeout precedence rules, first match wins.

        Order: command line value, configured override, required-build-variant cap,
        commit-queue timeout, and finally the baseline.

        :param variant: Name of build variant being run.
        :param cli_timeout: Exec timeout given on the command line, if any.
        :param configured: Exec timeout override found in the overrides file, if any.
        :param evg_alias: Evergreen alias running the task.
        :param baseline: Timeout to fall back to when no rule applies.
        :return: Exec timeout before the idle-timeout floor is applied.
        """
        if cli_timeout and cli_timeout.total_seconds() != 0:
            LOGGER.info(
                "Using timeout from cmd line", exec_timeout_secs=cli_timeout.total_seconds()
            )
            return cli_timeout

        if configured is not None:
            LOGGER.info(
                "Overriding configured timeout", exec_timeout_secs=configured.total_seconds()
            )
            return configured

        if self._is_required_build_variant(variant) and baseline > DEFAULT_REQUIRED_BUILD_TIMEOUT:
            LOGGER.info(
                "Overriding required-builder timeout",
                exec_timeout_secs=DEFAULT_REQUIRED_BUILD_TIMEOUT.total_seconds(),
            )
            return DEFAULT_REQUIRED_BUILD_TIMEOUT

        if evg_alias == COMMIT_QUEUE_ALIAS:
            LOGGER.info(
                "Overriding commit-queue timeout",
                exec_timeout_secs=COMMIT_QUEUE_TIMEOUT.total_seconds(),
            )
            return COMMIT_QUEUE_TIMEOUT

        return baseline

    def determine_idle_timeout(
        self,
        task_name: str,
        variant: str,
        idle_timeout: Optional[timedelta] = None,
        historic_timeout: Optional[timedelta] = None,
    ) -> Optional[timedelta]:
        """
        Pick the idle timeout for a task.

        Order of precedence: command line value, configured override, historic timeout.

        :param task_name: Name of task being run.
        :param variant: Name of build variant being run.
        :param idle_timeout: Idle timeout from the command line; None or zero means unset.
        :param historic_timeout: Timeout determined by looking at previous task executions.
        :return: Idle timeout to use for running task, or None if none could be determined.
        """
        # Looked up unconditionally so conflicting override entries are always reported.
        configured = self.timeout_overrides.lookup_idle_override(variant, task_name)

        if idle_timeout and idle_timeout.total_seconds() != 0:
            LOGGER.info(
                "Using timeout from cmd line", idle_timeout_secs=idle_timeout.total_seconds()
            )
            return idle_timeout

        if configured is not None:
            LOGGER.info(
                "Overriding configured timeout", idle_timeout_secs=configured.total_seconds()
            )
            return configured

        return historic_timeout

    def determine_historic_timeout(
        self,
        project: str,
        task: str,
        variant: str,
        suite_name: str,
        exec_timeout_factor: Optional[float],
    ) -> TimeoutOverride:
        """
        Derive timeouts from historic test results.

        :param project: Name of project to query.
        :param task: Name of task to query.
        :param variant: Name of build variant to query.
        :param suite_name: Name of test suite being run.
        :param exec_timeout_factor: Scaling factor to use when determining timeout.
        :return: Override carrying the historic timeouts; both values are unset when the
            suite is ignored or no usable estimate is available.
        """

        def unspecified() -> TimeoutOverride:
            return TimeoutOverride(task=task, exec_timeout=None, idle_timeout=None)

        if suite_name in IGNORED_SUITES:
            return unspecified()

        estimate = self.timeout_service.get_timeout_estimate(
            TimeoutParams(
                evg_project=project,
                build_variant=variant,
                task_name=task,
                suite_name=suite_name,
                is_asan=self.is_build_variant_asan(variant),
            )
        )
        if not estimate or not estimate.is_specified():
            return unspecified()

        exec_secs = estimate.calculate_task_timeout(
            repeat_factor=1, scaling_factor=exec_timeout_factor
        )
        idle_secs = estimate.calculate_test_timeout(repeat_factor=1)
        if exec_secs is None and idle_secs is None:
            return unspecified()

        LOGGER.info(
            "Getting historic based timeout",
            exec_timeout_secs=exec_secs,
            idle_timeout_secs=idle_secs,
        )
        return TimeoutOverride.from_seconds(task, exec_secs, idle_secs)

    def is_build_variant_asan(self, build_variant: str) -> bool:
        """
        Report whether a build variant is an ASAN build variant.

        :param build_variant: Name of build variant to check.
        :return: True if build variant is an ASAN build variant.
        """
        return self.evg_project_config.get_variant(build_variant).is_asan_build()

    def _is_required_build_variant(self, build_variant: str) -> bool:
        """
        Report whether a build variant is a required build variant.

        A variant counts as required when its display name contains "!".

        :param build_variant: Name of build variant to check.
        :return: True if the given build variant is required.
        """
        variant_config = self.evg_project_config.get_variant(build_variant)
        return "!" in variant_config.display_name

    def determine_timeouts(
        self,
        cli_idle_timeout: Optional[timedelta],
        cli_exec_timeout: Optional[timedelta],
        outfile: Optional[str],
        project: str,
        task: str,
        variant: str,
        evg_alias: str,
        suite_name: str,
        exec_timeout_factor: Optional[float],
    ) -> None:
        """
        Work out the timeouts for a task and emit them via output_timeout().

        :param cli_idle_timeout: Idle timeout specified by the CLI.
        :param cli_exec_timeout: Exec timeout specified by the CLI.
        :param outfile: File to write timeout expansions to.
        :param project: Evergreen project task is being run on.
        :param task: Name of task.
        :param variant: Build variant task is being run on.
        :param evg_alias: Evergreen alias that triggered task.
        :param suite_name: Name of evergreen suite being run.
        :param exec_timeout_factor: Scaling factor to use when determining timeout.
        """
        historic = self.determine_historic_timeout(
            project, task, variant, suite_name, exec_timeout_factor
        )
        historic_idle = historic.get_idle_timeout()
        historic_exec = historic.get_exec_timeout()

        # The idle timeout is resolved first because it is a lower bound for the exec timeout.
        idle_timeout = self.determine_idle_timeout(task, variant, cli_idle_timeout, historic_idle)
        exec_timeout = self.determine_exec_timeout(
            task, variant, idle_timeout, cli_exec_timeout, evg_alias, historic_exec
        )

        output_timeout(exec_timeout, idle_timeout, outfile)
def main():
    """Determine the timeout value a task should use in evergreen."""
    # NOTE: the docstring above doubles as the argparse description; keep it user-facing.
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        "--install-dir",
        dest="install_dir",
        required=True,
        help="Path to bin directory of testable installation",
    )
    parser.add_argument("--task-name", dest="task", required=True, help="Task being executed.")
    parser.add_argument(
        "--suite-name", dest="suite_name", required=True, help="Resmoke suite being run against."
    )
    parser.add_argument(
        "--build-variant",
        dest="variant",
        required=True,
        help="Build variant task is being executed on.",
    )
    parser.add_argument(
        "--project",
        dest="project",
        required=True,
        help="Evergreen project task is being executed on.",
    )
    parser.add_argument(
        "--evg-alias",
        dest="evg_alias",
        required=True,
        help="Evergreen alias used to trigger build.",
    )
    parser.add_argument(
        "--test-flags",
        dest="test_flags",
        help="Test flags that are used for `resmoke.py run` command call.",
    )
    parser.add_argument("--timeout", dest="timeout", type=int, help="Timeout to use (in sec).")
    parser.add_argument(
        "--exec-timeout", dest="exec_timeout", type=int, help="Exec timeout to use (in sec)."
    )
    parser.add_argument(
        "--exec-timeout-factor",
        dest="exec_timeout_factor",
        type=float,
        # A dimensionless multiplier, not a duration.
        help="Scaling factor to apply to the exec timeout.",
    )
    parser.add_argument("--out-file", dest="outfile", help="File to write configuration to.")
    parser.add_argument(
        "--timeout-overrides",
        dest="timeout_overrides_file",
        default=DEFAULT_TIMEOUT_OVERRIDES,
        help="File containing timeout overrides to use.",
    )
    parser.add_argument(
        "--evg-api-config",
        dest="evg_api_config",
        default=DEFAULT_EVERGREEN_AUTH_CONFIG,
        help="Evergreen API config file.",
    )
    parser.add_argument(
        "--evg-project-config",
        dest="evg_project_config",
        default=DEFAULT_EVERGREEN_CONFIG,
        help="Evergreen project config file.",
    )

    options = parser.parse_args()

    # A missing or zero CLI timeout means "not specified".
    timeout_override = timedelta(seconds=options.timeout) if options.timeout else None
    exec_timeout_override = (
        timedelta(seconds=options.exec_timeout) if options.exec_timeout else None
    )

    task_name = determine_task_base_name(options.task, options.variant)
    timeout_overrides = TimeoutOverrides.from_yaml_file(
        os.path.expanduser(options.timeout_overrides_file)
    )

    enable_logging(verbose=False)
    LOGGER.info("Determining timeouts", cli_args=options)

    # --test-flags is optional; append it only when given so an omitted flag does not end
    # up as the literal text "None" in the resmoke run options.
    resmoke_run_options = f"--installDir={shlex.quote(options.install_dir)}"
    if options.test_flags:
        resmoke_run_options = f"{resmoke_run_options} {options.test_flags}"

    def dependencies(binder: inject.Binder) -> None:
        """Bind the concrete instances that the orchestrator and its services are built from."""
        binder.bind(
            EvergreenApi,
            RetryingEvergreenApi.get_api(config_file=os.path.expanduser(options.evg_api_config)),
        )
        binder.bind(TimeoutOverrides, timeout_overrides)
        binder.bind(
            EvergreenProjectConfig,
            parse_evergreen_file(os.path.expanduser(options.evg_project_config)),
        )
        binder.bind(
            ResmokeProxyService,
            ResmokeProxyService(run_options=resmoke_run_options),
        )

    inject.configure(dependencies)

    task_timeout_orchestrator = inject.instance(TaskTimeoutOrchestrator)
    task_timeout_orchestrator.determine_timeouts(
        timeout_override,
        exec_timeout_override,
        options.outfile,
        options.project,
        task_name,
        options.variant,
        options.evg_alias,
        options.suite_name,
        options.exec_timeout_factor,
    )


if __name__ == "__main__":
    main()