0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-11-24 00:17:37 +01:00
mongodb/buildscripts/large_file_check.py
Juan Gu 855dfadef0 SERVER-94077 Use isort in Ruff configs (#27865)
GitOrigin-RevId: e793d662774ccd3ab6c3f356c2287cf1f7ff9805
2024-10-10 19:33:49 +00:00

157 lines
4.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""Check files in git diff to ensure they are within a given size limit."""
# pylint: disable=wrong-import-position
import argparse
import fnmatch
import logging
import os
import pathlib
import sys
import textwrap
from typing import Any, Dict, List, Optional, Tuple
import structlog
from git import Repo
mongo_dir = os.path.dirname(os.path.dirname(os.path.abspath(os.path.realpath(__file__))))
# Get relative imports to work when the package is not installed on the PYTHONPATH.
if __name__ == "__main__" and __package__ is None:
sys.path.append(mongo_dir)
from buildscripts.linter import git
from buildscripts.patch_builds.change_data import (
RevisionMap,
find_changed_files_in_repos,
generate_revision_map,
)
# Console renderer for structured logging
def renderer(_logger: logging.Logger, _name: str, eventdict: Dict[Any, Any]) -> str:
    """Render an event dictionary as a single console line.

    The layout is chosen by the first key found, checked in this order:
    ``files``, ``repo``, ``file`` (with ``bytes`` appended when present).
    When none of those keys exist only the event text is rendered.

    :param _logger: Not used by this function.
    :param _name: Not used by this function.
    :param eventdict: Event dictionary; it must contain an ``event`` key.
    :return: The formatted log line.
    """
    if "files" in eventdict:
        template = "{event}: {files}"
    elif "repo" in eventdict:
        template = "{event}: {repo}"
    elif "file" not in eventdict:
        # No recognised context key at all: emit only the message.
        template = "{event}"
    elif "bytes" in eventdict:
        template = "{event}: {file} {bytes} bytes"
    else:
        template = "{event}: {file}"
    return template.format(**eventdict)
# Configure the logger so it doesn't spam output on huge diffs
# This runs at import time, so the configuration is in place before LOGGER
# below is created.
structlog.configure(
    # Create loggers through structlog's standard-library integration.
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
    # Processors are listed in the order given: the level filter first, then
    # the module's own `renderer`, which turns the event dict into a string.
    processors=[
        structlog.stdlib.filter_by_level,
        renderer,
    ],
)
# Module-wide structured logger used by the functions in this file.
LOGGER = structlog.get_logger(__name__)
# Name of the environment variable whose value is passed as the "mongo"
# revision when the revision map is generated.
MONGO_REVISION_ENV_VAR = "REVISION"
def _get_repos_and_revisions() -> Tuple[List[Repo], RevisionMap]:
    """Build the repo objects to inspect and the revisions to compare them against.

    :return: Tuple of (list of repos, revision map from ``generate_revision_map``).
    """
    repos: List[Repo] = []
    for module_path in git.get_module_paths():
        # Exclude enterprise module; it's in the "modules" folder but does not correspond to a repo
        if "src/mongo/db/modules/enterprise" in module_path:
            continue
        repos.append(Repo(module_path))

    # The "mongo" revision comes from the environment and is None when unset.
    mongo_revision = os.environ.get(MONGO_REVISION_ENV_VAR)
    return repos, generate_revision_map(repos, {"mongo": mongo_revision})
def git_changed_files(excludes: List[pathlib.Path]) -> List[pathlib.Path]:
    """
    Collect the changed files that should be size-checked.

    A changed file is kept only when it still exists on disk and matches none
    of the exclude patterns (compared with ``fnmatch.fnmatch``).

    :param excludes: A list of files which should be excluded from changed file checks.
    :return: List of changed files.
    """
    repos, revision_map = _get_repos_and_revisions()
    LOGGER.debug("revisions", revision=revision_map)

    def _should_check(candidate: pathlib.Path) -> bool:
        # A path that is not on disk cannot be sized, so it is dropped first.
        if not candidate.exists():
            return False
        return not any(fnmatch.fnmatch(candidate, pattern) for pattern in excludes)

    # Materialise every changed path before filtering, as the filter touches the filesystem.
    candidates = [
        pathlib.Path(changed) for changed in find_changed_files_in_repos(repos, revision_map)
    ]
    files = [candidate for candidate in candidates if _should_check(candidate)]

    LOGGER.debug("Found files to check", files=[str(path) for path in files])
    return files
def diff_file_sizes(
    size_limit: int, excludes: Optional[List[pathlib.Path]] = None
) -> List[pathlib.Path]:
    """
    Find changed files whose size exceeds the given limit.

    Every offending file is logged at error level as it is found.

    :param size_limit: Maximum allowed size in bytes; only strictly larger files are reported.
    :param excludes: Paths or glob patterns to skip; ``None`` excludes nothing.
    :return: List of files larger than ``size_limit``.
    """
    if excludes is None:
        excludes = []

    large_files: List[pathlib.Path] = []
    for file_path in git_changed_files(excludes):
        LOGGER.debug("Checking file size", file=str(file_path))
        file_size = file_path.stat().st_size
        if file_size > size_limit:
            LOGGER.error("File too large", file=str(file_path), bytes=file_size)
            large_files.append(file_path)

    return large_files
def main(*args: str) -> int:
    """Execute Main entry point.

    :param args: Full argument vector; ``args[0]`` (the program name) is skipped when parsing.
    :return: 0 when all checked files are within the size limit, 1 otherwise.
    :raises SystemExit: Raised by argparse for ``--help`` or invalid arguments,
        including a negative ``--size-mb``.
    """
    parser = argparse.ArgumentParser(
        description="Git commit large file checker.",
        epilog=textwrap.dedent("""\
            NOTE: The --exclude argument is an exact match but can accept glob patterns. If * is used,
            it matches *all* characters, including path separators.
        """),
    )
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    parser.add_argument(
        "--exclude",
        help="Paths to exclude from check",
        nargs="+",
        type=pathlib.Path,
        required=False,
    )
    parser.add_argument("--size-mb", help="File size limit (MiB)", type=int, default=10)
    parsed_args = parser.parse_args(args[1:])

    # A negative limit would flag every file, even empty ones; reject it up
    # front with a usage error instead of producing a misleading report.
    if parsed_args.size_mb < 0:
        parser.error("--size-mb must not be negative")

    # NOTE(review): filter_by_level is called directly with an empty event dict
    # and its result is discarded; confirm whether these calls are still needed.
    if parsed_args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        structlog.stdlib.filter_by_level(LOGGER, "debug", {})
    else:
        logging.basicConfig(level=logging.INFO)
        structlog.stdlib.filter_by_level(LOGGER, "info", {})

    # The command-line limit is in MiB; diff_file_sizes expects bytes.
    large_files = diff_file_sizes(parsed_args.size_mb * 1024 * 1024, parsed_args.exclude)
    if not large_files:
        LOGGER.info("All files passed size check")
        return 0

    LOGGER.error("Some files failed size check", files=list(map(str, large_files)))
    return 1
if __name__ == "__main__":
    # Pass the whole argv (program name included) and use main's return value
    # as the process exit status.
    raise SystemExit(main(*sys.argv))