From 3f7fe091f4687f27c965aba925c07b746fd82f0a Mon Sep 17 00:00:00 2001 From: Eman Elsabban Date: Thu, 16 Nov 2023 13:00:02 -0800 Subject: [PATCH] A quick script to check if a job took longer than x time to run and update python in precommit --- .pre-commit-config.yaml | 9 +-- requirements-dev.txt | 6 +- tron/bin/check_job_exceeding_time.py | 95 ++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 tron/bin/check_job_exceeding_time.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 66128a600..d04a52893 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ --- default_language_version: - python: python3.6 + python: python3.8 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.5.0 @@ -21,6 +21,7 @@ repos: hooks: - id: flake8 exclude: ^docs/source/conf.py$ + language_version: python3.8 - repo: https://github.com/asottile/reorder_python_imports rev: v1.9.0 hooks: @@ -41,8 +42,8 @@ repos: language: script files: ^tests/.*\.py$ - repo: http://github.com/psf/black - rev: 19.10b0 + rev: 23.3.0 hooks: - id: black - language_version: python3.6 - args: [--target-version, py36] + language_version: python3.8 + args: [--target-version, py38] diff --git a/requirements-dev.txt b/requirements-dev.txt index c87f4b21c..a9a007c25 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ asynctest==0.12.0 cfgv==2.0.1 entrypoints==0.3 flake8==3.7.9 -identify==1.4.9 +identify==2.4.4 importlib-resources==1.0.2 iniconfig==1.1.1 isort==4.3.18 @@ -15,7 +15,7 @@ mypy-extensions==0.4.3 nodeenv==1.3.3 packaging==19.2 pluggy==0.13.0 -pre-commit==1.21.0 +pre-commit==2.9.2 py==1.10.0 pycodestyle==2.5.0 pyflakes==2.1.1 @@ -26,4 +26,4 @@ pytest-asyncio==0.14.0 requirements-tools==1.2.1 toml==0.10.2 typed-ast==1.4.0 -virtualenv==16.7.5 +virtualenv==20.0.8 diff --git a/tron/bin/check_job_exceeding_time.py b/tron/bin/check_job_exceeding_time.py new file mode 100644 index 
#!/usr/bin/env python3.8
"""Report Tron job runs whose duration exceeded a time limit.

The script queries a Tron server through ``tron.commands.client.Client`` and
prints the ids of the runs whose reported ``duration`` is longer than the
``--time`` limit. Either one job (``--job``) or every job is inspected.
"""
import logging
import sys

import pytimeparse

from tron.commands import cmd_utils
from tron.commands.client import Client


log = logging.getLogger("check_exceeding_time")

# Runs in these states are never reported, whatever their duration field says.
STATES_TO_SKIP = frozenset({"queued", "scheduled", "cancelled", "skipped"})


def parse_cli():
    """Parse the command line.

    Extends the parser returned by ``cmd_utils.build_option_parser()`` with:

    * ``--job``: name of a single job to check; ``None`` means all jobs.
    * ``--time``: integer limit stored as ``args.time_limit`` (default 18000).
      It is compared against the value returned by ``pytimeparse.parse``.

    Returns:
        The ``argparse`` namespace produced by ``parser.parse_args()``.
    """
    parser = cmd_utils.build_option_parser()
    parser.add_argument(
        "--job",
        default=None,
        help="Check if a particular job exceeded a time to run. If unset checks all jobs",
    )
    parser.add_argument(
        "--time",
        help="This is used to specify the time that if any job exceeds will show. Defaults to 5 hours",
        type=int,
        dest="time_limit",
        default=18000,
    )
    return parser.parse_args()


def is_job_run_exceeding_expected_runtime(job_run, job_expected_runtime):
    """Return True when ``job_run`` ran longer than ``job_expected_runtime``.

    Args:
        job_run: dict describing one run; the ``state`` and ``duration`` keys
            are read and both may be absent.
        job_expected_runtime: numeric limit, or ``None`` to disable the check.

    Returns:
        ``True`` only if the run is not in one of ``STATES_TO_SKIP``, its
        duration can be parsed to a non-zero value, and that value is greater
        than the limit. ``False`` otherwise.
    """
    if job_expected_runtime is None:
        return False
    if job_run.get("state", "unknown") in STATES_TO_SKIP:
        return False

    # ``or ""`` also covers a duration that is present but null, which would
    # otherwise be handed to pytimeparse as None.
    duration_seconds = pytimeparse.parse(job_run.get("duration") or "")
    return bool(duration_seconds) and duration_seconds > job_expected_runtime


def check_if_time_exceeded(job_runs, job_expected_runtime, result):
    """Append to ``result`` the id of every run in ``job_runs`` over the limit.

    Args:
        job_runs: iterable of run dicts (each reported run must have ``id``).
        job_expected_runtime: limit forwarded to
            ``is_job_run_exceeding_expected_runtime``.
        result: list mutated in place; nothing is returned.
    """
    # The state filter lives in is_job_run_exceeding_expected_runtime, so it
    # is not repeated here.
    result.extend(
        job_run["id"]
        for job_run in job_runs
        if is_job_run_exceeding_expected_runtime(job_run, job_expected_runtime)
    )


def _run_sort_key(job_run):
    """Sort key for runs: unfinished flag, then end_time, then run_time.

    ``None`` values are replaced by ``""`` so that a missing or null
    ``end_time``/``run_time`` never makes the tuples incomparable.
    """
    end_time = job_run.get("end_time")
    run_time = job_run.get("run_time")
    return (
        end_time is None,
        "" if end_time is None else end_time,
        "" if run_time is None else run_time,
    )


def check_job_time(job, time_limit, result):
    """Check every run of ``job`` against ``time_limit``.

    Args:
        job: dict describing a job; its ``runs`` key (optional) holds the runs.
        time_limit: limit passed on to ``check_if_time_exceeded``.
        result: list that receives the ids of the offending runs.
    """
    # reverse=True puts runs without an end_time first, then the most recently
    # finished ones; this only affects the order of the reported ids.
    job_runs = sorted(job.get("runs") or [], key=_run_sort_key, reverse=True)
    check_if_time_exceeded(job_runs, time_limit, result)


def main():
    """Entry point: collect the runs over the limit and print a summary."""
    args = parse_cli()
    cmd_utils.setup_logging(args)
    cmd_utils.load_config(args)
    client = Client(args.server, args.cluster_name)
    result = []

    if args.job is None:
        for job in client.jobs(include_job_runs=True):
            check_job_time(job=job, time_limit=args.time_limit, result=result)
    else:
        job_url = client.get_url(args.job)
        # NOTE(review): job_runs() is given a job URL here; confirm that it
        # returns a dict with a "runs" key (Client.job may be the intended call).
        job = client.job_runs(job_url)
        check_job_time(job=job, time_limit=args.time_limit, result=result)

    # ``result`` is always a list, so test for emptiness rather than None.
    if not result:
        print("All jobs ran within the time limit")
    else:
        print(f"These are the runs that took longer than {args.time_limit} to run: {result}")


if __name__ == "__main__":
    sys.exit(main())