diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..942e5df --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,40 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@main + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.8' + - name: Install pypa/setuptools + run: >- + python -m + pip install wheel + - name: Build a binary wheel + run: >- + python setup.py sdist bdist_wheel + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/README.md b/README.md index 9579c85..263c3a6 100644 --- a/README.md +++ b/README.md @@ -6,24 +6,24 @@  [![Twitter Follow](https://img.shields.io/twitter/follow/maharshigor.svg?style=social)](https://twitter.com/maharshigor) -A _makeshift_ toolkit, built on top of [submitit](https://github.com/facebookincubator/submitit), to launch SLURM jobs over a range of hyperparameters from the command line. It is designed to be used with existing python scripts and interactively monitor their status. +A _makeshift_ toolkit, built on top of [submitit](https://github.com/facebookincubator/submitit), to launch SLURM jobs over a range of hyperparameters from the command line. 
It is designed to be used with existing Python scripts and to interactively monitor their status. __`submititnow` provides two command-line tools:__ -* `slaunch` to launch a python script as SLURM job(s). +* `slaunch` to launch a Python script as SLURM job(s). * `jt` (job-tracker) to interactively monitor the jobs. -__It also provides an abstracted [`experiment_lib.Experiment`](submititnow/experiment_lib.py#L16) API to create, launch and monitor an experiment, or a group of job(s), from python scripts with customized parameter-sweeping configurations, while being able to track them with `jt`.__ +__It also provides an abstracted [`experiment_lib.Experiment`](submititnow/experiment_lib.py#L16) API to create, launch and monitor an experiment, or a group of job(s), from Python scripts with customized parameter-sweeping configurations, while being able to track them with `jt`.__ ## `slaunch` : Launching a python script over SLURM -Let's say you have a python script [`examples/annotate_queries.py`](examples/annotate_queries.py) that can be run using following command: +Let's say you have a Python script [`examples/annotate_queries.py`](examples/annotate_queries.py) which can be run using the following command: ```bash python examples/annotate_queries.py --model='BERT-LARGE-uncased' \ --dataset='NaturalQuestions' --fold='dev' ``` -You can launch a job that runs this script over a SLURM cluster using following: +You can launch a job that runs this script over a SLURM cluster using the following: ```bash slaunch examples/annotate_queries.py \ --slurm_mem="16g" --slurm_gres="gpu:rtxa4000:1" \ @@ -39,12 +39,12 @@ slaunch examples/annotate_queries.py \ --model 'BERT-LARGE-uncased' 'Roberta-uncased' 'T5-cased-small' \ --dataset='NaturalQuestions' --fold 'dev' 'train' ``` -This will launch a total of 6 jobs with following configuration: +This will launch a total of 6 jobs with the following configuration: ![Slaunch Terminal Response](docs/imgs/slaunch_annotate_queries.png) -### 
__Any constraints on the target python script that we launch?__ -The target python script must have the following format: +### __Any constraints on the target Python script that we launch?__ +The target Python script must have the following format: ```python import argparse @@ -70,15 +70,15 @@ if __name__ == '__main__': ## **`jt`** :   Looking up info on previously launched experiments: -As instructed in the screenshot of the Launch response, user can utilize the `jt` (short of `job-tracker`) command to monitor the job progress. +As instructed in the above screenshot of the Launch response, user can utilize the `jt` (short for `job-tracker`) command to monitor the job progress. ### **`jt jobs EXP_NAME [EXP_ID]`** -Executing `jt jobs examples.annotate_queries 227720` will give following response: +Executing `jt jobs examples.annotate_queries 227720` will give the following response: ![jt jobs EXP_NAME EXP_ID Terminal Response](docs/imgs/jt_annotate_queries_expid.png) -In fact, user can also lookup all `examples.annotate_queries` jobs simply by removing the `[EXP_ID]` from the previous command: +In fact, user can also lookup all `examples.annotate_queries` jobs simply by removing `[EXP_ID]` from the previous command: ``` jt jobs examples.annotate_queries ``` @@ -90,23 +90,23 @@ __Looking up stderr and stdout of a Job__ Executing `jt out 227720_2` reveals the `stdout` output of the corresponding Job: ![jt out JOB_ID Terminal Response](docs/imgs/jt_out_job_id.png) -Similar is case for `jt err 227720_2` which reveals `stderr` logs. +Similarly, `jt err 227720_2` reveals the `stderr` logs. ### **`jt sh JOB_ID`** -__Looking up SLURM SBATCH shell file of a Job__ +__Looking up SBATCH script for a Job__ -submitit tool internally create an SBATCH shell script per experiment to launch the jobs on SLURM cluster. This command helps inspect this `submission.sh` file. +The submitit tool internally creates an SBATCH shell script per experiment to launch the jobs on a SLURM cluster. 
This command outputs this `submission.sh` file for inspection. Executing `jt sh 227720_2` reveals the following: ![jt out JOB_ID Terminal Response](docs/imgs/jt_sh_job_id.png) ### **`jt ls`** -Finally, user can use `jt ls` to simply list the experiments maintains by the `submititnow` tool. +Finally, users can use `jt ls` to simply list the experiments maintained by the `submititnow` tool. -Outputs of this command can be further used to interact using `jt jobs` command. +The experiment names output by this command can then be passed into the `jt jobs` command. ## __Installing__ Python 3.8+ is required. @@ -115,14 +115,10 @@ Python 3.8+ is required. pip install -U git+https://github.com/maharshi95/submititnow.git ``` -## **Experiment API:** -Sometimes `slaunch` command-line tool is not enough. For example, one may want to launch a job with customized parameter-sweep configurations, or have a certain parameter (e.g. `output_filepath`) different for each job in the launch. In such cases, one can use the Experiment API provided by `submititnow` to launch jobs from python scripts and also get the benefits of being able to track them with `jt`. +## **Experiment API** +Sometimes the `slaunch` command-line tool is not enough. For example, one may want to launch a job with customized parameter-sweep configurations, or vary a certain parameter (e.g. `output_filepath`) for each job in the launch. In such cases, one can use the Experiment API provided by `submititnow` to launch jobs from Python scripts and also get the benefits of being able to track them with `jt`. [examples/launch_demo_script.py](examples/launch_demo_script.py) provides a demo of how to use the `Experiment` API to launch a job with customized parameter-sweep configurations. ```bash python examples/launch_demo_script.py ``` -## **Experiment API:** -Sometimes `slaunch` command-line tool is not enough. 
For example, one may want to launch a job with customized parameter-sweep configurations, or have a certain parameter (e.g. `output_filepath`) different for each job in the launch. In such cases, one can use the Experiment API provided by `submititnow` to launch jobs from python scripts and also get the benefits of being able to track them with `jt`. - -[examples/launch_demo_script.py](examples/launch_demo_script.py) provides a demo of how to use the `Experiment` API to launch a job with customized parameter-sweep configurations. \ No newline at end of file diff --git a/bin/slaunch b/bin/slaunch index e9f71e4..26bae10 100755 --- a/bin/slaunch +++ b/bin/slaunch @@ -49,7 +49,6 @@ def create_module_args_list( sweep_args: Iterable[str], downstream_args: Sequence[str], ): - make_args_sweepable(module_argparser, sweep_args) module_args_with_sweeps = module_argparser.parse_args(downstream_args) @@ -79,9 +78,10 @@ def job_description_function(args: argparse.Namespace): class UnSupportedPythonModuleError(Exception): def __init__(self, module_name: str, missing_attr: str): super().__init__( - f"Module '{module_name}' is not supported by submititnow. '{missing_attr}' is missing.\n" - f"Target script must have two functions: \n\t* 'main(args: argparser.Namespace)'" - f"\n\t* 'add_arguments(parser = None) -> argparse.ArgumentParser'." + f"Module '{module_name}' is not supported by submititnow. '{missing_attr}'" + " is missing.\nTarget script must have two functions: \n\t* 'main(args:" + " argparser.Namespace)'\n\t* 'add_arguments(parser = None) ->" + " argparse.ArgumentParser'." 
) diff --git a/examples/.config.json b/examples/.config.json new file mode 100644 index 0000000..4c5e8fb --- /dev/null +++ b/examples/.config.json @@ -0,0 +1,4 @@ +{ + "slurm_profile": "clip", + "slurm_gres": "gpu:1" +} \ No newline at end of file diff --git a/examples/gpu_matmul.py b/examples/gpu_matmul.py new file mode 100644 index 0000000..b8e1c16 --- /dev/null +++ b/examples/gpu_matmul.py @@ -0,0 +1,34 @@ +import argparse +from typing import Optional +import torch +from tqdm import trange + +def add_arguments( + parser: Optional[argparse.ArgumentParser] = None, +) -> argparse.ArgumentParser: + if parser is None: + parser = argparse.ArgumentParser('Perform a matrix multiplication on a GPU') + + parser.add_argument("--matrix-size", type=int, default=1000) + parser.add_argument("--n-iter", type=int, default=10) + + return parser + + +def main(args: argparse.Namespace): + # Set torch seed to + torch.manual_seed(42) + + for i in trange(args.n_iter): + M1 = torch.randn(args.matrix_size, args.matrix_size).cuda() + M2 = torch.randn(args.matrix_size, args.matrix_size).cuda() + + result = M1 @ M2 + norm = torch.norm(result, p="fro") + print(f"Norm of result: {norm}") + + +if __name__ == "__main__": + parser = add_arguments() + args = parser.parse_args() + main(args) diff --git a/setup.py b/setup.py index 9aa8b51..261ee44 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setuptools.setup( name="submititnow", - version="0.9.1", + version="0.9.3", author="Maharshi Gor", author_email="maharshigor@gmail.com", description="A package to make submitit easier to use", @@ -26,6 +26,7 @@ "typer[all]>=0.7.0", "rich-cli>=1.8.0", "rich>=12.6.0", + "tqdm>=4.0.0", ], python_requires=">=3.8", ) diff --git a/submititnow/cli.py b/submititnow/cli.py index 3d268cb..7d83f7b 100644 --- a/submititnow/cli.py +++ b/submititnow/cli.py @@ -1,5 +1,5 @@ from __future__ import annotations - +import io import time from rich import print as rich_print @@ -13,12 +13,15 @@ def 
show_file_content(filepath: str): - with open(filepath) as fp: - text = fp.read().replace("]\n ", "]\r") # Handle tqdm progress bars + rich_print("[bold bright_yellow]Reading file:[/bold bright_yellow] [bold cyan]{}[/bold cyan]\n".format(filepath)) + with open(filepath, "r", newline='') as fp: + text = fp.read() for line in text.split("\n"): - if "\r" in line: - line = line[line.rindex("\r") + 1 :] - rich_print(line) + line_buffer = io.StringIO() + for chunks in line.split("\r"): + line_buffer.seek(0) + line_buffer.write(chunks) + rich_print(line_buffer.getvalue()) def _generate_console_table(exp: Experiment): diff --git a/submititnow/options.py b/submititnow/options.py index 405a37f..eed0242 100644 --- a/submititnow/options.py +++ b/submititnow/options.py @@ -1,35 +1,128 @@ import argparse +import json from typing import Dict, Any +class SlurmAdditionalArgAction(argparse.Action): + def __init__(self, check_func, *args, **kwargs): + """ + argparse custom action. + :param check_func: callable to do the real check. + """ + self._check_func = check_func + super(SlurmAdditionalArgAction, self).__init__(*args, **kwargs) + + def __call__(self, parser, namespace, values, option_string): + if isinstance(values, list): + values = [self._check_func(parser, v) for v in values] + else: + values = self._check_func(parser, values) + if option_string.startswith("--"): + option_string = option_string[2:] + setattr(namespace, self.dest, {option_string: values}) + + def add_slurm_arguments(parser: argparse.ArgumentParser): slurm_group = parser.add_argument_group("SLURM parameters") slurm_group.add_argument( - "--slurm_profile", default=None, help="SubmititNow profile for SLURM." 
+ "--config", + default=None, + help="SubmititNow config file for SLURM.", + dest="slurm_config", + ) + slurm_group.add_argument( + "--profile", + default=None, + help="SubmititNow profile for SLURM.", + dest="slurm_profile", + ) + slurm_group.add_argument( + "--account", default=None, help="SLURM account", dest="slurm_account" + ) + slurm_group.add_argument( + "--partition", default=None, help="SLURM partition", dest="slurm_partition" + ) + slurm_group.add_argument("--qos", default=None, help="SLURM qos", dest="slurm_qos") + slurm_group.add_argument( + "--mem", default=None, help="SLURM memory requirement", dest="slurm_mem" + ) + slurm_group.add_argument( + "--gres", default=None, help="SLURM GPU Resource requirement", dest="slurm_gres" + ) + slurm_group.add_argument( + "--time", default=None, help="SLURM time requirement", dest="slurm_time" + ) + slurm_group.add_argument( + "--nodes", + default=1, + help="SLURM nodes requirement", + dest="slurm_nodes", + type=int, + ) + slurm_group.add_argument( + "--ntasks-per-node", + default=None, + help="SLURM ntasks per node", + dest="slurm_ntasks_per_node", + type=int, + ) + slurm_group.add_argument( + "--cpus-per-task", + default=None, + help="SLURM cpus per task", + dest="slurm_cpus_per_task", + type=int, ) - slurm_group.add_argument("--slurm_account", default=None, help="SLURM account") - slurm_group.add_argument("--slurm_partition", default=None, help="SLURM partition") - slurm_group.add_argument("--slurm_qos", default=None, help="SLURM qos") slurm_group.add_argument( - "--slurm_mem", default=None, help="SLURM memory requirement" + "--cpus-per-gpu", + default=None, + help="SLURM cpus per gpu", + dest="slurm_cpus_per_gpu", + type=int, ) slurm_group.add_argument( - "--slurm_gres", default=None, help="SLURM GPU Resource requirement" + "--gpus-per-node", + default=None, + help="SLURM gpus per node", + dest="slurm_gpus_per_node", + type=int, ) slurm_group.add_argument( - "--slurm_time", default=None, help="SLURM time 
requirement" + "--gpus-per-task", + default=None, + help="SLURM gpus per task", + dest="slurm_gpus_per_task", + type=int, + ) + + # Additional arguments + slurm_group.add_argument( + "--nodelist", + default=None, + help="SLURM nodelist", + action=SlurmAdditionalArgAction, + check_func=lambda parser, value: value, + dest="slurm_additional_parameters", ) return parser def add_submititnow_arguments(parser: argparse.ArgumentParser): submititnow_group = parser.add_argument_group("SubmititNow parameters") - submititnow_group.add_argument("--exp_name", default=None, help="Experiment Name.") + submititnow_group.add_argument("--exp-name", default=None, help="Experiment Name.") submititnow_group.add_argument( - "--submititnow_dir", default=None, help="Root directory for submititnow." + "--submititnow-dir", default=None, help="Root directory for submititnow." ) return parser def get_slurm_params(args: argparse.Namespace) -> Dict[str, Any]: - return {k: v for k, v in vars(args).items() if k.startswith("slurm_")} + slurm_args = { + k: v for k, v in vars(args).items() if k.startswith("slurm_") and v is not None + } + if slurm_args.get("slurm_config") is not None: + config_filename = slurm_args.pop("slurm_config") + with open(config_filename, "r") as f: + default_args = json.load(f) + slurm_args = {**default_args, **slurm_args} + return slurm_args