diff --git a/README.md b/README.md index bd3cbb0..a830e2d 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,11 @@ For example, if you want to use the gpu_p2 partition, you would need to do: hydra-submitit-launch my_app.py dev hydra.launcher.setup=null hydra.launcher.partition=gpu_p2 ``` +To change the timeout of the SLURM job to, for example, 10 hours, you would need to do: +``` +hydra-submitit-launch my_app.py base +hydra.launcher.hours=10 +``` +This will automatically select the right QoS for you. ## References - Hydra: https://hydra.cc/docs/intro/ diff --git a/conf/config.yaml b/conf/config.yaml new file mode 100644 index 0000000..6561735 --- /dev/null +++ b/conf/config.yaml @@ -0,0 +1 @@ +fake_param: 4 diff --git a/example_app.py b/example_app.py new file mode 100644 index 0000000..5516a2d --- /dev/null +++ b/example_app.py @@ -0,0 +1,11 @@ +import hydra + +from omegaconf import OmegaConf + + +@hydra.main(config_path='conf', config_name='config') +def example_main(cfg): + print(OmegaConf.to_container(cfg, resolve=True)) + +if __name__ == '__main__': + example_main() diff --git a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_dev.yaml b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_dev.yaml index 2b2fbf0..db47409 100644 --- a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_dev.yaml +++ b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_dev.yaml @@ -1,5 +1,4 @@ defaults: - dev -cpus_per_task: 40 -gpus_per_node: 4 \ No newline at end of file +gpus_per_node: 4 diff --git a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t3.yaml b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t3.yaml index 752f671..cd60e45 100644 --- a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t3.yaml +++ b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t3.yaml @@ -1,5 +1,4 @@ defaults: - t3 -cpus_per_task: 40 -gpus_per_node: 4 \ No newline at end of file 
+gpus_per_node: 4 diff --git a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t4.yaml b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t4.yaml index 3e7c471..fbf5e02 100644 --- a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t4.yaml +++ b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/4gpus_t4.yaml @@ -1,5 +1,4 @@ defaults: - t4 -cpus_per_task: 40 -gpus_per_node: 4 \ No newline at end of file +gpus_per_node: 4 diff --git a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/base.yaml b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/base.yaml index e4d2106..89a7ed7 100644 --- a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/base.yaml +++ b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/base.yaml @@ -5,8 +5,9 @@ timeout_min: 60 gpus_per_node: 1 tasks_per_node: 1 gres: "gpu:${hydra.launcher.gpus_per_node}" -qos: qos_gpu-dev +qos: ${qos_from_hours:${hours}} cpus_per_gpu: 10 +cpus_per_task: ${cpu_from_gpu:${hydra.launcher.gpus_per_node},${hydra.launcher.cpus_per_gpu}} gpus_per_task: ${hydra.launcher.gpus_per_node} additional_parameters: account: ${oc.env:IDRPROJ}@gpu @@ -14,4 +15,4 @@ additional_parameters: hint: nomultithread time: "${hours}:00:00" setup: - - "#SBATCH -C v100-32g" \ No newline at end of file + - "#SBATCH -C v100-32g" diff --git a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/dev.yaml b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/dev.yaml index fe97084..dbfb96a 100644 --- a/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/dev.yaml +++ b/hydra_plugins/jz_hydra_submitit_launcher/hydra/launcher/dev.yaml @@ -1,5 +1,6 @@ defaults: - base +qos: qos_gpu-dev additional_parameters: time: "2:00:00" diff --git a/hydra_plugins/jz_hydra_submitit_launcher/resolvers.py b/hydra_plugins/jz_hydra_submitit_launcher/resolvers.py index 1cb898c..b470426 100644 --- a/hydra_plugins/jz_hydra_submitit_launcher/resolvers.py +++ 
b/hydra_plugins/jz_hydra_submitit_launcher/resolvers.py
@@ -1,3 +1,37 @@
 from omegaconf import OmegaConf
 
-OmegaConf.register_new_resolver("multiply10", lambda x: x * 10)
+
+def time_to_qos(timeout_hour):
+    """Return the SLURM QoS name for a job duration given in hours.
+
+    Args:
+        timeout_hour: Job duration in hours. It is passed through
+            ``int()``, so integer strings are accepted and floats are
+            truncated.
+
+    Returns:
+        ``'qos_gpu-t4'`` above 20 hours, ``'qos_gpu-t3'`` above 2 hours,
+        and ``'qos_gpu-dev'`` otherwise.
+    """
+    timeout_hour = int(timeout_hour)
+    if timeout_hour > 20:
+        tier = 't4'
+    elif timeout_hour > 2:
+        tier = 't3'
+    else:
+        tier = 'dev'
+    return f'qos_gpu-{tier}'
+
+
+def cpus_from_gpus(gpus, cpus_per_gpu):
+    """Return ``gpus * cpus_per_gpu`` as an integer.
+
+    Both values go through ``int()`` first, matching ``time_to_qos``;
+    without it a string operand would be repeated (``"4" * 10``) instead
+    of multiplied.
+    """
+    return int(gpus) * int(cpus_per_gpu)
+
+
+OmegaConf.register_new_resolver("qos_from_hours", time_to_qos, replace=True)
+OmegaConf.register_new_resolver("cpu_from_gpu", cpus_from_gpus, replace=True)