diff --git a/docker/scripts/startup.sh b/docker/scripts/startup.sh index 5c5a6aa49..527e8306a 100644 --- a/docker/scripts/startup.sh +++ b/docker/scripts/startup.sh @@ -132,6 +132,10 @@ while [ "$1" != "" ]; do shift TEST=$1 ;; + --additional_requirements_path) + shift + ADDITIONAL_REQUIREMENTS_PATH=$1 + ;; *) usage exit 1 @@ -140,6 +144,16 @@ while [ "$1" != "" ]; do shift done + +# Optionally install additional dependencies +if [[ -n ${ADDITIONAL_REQUIREMENTS_PATH+x} ]]; then + echo "Installing additional requirements..." + COMMAND="cd algorithmic-efficiency && pip install -r ${ADDITIONAL_REQUIREMENTS_PATH}" + echo $COMMAND + eval $COMMAND +fi + + if [[ ${TEST} == "true" ]]; then cd algorithmic-efficiency COMMAND="python3 tests/test_traindiffs.py" diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 1c33079d3..e474b6910 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -9,9 +9,11 @@ --tuning_search_space """ +import datetime import json import os import struct +import subprocess import time from absl import app @@ -26,9 +28,11 @@ 'docker_image_url', 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev', 'URL to docker image') -flags.DEFINE_integer('run_percentage', - 100, - 'Percentage of max num steps to run for.') +flags.DEFINE_integer( + 'run_percentage', + 100, + 'Percentage of max num steps to run for. ' + 'Must set the flag enable_step_budget to True for this to take effect.') flags.DEFINE_string('experiment_name', 'my_experiment', 'Name of top sub directory in experiment dir.') @@ -83,10 +87,24 @@ 'If your algorithm has a smaller per step time than our baselines ' 'you may want to increase the number of steps per workload.') flags.DEFINE_string( - 'workload', + 'workloads', None, + 'String representing a comma separated list of workload names. ' 'If not None, only run this workload, else run all workloads in workload_metadata_path.' 
) +flags.DEFINE_string('additional_requirements_path', + None, + 'Path to requirements.txt if any.') +flags.DEFINE_integer( + 'max_steps', + None, + 'Maximum number of steps to run. Must set flag enable_step_budget. ' + 'This flag takes precedence over the run_percentage flag.') +flags.DEFINE_bool( + 'enable_step_budget', + False, + 'Flag that has to be explicitly set to override time budgets to step budget percentage.' +) FLAGS = flags.FLAGS @@ -106,15 +124,40 @@ def container_running(): return True +def kill_containers(): + docker_client = docker.from_env() + containers = docker_client.containers.list() + for container in containers: + container.kill() + + +def gpu_is_active(): + output = subprocess.check_output([ + 'nvidia-smi', + '--query-gpu=utilization.gpu', + '--format=csv,noheader,nounits' + ]) + return any(int(x) > 0 for x in output.decode().splitlines()) + + +def wait_until_container_not_running(sleep_interval=5 * 60): + # check gpu util + # if the gpu has not been utilized for 45 minutes kill the container + gpu_last_active = datetime.datetime.now().timestamp() + while container_running(): + # check if gpus have been inactive > 45 min and if so terminate container + if gpu_is_active(): + gpu_last_active = datetime.datetime.now().timestamp() + if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60: + print("Killing container: GPUs have been inactive > 45 minutes...") + kill_containers() time.sleep(sleep_interval) return def main(_): framework = FLAGS.framework - run_fraction = FLAGS.run_percentage / 100. 
experiment_name = FLAGS.experiment_name docker_image_url = FLAGS.docker_image_url submission_path = FLAGS.submission_path @@ -132,7 +175,13 @@ def main(_): study_end_index = FLAGS.study_end_index else: study_end_index = num_studies - 1 + + additional_requirements_path_flag = '' + if FLAGS.additional_requirements_path: + additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} ' + submission_id = FLAGS.submission_id + rng_seed = FLAGS.seed if not rng_seed: @@ -144,17 +193,22 @@ def main(_): with open(FLAGS.workload_metadata_path) as f: workload_metadata = json.load(f) + # Get list of all possible workloads workloads = [w for w in workload_metadata.keys()] - # Read held-out workloads + # Read heldout workloads if FLAGS.held_out_workloads_config_path: held_out_workloads = read_held_out_workloads( FLAGS.held_out_workloads_config_path) workloads = workloads + held_out_workloads - # Filter for single workload - if FLAGS.workload and (FLAGS.workload in workloads): - workloads = [FLAGS.workload] + # Filter workloads if explicit workloads specified + if FLAGS.workloads is not None: + workloads = list( + filter(lambda x: x in FLAGS.workloads.split(','), workloads)) + if len(workloads) != len(FLAGS.workloads.split(',')): + unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads) + raise ValueError(f'Invalid workload name {unmatched_workloads}') rng_subkeys = prng.split(rng_key, num_studies) @@ -174,14 +228,22 @@ def main(_): "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches print('=' * 100) dataset = workload_metadata[base_workload_name]['dataset'] - max_steps = int(workload_metadata[base_workload_name]['max_steps'] * - run_fraction) + max_steps_flag = '' + if FLAGS.enable_step_budget: + run_fraction = FLAGS.run_percentage / 100. 
+      if FLAGS.max_steps is None: + max_steps = int(workload_metadata[base_workload_name]['max_steps'] * + run_fraction) + else: + max_steps = FLAGS.max_steps + max_steps_flag = f'-m {max_steps}' + mount_repo_flag = '' if FLAGS.local: - mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' - command = ('docker run -t -d -v $HOME/data/:/data/ ' - '-v $HOME/experiment_runs/:/experiment_runs ' - '-v $HOME/experiment_runs/logs:/logs ' + mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' + command = ('docker run -t -d -v $HOME/data/:/data/ ' + '-v $HOME/experiment_runs/:/experiment_runs ' + '-v $HOME/experiment_runs/logs:/logs ' f'{mount_repo_flag}' '--gpus all --ipc=host ' f'{docker_image_url} ' @@ -190,9 +252,10 @@ def main(_): f'-s {submission_path} ' f'-w {workload} ' f'-e {study_dir} ' - f'-m {max_steps} ' + f'{max_steps_flag} ' f'--num_tuning_trials {num_tuning_trials} ' f'--rng_seed {run_seed} ' + f'{additional_requirements_path_flag}' '-c false ' '-o true ' '-i true ') @@ -235,4 +298,4 @@ def main(_): if __name__ == '__main__': flags.mark_flag_as_required('workload_metadata_path') - app.run(main) \ No newline at end of file + app.run(main) diff --git a/setup.cfg b/setup.cfg index 321020ad9..eb570dafb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -121,6 +121,8 @@ jax_core_deps = chex==0.1.7 ml_dtypes==0.2.0 protobuf==4.25.3 + scipy==1.11.4 + # JAX CPU jax_cpu = diff --git a/utils/run_workloads.py b/utils/run_workloads.py deleted file mode 100644 index 39f6a7b6f..000000000 --- a/utils/run_workloads.py +++ /dev/null @@ -1,200 +0,0 @@ -""" -Example Usage: -python run_workloads.py \ ---workload_config_path workload_config.json \ ---framework jax \ ---experiment_name my_first_experiment \ ---docker_image_url us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev \ ---run_percentage 10 \ ---workload_config_path workload_config.json \ ---dry_run -""" - 
-import json -import os -import struct -import time - -from absl import app -from absl import flags -from absl import logging - -import docker - -flags.DEFINE_string( - 'docker_image_url', - 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev', - 'URL to docker image') -flags.DEFINE_integer('run_percentage', - 100, - 'Percentage of max num steps to run for.') -flags.DEFINE_string('experiment_name', - 'my_experiment', - 'Name of top sub directory in experiment dir.') -flags.DEFINE_boolean('rsync_data', - True, - 'Whether or not to transfer the data from GCP w rsync.') -flags.DEFINE_boolean('local', False, 'Mount local algorithmic-efficiency repo.') -flags.DEFINE_string('framework', 'jax', 'Can be either PyTorch or JAX.') -flags.DEFINE_boolean( - 'dry_run', - False, - 'Whether or not to actually run the docker containers. ' - 'If False, simply print the docker run commands. ') -flags.DEFINE_integer('num_studies', 1, 'Number of studies to run') -flags.DEFINE_integer('study_start_index', None, 'Start index for studies.') -flags.DEFINE_integer('study_end_index', None, 'End index for studies.') -flags.DEFINE_integer('num_tuning_trials', 1, 'Number of tuning trials.') -flags.DEFINE_integer('hparam_start_index', - None, - 'Start index for tuning trials.') -flags.DEFINE_integer('hparam_end_index', None, 'End index for tuning trials.') -flags.DEFINE_integer('seed', None, 'Random seed for evaluating a submission.') -flags.DEFINE_integer('submission_id', - 0, - 'Submission ID to generate study and hparam seeds.') -flags.DEFINE_string( - 'workload_config_path', - 'workload_confing.json', - 'Path to config containing dataset and maximum number of steps per workload.' - 'The default values of these are set to the full budgets as determined ' - 'via the target-setting procedure. ' - 'Note that training will be interrupted at either the set maximum number ' - 'of steps or the fixed workload maximum run time, whichever comes first. 
' - 'If your algorithm has a smaller per step time than our baselines ' - 'you may want to increase the number of steps per workload.') - -FLAGS = flags.FLAGS - - -def read_workloads(filename): - with open(filename, "r") as f: - held_out_workloads = json.load(f) - return held_out_workloads - - -def container_running(): - docker_client = docker.from_env() - containers = docker_client.containers.list() - if len(containers) == 0: - return False - else: - return True - - -def wait_until_container_not_running(sleep_interval=5 * 60): - while container_running(): - time.sleep(sleep_interval) - return - - -def main(_): - # What Docker image to run the container with - docker_image_url = FLAGS.docker_image_url - - # Framework - framework = FLAGS.framework - - # - run_fraction = FLAGS.run_percentage / 100. - experiment_name = FLAGS.experiment_name - - # Get study and trial interval arguments - num_studies = FLAGS.num_studies - study_start_index = FLAGS.study_start_index if FLAGS.study_start_index else 0 - study_end_index = FLAGS.study_end_index if FLAGS.study_end_index else num_studies - 1 - - # Get trial arguments - num_tuning_trials = FLAGS.num_tuning_trials - hparam_start_index_flag = '' - hparam_end_index_flag = '' - if FLAGS.hparam_start_index: - hparam_start_index_flag = f'--hparam_start_index {FLAGS.hparam_start_index} ' - if FLAGS.hparam_end_index: - hparam_end_index_flag = f'--hparam_end_index {FLAGS.hparam_end_index} ' - - # Generate rng keys from submission_id and seed - submission_id = FLAGS.submission_id - rng_seed = FLAGS.seed - - if not rng_seed: - rng_seed = struct.unpack('I', os.urandom(4))[0] - - logging.info('Using RNG seed %d', rng_seed) - - # Read workload specifications to run - with open(FLAGS.workload_config_path) as f: - workload_config = json.load(f) - workloads = [w for w in workload_config.keys()] - - for study_index in range(study_start_index, study_end_index + 1): - print('-' * 100) - print('*' * 40, f'Starting study {study_index + 
1}/{num_studies}', '*' * 40) - print('-' * 100) - study_dir = os.path.join(experiment_name, f'study_{study_index}') - - for workload in workloads: - # For each runnable workload check if there are any containers running - wait_until_container_not_running() - - # Clear caches - os.system("sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") - print('=' * 100) - - # Get workload dataset, max step, algorithm path and tuning search space - dataset = workload_config[workload]['dataset'] - max_steps = int(workload_config[workload]['max_steps'] * run_fraction) - submission_path = workload_config[workload]['submission_path'] - tuning_search_space = workload_config[workload]['tuning_search_space'] - - # Optionally, define flag to mount local algorithmic-efficiency repo - mount_repo_flag = '' - if FLAGS.local: - mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency ' - - command = ('docker run -t -d -v $HOME/data/:/data/ ' - '-v $HOME/experiment_runs/:/experiment_runs ' - '-v $HOME/experiment_runs/logs:/logs ' - f'{mount_repo_flag}' - '--gpus all --ipc=host ' - f'{docker_image_url} ' - f'-d {dataset} ' - f'-f {framework} ' - f'-s {submission_path} ' - f'-w {workload} ' - f'-t {tuning_search_space} ' - f'-e {study_dir} ' - f'-m {max_steps} ' - f'--num_tuning_trials {num_tuning_trials} ' - f'{hparam_start_index_flag} ' - f'{hparam_end_index_flag} ' - f'--rng_seed {rng_seed} ' - '-c false ' - '-o true ' - '-i true ') - if not FLAGS.dry_run: - print('Running docker container command') - print('Container ID: ') - return_code = os.system(command) - else: - return_code = 0 - if return_code == 0: - print( - f'SUCCESS: container for {framework} {workload} launched successfully' - ) - print(f'Command: {command}') - print(f'Results will be logged to {experiment_name}') - else: - print( - f'Failed: container for {framework} {workload} failed with exit code {return_code}.' 
- ) - print(f'Command: {command}') - wait_until_container_not_running() - os.system( - "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches - - print('=' * 100) - - -if __name__ == '__main__': - app.run(main) diff --git a/utils/target_setting_workload_config.json b/utils/target_setting_workload_config.json deleted file mode 100644 index a8c050422..000000000 --- a/utils/target_setting_workload_config.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "imagenet_resnet": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json" - }, - "imagenet_resnet_gelu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_momentum.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_gelu/tuning_search_space.json" - }, - "imagenet_resnet_large_bn_init": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_momentum.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_large_bn_init/tuning_search_space.json" - }, - "imagenet_resnet_silu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_resnet_silu/tuning_search_space.json" - }, - "imagenet_vit": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit/tuning_search_space.json" - }, - "imagenet_vit_glu": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": 
"reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_glu/tuning_search_space.json" - }, - "imagenet_vit_map": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_map/tuning_search_space.json" - }, - "imagenet_vit_post_ln": { - "max_steps": 186666, - "dataset": "imagenet", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/imagenet_vit_post_ln/tuning_search_space.json" - }, - "fastmri": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nesterov.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri/tuning_search_space.json" - }, - "fastmri_layernorm": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_layernorm/tuning_search_space.json" - }, - "fastmri_model_size": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_model_size/tuning_search_space.json" - }, - "fastmri_tanh": { - "max_steps": 36189, - "dataset": "fastmri", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/fastmri_tanh/tuning_search_space.json" - }, - "ogbg": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nesterov.py", - 
"tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json" - }, - "ogbg_gelu": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_gelu/tuning_search_space.json" - }, - "ogbg_model_size": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_model_size/tuning_search_space.json" - }, - "ogbg_silu": { - "max_steps": 80000, - "dataset": "ogbg", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/ogbg_silu/tuning_search_space.json" - }, - "wmt": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt/tuning_search_space.json" - }, - "wmt_attention_temp": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_attention_temp/tuning_search_space.json" - }, - "wmt_glu_tanh": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_glu_tanh/tuning_search_space.json" - }, - "wmt_post_ln": { - "max_steps": 133333, - "dataset": "wmt", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/wmt_post_ln/tuning_search_space.json" - }, - "librispeech_deepspeech": { - "max_steps": 48000, 
- "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech/tuning_search_space.json" - }, - "librispeech_deepspeech_no_resnet": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_no_resnet/tuning_search_space.json" - }, - "librispeech_deepspeech_norm_and_spec_aug": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_norm_and_spec_aug/tuning_search_space.json" - }, - "librispeech_deepspeech_tanh": { - "max_steps": 48000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_deepspeech_tanh/tuning_search_space.json" - }, - "criteo1tb": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb/tuning_search_space.json" - }, - "criteo1tb_embed_init": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_embed_init/tuning_search_space.json" - }, - "criteo1tb_layernorm": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": 
"reference_algorithms/target_setting_algorithms/criteo1tb_layernorm/tuning_search_space.json" - }, - "criteo1tb_resnet": { - "max_steps": 10666, - "dataset": "criteo1tb", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/criteo1tb_resnet/tuning_search_space.json" - }, - "librispeech_conformer": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer/tuning_search_space.json" - }, - "librispeech_conformer_attention_temperature": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_adamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_attention_temperature/tuning_search_space.json" - }, - "librispeech_conformer_gelu": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_gelu/tuning_search_space.json" - }, - "librispeech_conformer_layernorm": { - "max_steps": 80000, - "dataset": "librispeech", - "submission_path": "reference_algorithms/target_setting_algorithms/jax_nadamw.py", - "tuning_search_space": "reference_algorithms/target_setting_algorithms/librispeech_conformer_layernorm/tuning_search_space.json" - } - -} \ No newline at end of file