Merge pull request #800 from mlcommons/dev
Dev -> main
priyakasimbeg authored Oct 22, 2024
2 parents ce1003e + 5ce9e5a commit c2aa9e1
Showing 5 changed files with 96 additions and 412 deletions.
14 changes: 14 additions & 0 deletions docker/scripts/startup.sh
@@ -132,6 +132,10 @@ while [ "$1" != "" ]; do
shift
TEST=$1
;;
--additional_requirements_path)
shift
ADDITIONAL_REQUIREMENTS_PATH=$1
;;
*)
usage
exit 1
@@ -140,6 +144,16 @@ while [ "$1" != "" ]; do
shift
done


# Optionally install additional dependencies
if [[ -n ${ADDITIONAL_REQUIREMENTS_PATH+x} ]]; then
echo "Installing addtional requirements..."
COMMAND="cd algorithmic-efficiency && pip install -r ${ADDITIONAL_REQUIREMENTS_PATH}"
echo $COMMAND
eval $COMMAND
fi


if [[ ${TEST} == "true" ]]; then
cd algorithmic-efficiency
COMMAND="python3 tests/test_traindiffs.py"
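For illustration, a hedged sketch of how the new flag might be passed when the container entrypoint is invoked directly; the dataset, workload, and paths below are placeholders, and the short flags (-d, -f, -s, -w) are the same ones run_workloads.py already passes to the container:

bash docker/scripts/startup.sh \
  -d imagenet -f jax \
  -s <path_to_submission_module> \
  -w imagenet_resnet \
  --additional_requirements_path <path_to_requirements_txt>

When the flag is set, the script pip-installs the listed requirements from inside the algorithmic-efficiency checkout before the workload run starts; otherwise the new block is skipped entirely.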
97 changes: 80 additions & 17 deletions scoring/run_workloads.py
@@ -9,9 +9,11 @@
--tuning_search_space <path_to_tuning_search_space_json>
"""

import datetime
import json
import os
import struct
import subprocess
import time

from absl import app
@@ -26,9 +28,11 @@
'docker_image_url',
'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev',
'URL to docker image')
flags.DEFINE_integer('run_percentage',
100,
'Percentage of max num steps to run for.')
flags.DEFINE_integer(
'run_percentage',
100,
'Percentage of max num steps to run for. '
'Must set the flag enable_step_budget to True for this to take effect.')
flags.DEFINE_string('experiment_name',
'my_experiment',
'Name of top sub directory in experiment dir.')
@@ -83,10 +87,24 @@
'If your algorithm has a smaller per step time than our baselines '
'you may want to increase the number of steps per workload.')
flags.DEFINE_string(
'workload',
'workloads',
None,
'String representing a comma-separated list of workload names. '
'If not None, only run these workloads; otherwise run all workloads in workload_metadata_path.'
)
flags.DEFINE_string('additional_requirements_path',
None,
'Path to requirements.txt if any.')
flags.DEFINE_integer(
'max_steps',
None,
'Maximum number of steps to run. Must set the enable_step_budget flag. '
'This flag takes precedence over the run_percentage flag.')
flags.DEFINE_bool(
'enable_step_budget',
False,
'Flag that must be explicitly set to override the time budget with a step budget (see run_percentage and max_steps).'
)
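# Illustrative invocation of this script with the new flags; the image URL,
# paths, and workload names below are placeholders, not defaults:
#   python3 scoring/run_workloads.py \
#     --framework jax \
#     --experiment_name my_experiment \
#     --docker_image_url <docker_image_url> \
#     --submission_path <path_to_submission_module> \
#     --tuning_search_space <path_to_tuning_search_space_json> \
#     --workload_metadata_path <path_to_workload_metadata_json> \
#     --workloads imagenet_resnet,ogbg \
#     --additional_requirements_path <path_to_requirements_txt> \
#     --enable_step_budget --run_percentage 50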

FLAGS = flags.FLAGS

@@ -106,15 +124,40 @@ def container_running():
return True


def kill_containers():
docker_client = docker.from_env()
containers = docker_client.containers.list()
for container in containers:
container.kill()


def gpu_is_active():
output = subprocess.check_output([
'nvidia-smi',
'--query-gpu=utilization.gpu',
'--format=csv,noheader,nounits'
])
return any(int(x) > 0 for x in output.decode().splitlines())


def wait_until_container_not_running(sleep_interval=5 * 60):
# Check GPU utilization; if the GPUs have been inactive for more than
# 45 minutes, kill the container.
gpu_last_active = datetime.datetime.now().timestamp()

while container_running():
# check if gpus have been inactive > 45 min and if so terminate container
if gpu_is_active():
gpu_last_active = datetime.datetime.now().timestamp()
if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60:
print('Killing container: GPUs have been inactive > 45 minutes...')
kill_containers()
time.sleep(sleep_interval)
return
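# Note: gpu_is_active() shells out to nvidia-smi; the query
#   nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits
# prints one utilization percentage per GPU, one integer per line, so any
# value above 0 counts as activity and refreshes gpu_last_active. Once every
# GPU has reported 0 for more than 45 minutes, the running containers are
# killed and the loop keeps polling until the container is gone.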


def main(_):
framework = FLAGS.framework
run_fraction = FLAGS.run_percentage / 100.
experiment_name = FLAGS.experiment_name
docker_image_url = FLAGS.docker_image_url
submission_path = FLAGS.submission_path
@@ -132,7 +175,13 @@ def main(_):
study_end_index = FLAGS.study_end_index
else:
study_end_index = num_studies - 1

additional_requirements_path_flag = ''
if FLAGS.additional_requirements_path:
additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} '

submission_id = FLAGS.submission_id

rng_seed = FLAGS.seed

if not rng_seed:
@@ -144,17 +193,22 @@
with open(FLAGS.workload_metadata_path) as f:
workload_metadata = json.load(f)

# Get list of all possible workloads
workloads = [w for w in workload_metadata.keys()]

# Read held-out workloads
# Read heldout workloads
if FLAGS.held_out_workloads_config_path:
held_out_workloads = read_held_out_workloads(
FLAGS.held_out_workloads_config_path)
workloads = workloads + held_out_workloads

# Filter for single workload
if FLAGS.workload and (FLAGS.workload in workloads):
workloads = [FLAGS.workload]
# Filter workloads if explicit workloads specified
if FLAGS.workloads is not None:
workloads = list(
filter(lambda x: x in FLAGS.workloads.split(','), workloads))
if len(workloads) != len(FLAGS.workloads.split(',')):
unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
raise ValueError(f'Invalid workload name {unmatched_workloads}')

rng_subkeys = prng.split(rng_key, num_studies)

@@ -174,14 +228,22 @@
"sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches
print('=' * 100)
dataset = workload_metadata[base_workload_name]['dataset']
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
run_fraction)
max_steps_flag = ''
if FLAGS.enable_step_budget:
run_fraction = FLAGS.run_percentage / 100.
if FLAGS.max_steps is None:
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
run_fraction)
else:
max_steps = FLAGS.max_steps
max_steps_flag = f'-m {max_steps}'
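# Worked example (illustrative numbers): with --enable_step_budget
# --run_percentage 50 and a workload whose metadata lists max_steps = 100000,
# max_steps_flag becomes '-m 50000'; if --max_steps is given, that value is
# used verbatim instead. Without --enable_step_budget no step limit is passed
# and the container falls back to the workload's time budget.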

mount_repo_flag = ''
if FLAGS.local:
mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency '
command = ('docker run -t -d -v $HOME/data/:/data/ '
'-v $HOME/experiment_runs/:/experiment_runs '
'-v $HOME/experiment_runs/logs:/logs '
mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ '
'-v /home/kasimbeg/experiment_runs/:/experiment_runs '
'-v /home/kasimbeg/experiment_runs/logs:/logs '
f'{mount_repo_flag}'
'--gpus all --ipc=host '
f'{docker_image_url} '
@@ -190,9 +252,10 @@ def main(_):
f'-s {submission_path} '
f'-w {workload} '
f'-e {study_dir} '
f'-m {max_steps} '
f'{max_steps_flag} '
f'--num_tuning_trials {num_tuning_trials} '
f'--rng_seed {run_seed} '
f'{additional_requirements_path_flag}'
'-c false '
'-o true '
'-i true ')
@@ -235,4 +298,4 @@ def main(_):

if __name__ == '__main__':
flags.mark_flag_as_required('workload_metadata_path')
app.run(main)
app.run(main)
2 changes: 2 additions & 0 deletions setup.cfg
@@ -121,6 +121,8 @@ jax_core_deps =
chex==0.1.7
ml_dtypes==0.2.0
protobuf==4.25.3
scipy==1.11.4


# JAX CPU
jax_cpu =
(2 more changed files not shown)
