diff --git a/.gitignore b/.gitignore
index d398bc8c4..95d9fa6c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,6 @@ algorithmic_efficiency/workloads/librispeech_conformer/work_dir
 *.vocab
 wandb/
 *.txt
+
+!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
+!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
\ No newline at end of file
diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md
index 992e1e2a6..ab953384e 100644
--- a/DOCUMENTATION.md
+++ b/DOCUMENTATION.md
@@ -484,10 +484,12 @@ The integral is normalized by the total integration area, with higher benchmark
 For the benchmark score, we compute and integrate the performance profiles using the training times of only the fixed workloads. But we use the submission's performance on the held-out workloads to penalize submissions. Specifically, if a submission is unable to train a held-out workload, we score the submission on the corresponding fixed workload as if that submission did not reach the target. In other words, for a submission to receive a finite training time on a fixed workload, it needs to:
 
-- Reach the validation target on the fixed workload within the maximum runtime.
-- Reach the validation target fixed workload within 4x of the fastest submission.
-- Reach the validation target on the held-out workload (corresponding to the fixed workload) within the maximum runtime.
-- Reach the validation target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms.
+1. Reach the validation target on the fixed workload within the maximum runtime.
+2. Reach the validation target on the fixed workload within 4x of the fastest submission.
+3. Reach the validation target on the held-out workload (corresponding to the fixed workload) within the maximum runtime.
+4. Reach the validation target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms.
+
+For fixed workloads without a corresponding held-out workload (e.g. in this iteration this is the case for one ImageNet and one LibriSpeech workload, since we only sample one held-out workload *per dataset*), requirements 3 and 4 are automatically satisfied.
 
 Only if all four requirements are met, does the submission get a finite score. Otherwise, a submission will receive a training time of infinity. Note that the tuning process works the same for held-out workloads as for the fixed workloads, i.e. in the external tuning ruleset there are multiple tuning trials and only the fastest trial per study is relevant for scoring.
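As a reading aid for the DOCUMENTATION.md change above, here is a minimal Python sketch of how the four requirements translate into the training time credited to a fixed workload. All names are hypothetical and this is not part of the benchmark code; a run that never reaches the validation target is encoded as `math.inf`:

```python
import math


def credited_time(fixed_time, heldout_time, max_runtime,
                  fastest_fixed, fastest_heldout, has_heldout=True):
  """Hypothetical helper illustrating requirements 1-4 above (times in seconds)."""
  requirements = [
      fixed_time <= max_runtime,        # 1. fixed workload within the maximum runtime
      fixed_time <= 4 * fastest_fixed,  # 2. fixed workload within 4x of the fastest submission
  ]
  if has_heldout:  # requirements 3 and 4 only apply if a held-out workload was sampled
    requirements += [
        heldout_time <= max_runtime,           # 3. held-out workload within the maximum runtime
        heldout_time <= 4 * fastest_heldout,   # 4. held-out workload within 4x of the fastest submission
    ]
  # Only if all requirements are met does the submission get a finite time.
  return fixed_time if all(requirements) else math.inf
```

Note that, as stated above, the benchmark score is still integrated over the fixed-workload times only; the held-out workloads enter solely through this gating.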
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index 29521fead..14f64d5e8 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -17,6 +17,7 @@
 - [Run your Submission in a Docker Container](#run-your-submission-in-a-docker-container)
 - [Docker Tips](#docker-tips)
 - [Score your Submission](#score-your-submission)
+ - [Running workloads](#running-workloads)
 
 ## Set Up and Installation
 
@@ -336,18 +337,17 @@ docker exec -it /bin/bash
 ```
 
 ## Score your Submission
-To score your submission we will score over all workloads, held-out workloads and studies as described in the rules.
-We will sample 1 held-out workload per dataset for a total of 6 held-out workloads and will use the sampled
-held-out workloads in the scoring criteria for the matching base workloads.
-In other words, the total number of runs expected for official scoring is:
-- for external ruleset (8 (workloads) + 6 (held-out workloads)) x 5 (studies) x 5 (trials)
-- for internal ruleset (8 (workloads) + 6 (held-out workloads)) x 5 (studies)
+To score your submission we will score over all fixed workloads, held-out workloads and studies as described in the rules.
+We will sample 1 held-out workload per dataset for a total of 6 held-out workloads and will use the sampled held-out workloads in the scoring criteria for the matching fixed base workloads.
+In other words, the total number of runs expected for official scoring is:
+- for external tuning ruleset: **350** = (8 (fixed workloads) + 6 (held-out workloads)) x 5 (studies) x 5 (trials)
+- for self-tuning ruleset: **70** = (8 (fixed workloads) + 6 (held-out workloads)) x 5 (studies)
 
 ### Running workloads
-To run workloads for scoring you may specify a "virtual" list of held-out workloads. It is important
-to note that the official set of held-out workloads will be sampled by the competition organizers during scoring time.
+
+To run workloads for (mock) scoring, you may specify a "virtual" list of held-out workloads. It is important to note that the official set of held-out workloads will be sampled by the competition organizers during scoring time.
 An example config for held-out workloads is stored in `scoring/held_workloads_example.json`.
 To generate a new sample of held out workloads run:
 
@@ -370,12 +370,10 @@ python scoring/run_workloads.py \
 --seed
 ```
 
-Note that to run the above script you will need the minimum jax_cpu and pytorch_cpu installations of the algorithmic-efficiency package.
+Note that to run the above script you will need at least the `jax_cpu` and `pytorch_cpu` installations of the `algorithmic-efficiency` package.
 
-During submission development, it might be useful to do faster, approximate scoring (e.g. without 5 different s
-tudies or when some trials are missing) so the scoring scripts allow some flexibility. To simulate official scoring,
-pass the `--strict=True` flag in score_submission.py. To get the raw scores and performance profiles of group of
-submissions or single submission:
+During submission development, it might be useful to do faster, approximate scoring (e.g. without `5` different studies or when some trials are missing), so the scoring scripts allow some flexibility.
+To simulate official scoring, pass the `--strict=True` flag in `score_submission.py`.
+To get the raw scores and performance profiles of a group of submissions or a single submission:
 
 ```bash
 python score_submissions.py --submission_directory --output_dir --compute_performance_profiles
diff --git a/datasets/README.md b/datasets/README.md
index c68a5cc6b..fc85d05c6 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -116,7 +116,7 @@ $DATA_DIR
 │ └── ogbg_molpcba-validation.tfrecord-00000-of-00001
 ```
 
-In total, it should contain 13 files (via `find -type f | wc -l`) for a total of 777 MB (via `du -sch ogbg/`).
+In total, it should contain 13 files (via `find -type f | wc -l`) for a total of 830 MB (via `du -sch --apparent-size ogbg/`).
 
 ### WMT
 
@@ -184,7 +184,7 @@ $DATA_DIR
 └── wmt_sentencepiece_model
 ```
 
-In total, it should contain 43 files (via `find -type f | wc -l`) for a total of 3.3 GB (via `du -sch wmt/`).
+In total, it should contain 43 files (via `find -type f | wc -l`) for a total of 3.3 GB (via `du -sch --apparent-size wmt/`).
 
 ### FastMRI
 
@@ -222,7 +222,7 @@ $DATA_DIR
 │ └── file1002570.h5
 ```
 
-In total, it should contain 1280 files (via `find -type f | wc -l`) for a total of 112 GB (via `du -sch fastmri/`).
+In total, it should contain 1280 files (via `find -type f | wc -l`) for a total of 113 GB (via `du -sch --apparent-size fastmri/`).
 
 ### ImageNet
 
@@ -277,11 +277,11 @@ $DATA_DIR
 │ ├── [...]
 ```
 
-In total, it should contain 1,281,167 `train` files and 50,000 `val` (via `find -type f | wc -l`) for a total of 177 GB and 7.8 GB, respectively (via `du -sch train/` and `du -sch val/`).
+In total, it should contain 1,281,167 `train` files and 50,000 `val` files (via `find -type f | wc -l`) for a total of 137 GB and 6.3 GB, respectively (via `du -sch --apparent-size train/` and `du -sch --apparent-size val/`).
 
-The final directory structure should look like this for ImageNet2012 (JAX):
+The final directory structure should look like this for ImageNet2012 (JAX), including ImageNet v2:
 
 ```bash
 $DATA_DIR
@@ -289,7 +289,7 @@ $DATA_DIR
 │ ├── jax
 │ │ ├── downloads
 │ │ │ ├── extracted
-│ │ │ └── manual_
+│ │ │ └── manual
 │ │ ├── imagenet2012
 │ │ │ └── 5.1.0
 │ │ │ ├── dataset_info.json
@@ -307,7 +307,7 @@ $DATA_DIR
 │ │ ├── [...]
 ```
 
-In total, it should contain 1,111 files (via `find -type f | wc -l`) for a total of 145 GB (via `du -sch imagenet/jax`).
+In total, it should contain 1,111 files (via `find -type f | wc -l`) for a total of 145 GB (via `du -sch --apparent-size imagenet/jax`).
@@ -339,7 +339,7 @@ $DATA_DIR
 │ └── label.labels.txt
 ```
 
-In total, it should contain 20 files (via `find -type f | wc -l`) for a total of 1.2 GB (via `du -sch imagenet_v2/`).
+In total, it should contain 20 files (via `find -type f | wc -l`) for a total of 1.2 GB (via `du -sch --apparent-size imagenet_v2/`).
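Since the size checks in datasets/README.md are switched to `du --apparent-size` throughout this diff, a quick way to cross-check the reported numbers without `du` is to sum the file sizes directly. A minimal sketch (the directory path is a placeholder; `du` may still round differently depending on its block-size settings):

```python
import os


def apparent_size_gb(root):
  """Sum of file sizes under `root`, roughly what `du -s --apparent-size` measures."""
  total = 0
  for dirpath, _, filenames in os.walk(root):
    for name in filenames:
      total += os.path.getsize(os.path.join(dirpath, name))
  return total / 1e9


# Example usage (placeholder path):
# print(apparent_size_gb('/data/imagenet_v2'))
```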
 ### Criteo1TB
 
@@ -366,7 +366,7 @@ $DATA_DIR
 │ ├── [...]
 ```
 
-In total, it should contain 885 files (via `find -type f | wc -l`) for a total of 1.1 TB (via `du -sch criteo1tb/`).
+In total, it should contain 885 files (via `find -type f | wc -l`) for a total of 1.1 TB (via `du -sch --apparent-size criteo1tb/`).
 
 ### LibriSpeech
 
@@ -423,7 +423,7 @@ $DATA_DIR
 │ │ ├── [...]
 ```
 
-In total, it should contain 543,323 files (via `find -type f | wc -l`) for a total of 388 GB (via `du -sch librispeech/`).
+In total, it should contain 543,323 files (via `find -type f | wc -l`) for a total of 387 GB (via `du -sch --apparent-size librispeech/`).
 
 #### Training SPM Tokenizer
diff --git a/scoring/generate_held_out_workloads.py b/scoring/generate_held_out_workloads.py
index 474c4e7d7..a9d551938 100644
--- a/scoring/generate_held_out_workloads.py
+++ b/scoring/generate_held_out_workloads.py
@@ -13,7 +13,6 @@
 flags.DEFINE_string('output_filename',
                     'held_out_workloads.json',
                     'Path to file to record sampled held_out workloads.')
-flags.DEFINE_string('framework', 'jax', 'JAX or PyTorch')
 FLAGS = flags.FLAGS
 
 HELD_OUT_WORKLOADS = {
@@ -55,7 +54,7 @@ def main(_):
   rng = np.random.default_rng(rng_seed)
 
   sampled_held_out_workloads = []
-  for k, v in HELD_OUT_WORKLOADS.items():
+  for _, v in HELD_OUT_WORKLOADS.items():
     sampled_index = rng.integers(len(v))
     sampled_held_out_workloads.append(v[sampled_index])
 
diff --git a/scoring/test_data/experiment_dir/mnist_jax/trial_0/eval_measurements.csv b/scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
similarity index 100%
rename from scoring/test_data/experiment_dir/mnist_jax/trial_0/eval_measurements.csv
rename to scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
diff --git a/scoring/test_data/experiment_dir-2023-10-11-16-58-40/mnist_jax/trial_1/eval_measurements.csv b/scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
similarity index 100%
rename from scoring/test_data/experiment_dir-2023-10-11-16-58-40/mnist_jax/trial_1/eval_measurements.csv
rename to scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv
diff --git a/scoring/test_scoring_utils.py b/scoring/test_scoring_utils.py
index e6c1d7c63..d5d62563b 100644
--- a/scoring/test_scoring_utils.py
+++ b/scoring/test_scoring_utils.py
@@ -1,8 +1,7 @@
 from absl.testing import absltest
 
+from scoring import performance_profile
 from scoring import scoring_utils
-from scoring.scoring import NUM_TRIALS
-from scoring.scoring import NUM_WORKLOADS
 
 TEST_LOGFILE = 'scoring/test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log'
 TEST_DIR = 'scoring/test_data/experiment_dir'
@@ -31,7 +30,7 @@ def test_get_experiment_df(self):
 
   def test_scores(self):
     df = scoring_utils.get_experiment_df(TEST_DIR)
-    performance_profile_df = scoring.compute_performance_profiles(
+    _ = performance_profile.compute_performance_profiles(
        {'my.submission': df},
        time_col='score',
        min_tau=1.0,
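For reference, the sampling logic touched in `scoring/generate_held_out_workloads.py` above boils down to drawing one held-out variant per dataset and writing the result to a JSON file. A self-contained sketch mirroring that loop (the registry entries and the seed below are placeholders, not the real workload names):

```python
import json

import numpy as np

# Placeholder registry: the real HELD_OUT_WORKLOADS dict in
# scoring/generate_held_out_workloads.py maps each of the six datasets
# to its list of held-out workload variants.
HELD_OUT_WORKLOADS = {
    'imagenet': ['imagenet_variant_a', 'imagenet_variant_b'],
    'librispeech': ['librispeech_variant_a', 'librispeech_variant_b'],
}

rng_seed = 0  # arbitrary seed for this example
rng = np.random.default_rng(rng_seed)

sampled_held_out_workloads = []
for _, v in HELD_OUT_WORKLOADS.items():
  sampled_index = rng.integers(len(v))  # draw one variant per dataset
  sampled_held_out_workloads.append(v[sampled_index])

with open('held_out_workloads.json', 'w') as f:
  json.dump(sampled_held_out_workloads, f)
```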