From e7abddb813974ef4d69e0d2491b856a5be1a53b0 Mon Sep 17 00:00:00 2001 From: Chandramouli Shama Sastry Date: Mon, 9 Oct 2023 19:14:39 +0000 Subject: [PATCH 01/27] collect results from workdirs run at different timestamps --- scoring/scoring_utils.py | 74 ++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 37db73dd4..90a4e8867 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -1,3 +1,4 @@ +import glob import json import os import re @@ -9,6 +10,7 @@ METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' TRIAL_DIR_REGEX = 'trial_(\d+)' MEASUREMENTS_FILENAME = 'eval_measurements.csv' +TIMESTAMP = r"-\d{4}(-\d{2}){5}" #### File IO helper functions ### @@ -137,7 +139,9 @@ def get_experiment_df(experiment_dir): scoring.compute_performance_profiles. Args: experiment_dir: path to experiment directory containing - results for workloads. + results for workloads. Measurements from experiments + sharing the same prefix but different timestamps are + collected together. The directory structure is assumed to be: + experiment_dir + @@ -148,38 +152,42 @@ def get_experiment_df(experiment_dir): df: DataFrame where indices are trials, columns are metric names and values are lists. 
e.g - +----+-----------+---------+--------------------+--------------------+ - | | workload | trial | validation/accuracy| score | - |----+-----------+---------+--------------------+--------------------| - | 0 | mnist_jax | trial_1 | [0.0911, 0.0949] | [10.6396, 10.6464] | - +----+-----------+---------+--------------------+--------------------+ + +----+-----------+-----------------------------+--------------------+--------------------+ + | | workload | trial | validation/accuracy| score | + |----+-----------+-----------------------------+--------------------+--------------------| + | 0 | mnist_jax | (trial_1, ) | [0.0911, 0.0949] | [10.6396, 10.6464] | + +----+-----------+-----------------------------+--------------------+--------------------+ """ df = pd.DataFrame() - workload_dirs = os.listdir(experiment_dir) - for workload in workload_dirs: - data = { - 'workload': workload, - } - trial_dirs = [ - t for t in os.listdir(os.path.join(experiment_dir, workload)) - if re.match(TRIAL_DIR_REGEX, t) - ] - for trial in trial_dirs: - eval_measurements_filepath = os.path.join( - experiment_dir, - workload, - trial, - MEASUREMENTS_FILENAME, - ) - try: - trial_df = pd.read_csv(eval_measurements_filepath) - except FileNotFoundError as e: - logging.info(f'Could not read {eval_measurements_filepath}') - continue - data['trial'] = trial - for column in trial_df.columns: - values = trial_df[column].to_numpy() - data[column] = values - trial_df = pd.DataFrame([data]) - df = pd.concat([df, trial_df], ignore_index=True) + paths = filter( + lambda x: re.match(experiment_dir + TIMESTAMP, x) or x == experiment_dir, + glob.glob(f"{experiment_dir}*")) + for experiment_dir in list(paths): + workload_dirs = os.listdir(experiment_dir) + for workload in workload_dirs: + data = { + 'workload': workload, + } + trial_dirs = [ + t for t in os.listdir(os.path.join(experiment_dir, workload)) + if re.match(TRIAL_DIR_REGEX, t) + ] + for trial in trial_dirs: + eval_measurements_filepath = 
os.path.join( + experiment_dir, + workload, + trial, + MEASUREMENTS_FILENAME, + ) + try: + trial_df = pd.read_csv(eval_measurements_filepath) + except FileNotFoundError as e: + logging.info(f'Could not read {eval_measurements_filepath}') + continue + data['trial'] = (trial, experiment_dir) + for column in trial_df.columns: + values = trial_df[column].to_numpy() + data[column] = values + trial_df = pd.DataFrame([data]) + df = pd.concat([df, trial_df], ignore_index=True) return df From 4d851cea6fc2a480c313be96100702808ae2ab5f Mon Sep 17 00:00:00 2001 From: Chandramouli Shama Sastry Date: Wed, 11 Oct 2023 17:42:11 +0000 Subject: [PATCH 02/27] scoring tests --- .../mnist_jax/trial_1/eval_measurements.csv | 43 +++++++++++++++++++ scoring/test_scoring_utils.py | 14 ++++++ 2 files changed, 57 insertions(+) create mode 100644 scoring/test_data/experiment_dir-2023-10-11-16-58-40/mnist_jax/trial_1/eval_measurements.csv diff --git a/scoring/test_data/experiment_dir-2023-10-11-16-58-40/mnist_jax/trial_1/eval_measurements.csv b/scoring/test_data/experiment_dir-2023-10-11-16-58-40/mnist_jax/trial_1/eval_measurements.csv new file mode 100644 index 000000000..16b173d2d --- /dev/null +++ b/scoring/test_data/experiment_dir-2023-10-11-16-58-40/mnist_jax/trial_1/eval_measurements.csv @@ -0,0 +1,43 @@ +accumulated_eval_time,accumulated_logging_time,accumulated_submission_time,global_step,preemption_count,score,test/accuracy,test/loss,test/num_examples,total_duration,train/accuracy,train/loss,validation/accuracy,validation/loss,validation/num_examples +3.3074607849121094,0.0,10.884385585784912,1,0,10.884385585784912,0.1094000041484832,2.584994316101074,10000,14.191919088363647,0.1090000048279762,2.5650947093963623,0.1155000030994415,2.579528570175171,10000 +3.337209701538086,0.0272295475006103,20.800366163253784,2260,0,20.800366163253784,0.8156000375747681,0.8071792125701904,10000,24.222018003463745,0.8114000558853149,0.8132649064064026,0.8164000511169434,0.8204169273376465,10000 
+3.3682632446289062,0.0540821552276611,30.71659564971924,4595,0,30.71659564971924,0.8079000115394592,0.728191077709198,10000,34.255709409713745,0.8051000237464905,0.7325593829154968,0.8007000088691711,0.7405639290809631,10000 +3.392564058303833,0.079397439956665,40.63307213783264,6971,0,40.63307213783264,0.8212000131607056,0.6618639826774597,10000,44.28224039077759,0.8146000504493713,0.6732342839241028,0.8176000118255615,0.6771405339241028,10000 +3.4162850379943848,0.1048784255981445,50.54938673973084,9310,0,50.54938673973084,0.8224000334739685,0.6520460844039917,10000,54.30677127838135,0.8114000558853149,0.6680686473846436,0.8162000179290771,0.6716861724853516,10000 +3.44294548034668,0.1316289901733398,60.46591234207153,11640,0,60.46591234207153,0.8291000127792358,0.612684965133667,10000,64.33635830879211,0.8278000354766846,0.6238877773284912,0.829200029373169,0.6163886785507202,10000 +3.4705374240875244,0.1571955680847168,70.38250923156738,13962,0,70.38250923156738,0.8330000638961792,0.6149598956108093,10000,74.36598539352417,0.8288000226020813,0.6226670742034912,0.829800009727478,0.6122417449951172,10000 +3.496747732162476,0.1836600303649902,80.29951977729797,16249,0,80.29951977729797,0.8243000507354736,0.6327693462371826,10000,84.3950743675232,0.8252000212669373,0.644512414932251,0.8238000273704529,0.6446439623832703,10000 +3.522087574005127,0.20969820022583,90.21488404273988,18545,0,90.21488404273988,0.8294000625610352,0.6092277765274048,10000,94.4209051132202,0.8259000182151794,0.6247073411941528,0.8263000249862671,0.6222184896469116,10000 +3.549894332885742,0.2363710403442382,100.13060092926024,20791,0,100.13060092926024,0.8614000678062439,0.52787846326828,10000,104.44931364059448,0.8500000238418579,0.553955078125,0.858500063419342,0.5388756990432739,10000 
+3.5757312774658203,0.2636079788208008,110.0454170703888,23088,0,110.0454170703888,0.8564000129699707,0.5537337064743042,10000,114.4781494140625,0.8472000360488892,0.5771669745445251,0.8512000441551208,0.5673128366470337,10000 +3.600522756576538,0.2896809577941894,119.95860981941225,25491,0,119.95860981941225,0.8591000437736511,0.5227596163749695,10000,124.5031967163086,0.8520000576972961,0.5452638268470764,0.8614000678062439,0.5311681628227234,10000 +3.625296115875244,0.3164572715759277,129.8751039505005,27834,0,129.8751039505005,0.8648000359535217,0.5044941306114197,10000,134.5310184955597,0.8707000613212585,0.4973550140857696,0.872700035572052,0.5061097741127014,10000 +3.652939796447754,0.3452167510986328,139.7876615524292,30160,0,139.7876615524292,0.8766000270843506,0.4868762791156769,10000,144.55956268310547,0.8693000674247742,0.5040879845619202,0.876300036907196,0.4923528432846069,10000 +3.67879319190979,0.3744730949401855,149.7024416923523,32448,0,149.7024416923523,0.8764000535011292,0.4738911986351013,10000,154.5886163711548,0.8797000646591187,0.4765508472919464,0.8817000389099121,0.4693307876586914,10000 +3.70514726638794,0.401404857635498,159.61914157867432,34683,0,159.61914157867432,0.8802000284194946,0.4768852889537811,10000,164.61644506454468,0.8692000508308411,0.4997503161430359,0.8842000365257263,0.4773700535297394,10000 +3.73129153251648,0.4311785697937011,169.52961444854736,36914,0,169.52961444854736,0.8880000710487366,0.4485557973384857,10000,174.64349031448364,0.8759000301361084,0.4759842455387115,0.8856000304222107,0.4616841077804565,10000 +3.756492137908936,0.4605388641357422,179.44147443771362,39160,0,179.44147443771362,0.8827000260353088,0.4693212509155273,10000,184.6703307628632,0.8759000301361084,0.4877949357032776,0.8823000192642212,0.47818323969841,10000 
+3.782692193984986,0.4901380538940429,189.3587987422943,41392,0,189.3587987422943,0.8880000710487366,0.4533746242523193,10000,194.70058631896973,0.8836000561714172,0.4600184261798858,0.8873000144958496,0.4541208744049072,10000 +3.809891700744629,0.5197367668151855,199.274918794632,43666,0,199.274918794632,0.8891000151634216,0.4392279982566833,10000,204.731507062912,0.8875000476837158,0.4534733593463897,0.8879000544548035,0.4517350792884826,10000 +3.8350577354431152,0.5492231845855713,209.1874165534973,45945,0,209.1874165534973,0.8857000470161438,0.4512746930122375,10000,214.7579679489136,0.8837000131607056,0.4590539634227752,0.8818000555038452,0.4652692973613739,10000 +3.85998797416687,0.5787074565887451,219.0983612537384,48237,0,219.0983612537384,0.89000004529953,0.4437272548675537,10000,224.7834641933441,0.8831000328063965,0.4595882892608642,0.8875000476837158,0.4498450458049774,10000 +3.885103702545166,0.6087353229522705,229.0138251781464,50487,0,229.0138251781464,0.894800066947937,0.4131699800491333,10000,234.81255435943604,0.8968000411987305,0.4132199287414551,0.8938000202178955,0.4232735633850097,10000 +3.911269426345825,0.6387021541595459,238.9281618595124,52699,0,238.9281618595124,0.9029000401496888,0.4060218632221222,10000,244.8411045074463,0.8994000554084778,0.4132817685604095,0.8960000276565552,0.4201119840145111,10000 +3.936365365982056,0.668968677520752,248.83987426757807,54968,0,248.83987426757807,0.9014000296592712,0.3983155488967895,10000,254.8668367862701,0.8975000381469727,0.4028871953487396,0.9005000591278076,0.4035448133945465,10000 +3.963345289230346,0.6995627880096436,258.7516326904297,57238,0,258.7516326904297,0.9070000648498536,0.3919171690940857,10000,264.8952040672302,0.905500054359436,0.3880215883255005,0.9032000303268432,0.3942824900150299,10000 
+3.9895851612091056,0.7304470539093018,268.66518568992615,59479,0,268.66518568992615,0.910700023174286,0.3861576020717621,10000,274.9236674308777,0.903700053691864,0.4019041359424591,0.9068000316619872,0.3913732171058655,10000 +4.01618766784668,0.7612087726593018,278.5771634578705,61780,0,278.5771634578705,0.9115000367164612,0.3799691796302795,10000,284.9529445171356,0.9104000329971312,0.3839159905910492,0.9086000323295592,0.3893324434757232,10000 +4.04093861579895,0.7921364307403564,288.49143171310425,64050,0,288.49143171310425,0.9142000675201416,0.3721667230129242,10000,294.9814648628235,0.9113000631332396,0.374419093132019,0.9112000465393066,0.3832390904426574,10000 +4.066902160644531,0.8264949321746826,298.4053153991699,66301,0,298.4053153991699,0.9122000336647034,0.372380793094635,10000,305.0141706466675,0.9129000306129456,0.3773751556873321,0.9122000336647034,0.3790977895259857,10000 +4.0928122997283936,0.8593041896820068,308.3142282962799,68549,0,308.3142282962799,0.9160000681877136,0.3669502735137939,10000,315.04016852378845,0.9104000329971312,0.3748669624328613,0.9105000495910645,0.3784786462783813,10000 +4.1184492111206055,0.8911542892456055,318.22746777534485,70800,0,318.22746777534485,0.9153000712394714,0.3636593520641327,10000,325.0690577030182,0.9185000658035278,0.3687613904476166,0.910800039768219,0.3743433952331543,10000 +4.146052122116089,0.922600030899048,328.1395530700684,72999,0,328.1395530700684,0.9163000583648682,0.3645511567592621,10000,335.09958815574646,0.916100025177002,0.364233136177063,0.9119000434875488,0.3745094835758209,10000 +4.172115802764893,0.9543182849884032,338.0537300109863,75213,0,338.0537300109863,0.9157000184059144,0.3641790449619293,10000,345.1285364627838,0.9118000268936156,0.3679656088352203,0.9119000434875488,0.3735902607440948,10000 
+4.197761535644531,0.9881937503814696,347.9646620750427,77485,0,347.9646620750427,0.9155000448226928,0.3638691902160644,10000,355.1568307876587,0.9173000454902648,0.3572725653648376,0.9121000170707704,0.3734744191169739,10000 +4.22634482383728,1.0207302570343018,357.88095235824585,79736,0,357.88095235824585,0.915600061416626,0.3638694882392883,10000,365.1885812282562,0.9158000349998474,0.3646068274974823,0.9122000336647034,0.3734698891639709,10000 +4.25168514251709,1.0548341274261477,367.7923603057861,81999,0,367.7923603057861,0.915600061416626,0.3638694882392883,10000,375.2162811756134,0.9165000319480896,0.3642504215240478,0.9122000336647034,0.3734698891639709,10000 +4.2778027057647705,1.0879690647125244,377.7043735980988,84267,0,377.7043735980988,0.915600061416626,0.3638694882392883,10000,385.2464287281037,0.914500057697296,0.3697480857372284,0.9122000336647034,0.3734698891639709,10000 +4.303594589233398,1.1212027072906494,387.61431884765625,86553,0,387.61431884765625,0.915600061416626,0.3638694882392883,10000,395.2737421989441,0.917400062084198,0.3631039559841156,0.9122000336647034,0.3734698891639709,10000 +4.330204248428345,1.1557085514068604,397.5266599655152,88769,0,397.5266599655152,0.915600061416626,0.3638694882392883,10000,405.3015666007996,0.907300055027008,0.3805611133575439,0.9122000336647034,0.3734698891639709,10000 +4.3575522899627686,1.1906015872955322,407.4391739368439,90941,0,407.4391739368439,0.915600061416626,0.3638694882392883,10000,415.3328382968903,0.9160000681877136,0.3628231287002563,0.9122000336647034,0.3734698891639709,10000 +4.3869407176971436,1.2255029678344727,417.34814286231995,93156,0,417.34814286231995,0.915600061416626,0.36386948823928833,10000,425.36263489723206,0.9160000681877136,0.3684694468975067,0.9122000336647034,0.37346988916397095,10000 diff --git a/scoring/test_scoring_utils.py b/scoring/test_scoring_utils.py index b766a04d7..67bfc44f2 100644 --- a/scoring/test_scoring_utils.py +++ b/scoring/test_scoring_utils.py @@ -1,5 
+1,6 @@ from absl.testing import absltest import scoring_utils +import scoring TEST_LOGFILE = 'test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log' TEST_DIR = 'test_data/experiment_dir' @@ -25,6 +26,19 @@ def test_get_trials_df(self): def test_get_experiment_df(self): df = scoring_utils.get_experiment_df(TEST_DIR) + assert len(df)==2 + + def test_scores(self): + df = scoring_utils.get_experiment_df(TEST_DIR) + performance_profile_df = scoring.compute_performance_profiles( + {'my.submission':df}, + time_col='score', + min_tau=1.0, + max_tau=None, + reference_submission_tag=None, + num_points=100, + scale='linear', + verbosity=0) if __name__ == '__main__': From 1733c0048b7edca06e6f9421471745d4856bef65 Mon Sep 17 00:00:00 2001 From: Chandramouli Shama Sastry Date: Wed, 11 Oct 2023 18:01:58 +0000 Subject: [PATCH 03/27] style fixes --- scoring/test_scoring_utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scoring/test_scoring_utils.py b/scoring/test_scoring_utils.py index 67bfc44f2..7a9821bf4 100644 --- a/scoring/test_scoring_utils.py +++ b/scoring/test_scoring_utils.py @@ -1,6 +1,7 @@ from absl.testing import absltest import scoring_utils -import scoring + +import scoring TEST_LOGFILE = 'test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log' TEST_DIR = 'test_data/experiment_dir' @@ -26,19 +27,19 @@ def test_get_trials_df(self): def test_get_experiment_df(self): df = scoring_utils.get_experiment_df(TEST_DIR) - assert len(df)==2 + assert len(df) == 2 def test_scores(self): df = scoring_utils.get_experiment_df(TEST_DIR) performance_profile_df = scoring.compute_performance_profiles( - {'my.submission':df}, - time_col='score', - min_tau=1.0, - max_tau=None, - reference_submission_tag=None, - num_points=100, - scale='linear', - verbosity=0) + {'my.submission': df}, + time_col='score', + min_tau=1.0, + max_tau=None, + reference_submission_tag=None, + num_points=100, + scale='linear', + verbosity=0) if __name__ == '__main__': From 
8c7fed682fbf781bf79f5ad384dd2f96fcb9802f Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Thu, 9 Nov 2023 19:15:04 -0800 Subject: [PATCH 04/27] Update scoring_utils.py --- scoring/scoring_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 8252c75a9..3768d0fbb 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -9,6 +9,7 @@ import algorithmic_efficiency.workloads.workloads as workloads_registry + TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' TRIAL_DIR_REGEX = 'trial_(\d+)' From f012936a0f7effa033c3fdb17ae6448923b820b3 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 14 Nov 2023 12:08:06 +0100 Subject: [PATCH 05/27] Rules as provided by the lawyers --- RULES.md | 514 +++++++++---------------------------------------------- 1 file changed, 78 insertions(+), 436 deletions(-) diff --git a/RULES.md b/RULES.md index d74525244..0126c7ead 100644 --- a/RULES.md +++ b/RULES.md @@ -1,491 +1,133 @@ -# MLCommons™ AlgoPerf: Benchmark Rules - -**Version:** 0.0.18 *(Last updated 03 Oktober 2023)* - -> **TL;DR** New training algorithms and models can make neural net training faster. -> We need a rigorous training time benchmark that measures time to result given a fixed hardware configuration and stimulates algorithmic progress. We propose a [Training Algorithm Track](#training-algorithm-track) and a [Model Track](#model-track) in order to help disentangle optimizer improvements and model architecture improvements. This two-track structure lets us enforce a requirement that new optimizers work well on multiple models and that new models aren't highly specific to particular training hacks. 
+# MLCommons™ AlgoPerf: Competition Rules ## Table of Contents -- [Introduction](#introduction) -- [Training Algorithm Track](#training-algorithm-track) - - [Submissions](#submissions) - - [Specification](#specification) - - [Evaluation during training](#evaluation-during-training) - - [Valid submissions](#valid-submissions) - - [Tuning](#tuning) - - [External tuning ruleset](#external-tuning-ruleset) - - [Self-tuning ruleset](#self-tuning-ruleset) - - [Workloads](#workloads) - - [Fixed workloads](#fixed-workloads) - - [Randomized workloads](#randomized-workloads) - - [Qualification set](#qualification-set) - - [Scoring](#scoring) - - [Benchmarking hardware](#benchmarking-hardware) - - [Defining target performance](#defining-target-performance) - - [Benchmark score using performance profiles](#benchmark-score-using-performance-profiles) - - [Benchmark Procedure](#benchmark-procedure) -- [Model Track](#model-track) - -## Introduction - -We need a more scientifically sound methodology for evaluating training speedups due to new algorithms, including both new optimizers and new model architectures. Cutting edge machine learning (ML) models are exceeding the compute budgets of many researchers, and ML compute is becoming a larger and larger cost in industry. To reduce the compute and potentially environmental cost of ML research and practice, we need rigorous benchmarking of efficiency. Such benchmarks will guide us in selecting the best directions to evolve existing techniques and ultimately enable progress toward models that produce not only better results, but better results **at lower cost**. - -MLCommons' mission is to build fair and useful benchmarks for measuring training and inference performance of ML hardware, software, and services. Improvements in training speed can come from better hardware, better software stacks, and better algorithms. 
-To date, the Closed Division of the MLPerf™ Training benchmark has been extremely successful in driving systems innovation by requiring mathematical equivalence to a reference implementation, while still allowing submissions on different hardware. Although the Open Division allows new models and training algorithms, it has several issues that make it inappropriate as a benchmark for progress in training algorithms. By allowing arbitrary hardware, it is impossible to isolate improvements due to algorithms or due to extra computation. Unrestricted hardware makes the benchmark only accessible to the most well-funded organizations, even if many academic labs and others have interesting algorithms to measure. Finally, even if we could isolate improvements due to particular algorithmic changes and make the benchmark more broadly accessible, there is still no incentive to avoid hyper-specific changes that only help the particular benchmark workload. - -In order to drive innovation in machine learning algorithms that reduce the time needed to create useful models, we propose a new set of benchmarks called **AlgoPerf** to evaluate the training time for different algorithms (models, optimizers, preprocessing, etc.) on a **fixed hardware configuration** (future iterations can adopt new hardware configurations as needed). Our proposal includes two tracks: (1) the [Training Algorithm Track](#training-algorithm-track) and (2) the [Model Track](#model-track). The goal of the Training Algorithm Track is to find training algorithms (optimizers, etc.) that train benchmark models to reach the goal out-of-sample error rate as fast as possible. However, to incentivize practically useful algorithms, in the Training Algorithm Track we require that a single training algorithm simultaneously performs well across all benchmark models and datasets. 
Similarly, the goal of the Model Track is to find models that can be trained to achieve the target solution quality (out-of-sample error) in the least amount of time on each benchmark dataset. Although submissions in the Model Track will be inherently dataset-specific, we sharply constrain what parts of the training program can be modified in the Model Track and require submitted models to be easily trainable using standard optimizers. Thus the two-track structure discourages overly specific solutions that aren't generally useful to practitioners and will hopefully produce evidence on the relative returns of speeding up training by finding new models or by developing new training algorithms. - -## Training Algorithm Track - -The goal of the **AlgoPerf: Training Algorithm Track** is to reach the same results faster ("time to result") by using better optimizers, data ordering/weighting schemes, and weight update strategies while producing techniques that work well on a wide variety of models and datasets. We hope to encourage generally useful training algorithms that are not specific to only a small number of particular workloads. - -In general, submissions to the Training Algorithm Track will replace specific pieces of a reference implementation in order to produce a training program that reaches the same results faster on as many workloads as possible. The training program has a fixed, high-level structure and competitors are allowed to replace a particular set of functions in the program (the [**submission functions**](#submission-functions)), but must leave all other pieces ([**fixed functions**](#fixed-functions) and high-level structure) of the reference implementation unchanged. The submitted code must perform well on multiple datasets and models simultaneously (a model and dataset pair constitute a [workload](#workloads) for the purposes of this track). 
- -Submissions to the Training Algorithm Track can be entered under two separate rulesets, named [external tuning ruleset](#external-tuning-ruleset) and [self-tuning ruleset](#self-tuning-ruleset), with it being possible to submit to both rulesets. The main difference is that the external tuning ruleset allows moderate, automatic, parallel tuning of the optimizer's hyperparameters on each workload, using the submitted workload-agnostic search space. This allows the training algorithm to adapt to a particular task while ensuring that it is not too difficult to tune automatically. Under the self-tuning ruleset, there is no external tuning and submissions need to adapt to a particular task autonomously within a single optimization run. Unless otherwise specified, the rules in this section apply to both rulesets (see, for example, the [Tuning](#tuning) section for the most substantial difference between the rulesets). - -The intention is that a training algorithm submission will be broadly applicable and useful without customization to the specific [workload](#workloads) (model, dataset, loss function). We want to discourage detecting the particular workload and doing something highly specific that isn't generally useful. In order to further discourage submissions that overfit to the particular [fixed benchmark workloads](#fixed-workloads), submissions will also be evaluated on [held-out workloads](#randomized-workloads) specified after the submission deadline. - -For a description of how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Call for submissions](CALL_FOR_SUBMISSIONS.md), which details the entire competition process. - -### Submissions - -A valid submission is a piece of code that defines all of the submission functions and is able to train all benchmark workloads on the [benchmarking hardware](#benchmarking-hardware) (defined in the [Scoring](#scoring) section). 
Both the validation set and the test set performance will be checked regularly during training (see the [Evaluation during training](#evaluation-during-training) section), however, only the validation performance is relevant for scoring. Training halts when the workload-specific [target errors](#defining-target-performance) for the validation and test sets have been reached. For each workload, only the training time to reach the *validation* set target error is used as input to the [scoring process](#scoring) for the submission. Submissions using [external tuning](#external-tuning-ruleset) will be tuned independently for each workload using a single workload-agnostic search space for their specified hyperparameters. The tuning trials are selected based on the time to reach the *validation* target. Submissions under either tuning ruleset may always self-tune while on the clock. - -#### Specification - -Any function defined in the reference implementations that isn't a [submission function](#submission-functions) is a [fixed function](#fixed-functions) for the Training Algorithm Track. No submitted code is run to compute the evaluation metrics in the Training Algorithm Track. We just use the final model parameters and the fixed functions from this track at test time. - -In principle, submissions are allowed to use the available hardware systems in any data- or model-parallel manner they desire, within the constraints of the submission function APIs. However, in practice, model-parallelism may not be possible with the API. They are allowed to access any framework-specific device information necessary to exploit the hardware. - -Submissions provide a [per-workload batch size](#batch-size-getter) to use. Specification of the batch size for each workload is necessary to avoid running out of memory for different workloads. Therefore, submitters can determine this batch size in advance and specify it as part of the submission. 
Submitters may also provide per-workload batch sizes for all [randomized workloads](#randomized-workloads). If no such batch size is provided for a randomized workload, by default, submissions will then use the batch size of the most similar [fixed workload](#fixed-workloads) (for example, if there is an ImageNet fixed workload and also a randomized workload with a similarly sized model on similarly sized images, the ImageNet batch size will be used for held-out workloads generated from this randomized workload). - -The **submission functions** are the *batch size getter*, *optimizer state initializer*, *variable update*, and *data selection functions*. The *fixed functions* are the *data augmentation/preprocessing*, *model initialization*, *forward pass*, and *loss function*. The trained model will be evaluated in a separate step that does not call any of the submitted code. - -##### Fixed functions - -With the exception of `_build_input_queue`, submitters can call any of these functions (along with any public function in the provided `Workload` instance) at any time in their submitted functions. - -```python -@property -def step_hint(self): -> int -``` - -- The `step_hint` function gives the number of global steps the baseline algorithm was allowed to use to reach the targets for a workload. Note that the baseline algorithms may have reached the target in fewer steps than this, but these were the max number of steps the baseline algorithms used for their learning rate schedules. Submitters can use this to help specify learning rate (or other) schedules. - -###### Data augmentation and preprocessing - -```python -def _build_input_queue( - self, - data_rng: RandomState, - split: str, - data_dir: str, - global_batch_size: int) -> Iterator[Dict[str, Tensor]]: -``` - -- The `_build_input_queue` function will be called to produce the iterator over batches that the submitted data selection function consumes. 
It is responsible for all data reading, shuffling, repeating, preprocessing, and batching. Note that for Jax this should return an iterator over tensors of shape `(num_devices, per_device_batch_size, ...)`, and for PyTorch this should return tensors of shape `(per_device_batch_size, ...)` (assuming PyTorch's [DDP](https://pytorch.org/docs/stable/notes/ddp.html) is used). - -###### Model initialization - -```python -def init_model_fn( - self, - rng: RandomState, - dropout_rate: Optional[float] = None, - aux_dropout_rate: Optional[float] = None -) -> initial model parameters -``` - -- Unlike in the [Model Track](#model-track), this function that initializes the parameters of the model, is fixed. While it can be called by the submission (e.g. to restart the model after a failed training effort) it cannot be changed. - -###### Forward pass - -```python -def model_fn( - self, - params: ParameterContainer, - augmented_and_preprocessed_input_batch: Tensor, - model_state: ModelAuxiliaryState, - mode: ForwardPassMode, # mode \in {train, eval} - rng: RandomState, - hyperparameters: Hyperparameters, - update_batch_norm: bool -) -> (logits_output_batch, new_model_state): Tuple[Tensor, ModelAuxiliaryState] -``` - -- `params` is whatever the structure is that contains the (`float32`) model parameters. The naming is overloaded due to having to handle the more object-oriented `PyTorch` style and the functional `JAX` style of development. In the `Flax` library (written in `JAX`), this is typically a nested dictionary of `JAX`/`numpy` arrays, but in `PyTorch` this is the `torch.nn.Model`. -- It is possible that `model_parameters` will be endowed with additional information about the kind of each parameter, e.g. 
"weights" or "bias" or "batch norm", although `model_fn` does not really need that information we might use the same nested structure elsewhere -- `logits_output_batch` is before the output activation -- `new_model_state` is for batch norm or similar side effects and will only be updated if `update_batch_norm` is set -- `hyperparameters` will contain only dropout rates, which will be used in the models that support it. These can be tuned or will default to documented model-specific values. Note that adding additional dropout would be considered changing the model, which is not allowed, but the tuning of dropout in existing dropout layers can be considered a regularizer, so we allow it. There should be at most two dropout rates in a model (if there are more than two we will reuse the same values). - -###### Loss function - -```python -def loss_fn( - self, - # Dense or one-hot labels, or a tuple of (tensor, padding) for speech. - label_batch: Union[Tuple[Tensor, Tensor], Tensor], - logits_batch: Union[Tuple[Tensor, Tensor], Tensor], - mask_batch: Optional[Tensor] = None, - label_smoothing: float = 0.0) -> Dict[str, Tensor] # differentiable -``` - -- Unlike in the [Model Track](#model-track), we will specify the loss function name in order to let training algorithms depend on the loss function. It will be one of {**mean squared error**, **cross-entropy**, **CTC**, or **L1 reconstruction error**}. - - The optimizer must work with all values of the enum, which will be provided via a property on the workload object that is provided to all submissions functions. -- The loss function does **not** include regularization. Instead, regularization can be added by the submissions in the `update_params` function. -- The loss function returns a dict {'summed': scalar summed loss, 'n_valid_examples': scalar number of valid examples in batch, 'per_example': 1-d array of per-example losses}. 
- Note that the returned quantities are not synced across devices; this can be done by the user in the `update_params` function. - -##### Submission functions - -###### Batch size getter - -```python -def get_batch_size(workload_name: str) -> int -``` - -- Submitters define a specific batch size for each [workload](#workloads). -- For example, in advance, they can determine the largest batch size without running out of memory for each workload. -- For the [held-out workloads](#randomized-workloads), submitters may provide a batch size once the submission code is frozen and the held-out workloads are sampled from the randomized workloads. By default, this function will use the `workload_name` of the fixed workload it is based on. - -###### Optimizer state initializer - -```python -def init_optimizer_state( - workload: Workload, - model_params: ParameterContainer, - model_state: ModelAuxiliaryState, - hyperparameters: Hyperparameters, - rng: RandomState -) -> initial_optimizer_state -``` - -- Allowed to create state for the optimizer -- Does not involve the initialization for the model parameters, which in the Training Algorithm Track, is considered a fixed function, see [Model initialization](#model-initialization). -- The optimizer state is a dictionary (`Dict[str, Any]`). For a PyTorch submission, any value in this dictionary which is a class instance with internal state has to have a `state_dict()` method implemented to be stored correctly at the training checkpoints. 
- -###### Variable update function - -```python -def update_params( - workload: Workload, - current_param_container: ParameterContainer, - current_params_types: ParameterTypeTree, - model_state: ModelAuxiliaryState, - hyperparameters: Hyperparameters, - batch: Dict[str, Tensor], - loss_type: LossType, - optimizer_state: OptimizerState, - eval_results: List[Tuple[int, float]], - global_step: int, - rng: RandomState -) -> (updated_optimizer_state, updated_variables, updated_model_state) -``` - -- `current_param_container` is the same kind of nested structure as used by `model_fn` which constitutes a nested collection of `float32` arrays, each endowed with information about what kind of parameter that array represents stored in a parallel structure of `current_params_types`. - - Parameter kind is one of {"weights", "biases", "embeddings", "conv", "batch norm"}. -- `model_state` holds auxiliary state necessary for some models, such as the current batch norm statistics. -- The loss function will be one of a small set of known possibilities and the update function is allowed to branch on the `loss_type` enum/name. -- The `loss_fn` produces a loss per example and a summed loss (both only for one device), which both can be used. -- Allowed to update state for the optimizer. -- Uses the `model_fn` of the `workload` in order to decouple the loss from the model so that model outputs (forward passes) can be reused (by storing them in the optimizer state). -- The submission can access the target evaluation metric via the `workload` variable. -- **A call to this function will be considered a step** - - The time between a call to this function and the next call to this function will be considered the per-step time. -- Cannot modify the given hyperparameters in a workload-conditional way (please see the [Valid submission](#valid-submissions) section). 
This rule is intended to prohibit circumventing the tuning rules by looking up a pre-tuned optimal set of hyperparameters for each workload. It is not intended to prohibit line searches and other similar techniques. - - This will be checked by the spirit jury. -- The fixed `init_model_fn` can optionally be called during training, for example, to reinitialize the model after a failed training effort. -- Cannot replace the model parameters with pre-trained ones. - - This will be checked by the spirit jury. -- This API supports Polyak averaging and similar methods that implement moving averages of model parameters. -- Batch norm should work here because the `model_fn` will return updated batch norm moving averages when it is told to with `update_batch_norm`. - -###### Data selection - -```python -def data_selection( - workload: Workload, - input_queue: Iterator[Tuple[Tensor, Tensor]], - optimizer_state: OptimizerState, - current_param_container: ParameterContainer, - hyperparameters: Hyperparameters, - global_step: int, - rng: RandomState -) -> Dict[str, Tensor] -``` - -- `input_queue` can yield up to the number of elements in the training dataset -- Want to allow for submitters to construct their own data batches from the dataset -- Submissions are allowed to arbitrarily modify the input examples, as long as the modifications are sufficiently generic to be applicable to any workload -- This is only called on the training inputs. 
**No submitted code will be called at eval in the training track.** -- This allows for any of the following methods: - - Data echoing - - Curriculum learning - - Bootstrapping - - Biased sampling (based on loss values, so need to store the forward pass in the `optimizer_state`, potentially forward pass of a cheaper proxy model) - - Submissions need batching control - -#### Evaluation during training - -In general, with noisy, non-deterministic training, evaluation frequency can affect training time measurements as more "bites of the apple" potentially allows the training code to exploit instability. We also want to discourage submissions from complicated and unrealistic logic that attempts to guess when training is close to complete and increases the evaluation rate, while not producing a well-sampled training curve at the start of training. Simply allowing submissions complete freedom over evaluation frequency encourages competitors to work to minimize the number of evaluations, which distracts from the primary goal of finding better training algorithms. - -Submissions are eligible for an untimed eval every `eval_period` seconds, run as soon as the current call of `update_params` completes. Any additional evaluations performed by the submission code count against the runtime for scoring. The harness that runs the submission code will attempt to eval every `eval_period` seconds by checking between each submission step (call of `update_params`) whether it has been at least `eval_period` seconds since that last eval and, if so, pausing the clock and running an eval. This means that if calls to `update_params` typically take a lot more than `eval_period` seconds, such submissions will not receive as many untimed evals as a submission that had an `update_params` function that took less time. However, for appropriate settings of `eval_period`, we expect this to be quite rare. 
Submissions are always free to restructure their `update_params` code to split work into two subsequent steps to regain the potential benefits of these untimed model evaluations. For each workload, the `eval_period` will be set such that the total evaluation time is roughly between 10% and 20% of the total training time for the target-setting runs. - -#### Valid submissions - -The intention of this benchmark is to identify training algorithm submissions that will be broadly applicable and effective in practical scenarios without customization to the specific [workload](#workloads) (model, dataset, and loss function). Generally useful training algorithms can train models faster and thus require less compute resources, decreasing the cost of machine learning. We want to discourage all submissions that sidestep the purpose of this benchmark. - -We reserve the right to disqualify submissions if they clearly violate this spirit of the benchmark, even if those submissions perform well in our benchmark. Unfortunately, we can't easily write rules that make it completely clear if a submission is circumventing the spirit of the benchmark in a way that would encompass all possible cases. Instead, we will have to prohibit these activities in the abstract and defer rulings about specific submissions to a **"spirit [of the rules] jury"** that can hear the justifications of the submitters, inspect the code, and ultimately decide if the spirit of the rules has been violated. The jury might also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. - -We want to state clearly that we welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions, however, in some cases, routines that would be allowed in principle might not be practically feasible in the provided framework. 
The spirit jury, however, will only be invoked for submissions that aim to bypass the core premise of this benchmark since submissions like this would also be irrelevant in practice. - -In order to help clarify which submissions are [allowed](#allowed-submissions) and [disallowed](#disallowed-submissions), we describe a few examples below. Two essential questions can help provide a general guideline for whether a submission is allowed or not: - -1. What **information** is being used by the submission? -2. What **action** is the submission code taking based on this information? - -In general, both parts are needed to decide if a particular piece of code is within the spirit of the rules. For example, it is fine to use the shape information of the model parameters to switch between a low-memory and a high-memory approximation, but it isn't allowed to use this shape as a "fingerprint" to uniquely identify a workload and then use pre-computed hyperparameters for this specific workload. As a rule of thumb, submissions are allowed if it is reasonable to assume that the method will work comparably well on unseen workloads automatically without requiring human engineering labor. - -##### Allowed submissions - -Submissions are allowed to use the provided model parameter information, e.g. the shapes and types of the layers, if the resulting action works on generic workloads. - -
-Examples: - -- Using shape information of the parameters to switch between low-memory and high-memory routines is allowed. -- Using shape information of the parameters to conditionally construct variables to avoid running out of memory, e.g. by approximating larger matrices, is allowed. -- Using the ordering of the parameters to train deeper layers differently, e.g. training them sequentially, is allowed. -- Submissions are allowed to use the layer type to change the update rules, e.g. use a different update rule for all batch normalization layers, or use different sub-routines for each layer type, e.g. compute variances for convolutional layers but not for batch normalization layers. - -
-
- -Automatic methods for determining or dynamically setting hyperparameters are allowed if they function on generic workloads. - -
-Examples: - -- Submissions are allowed to use automatic procedures for setting hyperparameters, e.g. automated learning rate range tests. -- Inner-loop tuning methods for setting hyperparameters, e.g. line searches, are allowed. -- Changing the batch size dynamically during training. - -
-
- -Submissions can also be based on learned training algorithms. - -
-Examples: - -- Submissions are allowed to learn the update rule of the training method. -- In the [self-tuning ruleset](#self-tuning-ruleset), submissions could try out a learned list of hyperparameters. - -
-
- -Submissions can use additional software dependencies provided they have the intention of supporting new algorithmic and mathematical ideas. The procedure for adding dependencies is described in more detail in the [Software dependencies](#software-dependencies) section. - -
-Examples: - -- [`BackPACK`](https://docs.backpack.pt/en/master/index.html) is a `pip` package that hooks into `PyTorch` to extract additional information from the backward pass. An allowed use of `BackPACK` would be to compute batch statistics (e.g. within-batch gradient variances, etc.) to calibrate or auto-tune training algorithms. - -
- -##### Disallowed submissions - -Submissions are not allowed to circumvent the tuning rules by looking up the result of an offline computation that was performed ahead of time. - -
-Examples: - -- Submissions are not allowed to look up (pre-trained) model parameters. -- Computing the optimal hyperparameters for every fixed workload offline and having the submission look up those pre-computed values (and finding the closest fixed workload for a held-out workload) is not allowed. In contrast, finding and hard-coding a single good setting of the hyperparameters that works well across all the workloads simultaneously would be allowed. -- Submissions are not allowed to adjust the hyperparameter search spaces for the external tuning ruleset, such that it differs between the workloads. - -
-
- -Submissions are not allowed to detect the particular workload (irrespective of which information they use to this end) in order to use settings that are specified for individual workloads. This would result in highly specific behavior that isn't generally useful. This also extends to learned approaches that ultimately detect specific workloads. In general, all else being equal, if some submission was written that was extremely effective on a small set of the workloads (and far worse on the rest) and another submission with the opposite performance pattern, we would prefer both submissions to be submitted and tested on **all** workloads. - -
-Examples: - -- A hard-coded switching of the update rule based on the workload is not allowed, e.g. using Adam for RNNs and SGD with momentum on CNNs. Although submissions can specialize for certain layer types in generic ways, they should not uniquely identify a model or dataset. In other words, if there are two workloads A and B that both have convolutional layers and fully connected layers the submission shouldn't detect whether it is dealing with A or B specifically and choose Adam for one and SGD with momentum for the other. However, if the updates for all parameters of convolutional layers always used SGD with momentum and the updates for all other layers always used Adam and a workload with both types of layers had mixed updates, that would be fine. -It is also allowed to make the update rule part of the (external) hyperparameter tuning or determine the optimal update rule during the run, i.e. while "on-the-clock". -- Submissions are not allowed to look up learning rate schedules that are only utilized for specific subsets of the workloads. It is allowed to use one general learning rate schedule or dynamically adapt the learning rate based on general information such as curvature. - -
-
- -It is not allowed to compute any kind of pairwise metrics between the fixed workloads and the held-out workloads. - -
-Examples: - -- On a held-out workload, submissions are not allowed to find the nearest neighbor among the fixed workloads to set any hyperparameter. - -
-
- -Valid submissions must rely on new algorithmic or mathematical ideas and should not use software engineering approaches to speed up primitive operations in `PyTorch`, `JAX`, their dependencies, the operating system, or the hardware. We recognize that the way a method is implemented will impact its performance in the benchmark. It is generally acceptable to make clever, judicious, and efficient use of public APIs in `JAX` and/or `PyTorch` from within the submission function APIs. It is not acceptable to use these APIs to optimize the internals of primitive operations and standard dependencies in ways that could generally benefit any submission. - -
-Examples: - -- Submissions are allowed to use `CUDA` streams to schedule operations, e.g., transferring data between CPU and GPU, or among GPUs, while performing other computations. -- Submissions are not allowed to use `CUDA` streams or asynchronous operations (e.g., spawning additional threads) to perform additional computations that run during the [untimed evaluations](#evaluation-during-training). -- Submissions are not allowed to use faster GPU kernels than other submitters by writing their own, using `TVM`, or using a different version of `cuDNN`/`cuBLAS`. -- Submissions are not allowed to skip or reduce system or framework overhead, such as modifying `JAX` to skip internal steps like pytree flattening/unflattening. -- Submissions are not allowed to introduce new compiler optimizations, such as modifying `XLA` to perform more or less kernel fusion. - -
- -##### Software dependencies - -We require submissions to use specific versions of `PyTorch`/`JAX` as well as additional dependencies in order to facilitate fair comparisons. Submitters must build on top of these provided software packages, which might be provided as a `Docker` container. Additional dependencies can be added as long as they include a comment describing what was added and why. Submitters are free to add dependencies that support new algorithmic and mathematical ideas but they should not circumvent the intention of the benchmark to measure training speedups due to new training methods. For example, software engineering techniques that lead to faster implementations of existing software, e.g. using newer versions of `PyTorch` or `JAX`, are not allowed and these are described in more detail in the [Disallowed submissions](#disallowed-submissions) section. In case of doubts, these additional dependencies will be judged by the spirit jury. - -### Tuning - -Tuning will be substantially different for the [external](#external-tuning-ruleset) and the [self-tuning ruleset](#self-tuning-ruleset) and the individual specifications for each will be described in the following. - -#### External tuning ruleset - -For each workload, the hyperparameters are tuned using $O=20$ tuning **trials**. To estimate the variance of the results, this tuning will be repeated for $S=5$ **studies**, for a total of $S\cdot O = 100$ different hyperparameter settings. The submitters will provide a workload-agnostic search space and the working group will then return $100$ hyperparameters settings obtained using [(quasi)random search](https://arxiv.org/abs/1706.03200). The working group will also randomly partition these $100$ trials into $5$ studies of $20$ trials each. In lieu of independent samples from a search space, submissions can instead supply a fixed list of $20$ hyper-parameter points that will be sampled without replacement. 
- -In each study, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=20$ hyperparameter settings. For scoring, we use this required training time to reach the *validation targets* of those $5$ selected runs. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. - -#### Self-tuning ruleset - -Submissions to this ruleset are not allowed to have user-defined hyperparameters. This ruleset allows both submissions that use the same hyperparameters for all workloads, including the randomized ones (e.g. Adam with default parameters), as well as submissions that perform inner-loop tuning during their training run (e.g. SGD with line searches). - -Submissions will run on one instance of the [benchmarking hardware](#benchmarking-hardware). As always, submissions are allowed to perform inner-loop tuning (e.g. for their learning rate) but the tuning efforts will be part of their score. A submission will run *S=5* times and its score will be the median time to reach the target evaluation metric value on the validation set. To account for the lack of external tuning, submissions have a longer time budget to reach the target performance. Compared to the [external tuning ruleset](#external-tuning-ruleset), the `max_runtime` is tripled. Runs that do not reach the target performance of the evaluation metric within this allotted time budget have an infinite time. - -### Workloads - -For the purposes of the Training Algorithm Track, we consider a workload the combination of a `dataset`, `model`, `loss_fn`, along with a target that is defined over some evaluation metric.
E.g., ResNet50 on ImageNet using the cross-entropy loss until a target error of 22.6% on the validation set has been reached, would constitute a workload. The evaluation metric, in this example the misclassification error rate, is directly implied by the dataset/task. - -Submissions will be scored based on their performance on the [fixed workload](#fixed-workloads). However, submissions must additionally perform reasonably well on a set of [held-out workloads](#randomized-workloads) in order for their score on the fixed workload to count (for full details see the [Scoring](#scoring) section). These held-out workloads will be generated after the submission deadline, but their randomized generating process is publicly available with the call for submissions (see "[Randomized workloads](#randomized-workloads)" section). - -Furthermore, a less computationally expensive subset of the fixed workloads is collected with the [qualification set](#qualification-set). Submitters without enough compute resources to self-report on the full set of fixed and held-out workloads can instead self-report on this smaller qualification set. Well-performing submissions can thereby qualify for computational resources provided by sponsors of the benchmark to be scored on the full benchmark set.
+- [Goal](#goal) +- [Sponsor](#sponsor) +- [Eligibility](#eligibility) +- [Competition Period](#competition-period) +- [Agreement to Official Rules](#agreement-to-official-rules) +- [How to Enter](#how-to-enter) +- [Submission Conditions](#submission-conditions) +- [Software Dependencies](#software-dependencies) +- [Scoring](#scoring) +- [Submissions](#submissions) +- [Optional](#optional) +- [Physical Review](#physical-review) +- [Notification](#notification) +- [Prizes](#prizes) +- [Prize Conditions](#prize-conditions) +- [Jurisdiction](#jurisdiction) +- [Cancellation and Modification](#cancellation-and-modification) +- [Publicity](#publicity) +- [Privacy](#privacy) +- [Official Rules and Winners List](#official-rules-and-winners-list) -#### Fixed workloads +## Goal -The fixed workloads are fully specified with the call for submissions. They contain a diverse set of tasks such as image classification, machine translation, speech recognition, or other typical machine learning tasks. For a single task there might be multiple models and therefore multiple fixed workloads. The entire set of fixed workloads should have a combined runtime of roughly 100 hours on the [benchmarking hardware](#benchmarking-hardware). +To discover new training algorithms that can train general (not customized) neural networks faster. Sponsor will use an objective measuring program to allocate a score to each entry (“Submission”) and determine [xxx] winners, each of which will be eligible to win a prize. -The currently eight fixed workloads are: +## Sponsor -| | **Task** | **Dataset** | **Model** | **Loss** | **Metric** | Validation
**Target** | Test
**Target** | Maximum
**Runtime**
(in secs) | -|------------|-------------------------------|-------------|-------------------------|----------|------------|--------------------------|----------------------|------------------------| -| **1** | Clickthrough rate prediction | Criteo 1TB | DLRMsmall | CE | CE | 0.123649 | 0.126060 | 21,600 | -| **2** | MRI reconstruction | fastMRI | U-Net | L1 | SSIM | 0.7344 | 0.741652 | 10,800 | -| **3
4** | Image classification | ImageNet | ResNet-50
ViT | CE | ER | 0.22569
0.22691 | 0.3440
0.3481 | 111,600
111,600 | -| **5
6** | Speech recognition | LibriSpeech | Conformer
DeepSpeech | CTC | WER | 0.078477
0.1162 | 0.046973
0.068093 |
72,000 | -| **7** | Molecular property prediction | OGBG | GNN | CE | mAP | 0.28098 | 0.268729 | 12,000 | -| **8** | Translation | WMT | Transformer | CE | BLEU | 30.8491 | 30.7219 | 80,000 | +This Competition (“Competition”) is sponsored by [MLCommons and ___________], [public facing address]. -#### Randomized workloads +## Eligibility -In addition to the [fixed and known workloads](#fixed-workloads), there will also be randomized workloads in our benchmark. These randomized workloads will introduce minor modifications to a fixed workload (e.g. small model changes). The exact instances of these randomized workloads will only be created after the submission deadline and are thus unknown to both the submitters as well as the benchmark organizers. The instructions for creating them, i.e. providing a set or distribution of workloads to sample from, will be defined by this working group and made public with the call for submissions, to allow the members of this working group to submit as well as ensure that they do not possess any additional information compared to other submitters. We will refer to the unspecific workloads as *randomized workloads*, e.g. the set or distribution. The specific instance of such a randomized workload we call a *held-out workload*. That is, a held-out workload is a specific sample of a randomized workload that is used for one iteration of the benchmark. While we may reuse randomized workloads between iterations of the benchmark, new held-out workloads will be sampled for each new benchmark iteration. +The Competition is open to English-speaking individuals and teams (made of individuals), who are the age of majority as of the Competition start date, have internet access, a GitHub account in good standing, and can legally participate in this Competition (“Teams”). A Team may have unlimited participants, but all names must be entered. ML Commons Chairs and Sponsor’s associated institutions are not eligible for prizes, but may participate. 
No natural person can be on multiple teams. This Competition is void wherever such competitions are prohibited. This Competition is subject to all applicable laws, including national, state, and local laws. -The held-out workloads function similarly to a holdout test set discouraging submissions that overfit to the [fixed and known workloads](#fixed-workloads). After the submission deadline, a third party will draw samples from the randomized workloads (e.g. from the set or the distribution) to generate a specific set of held-out workloads. The validation and test targets on each held-out workload will be defined using the [same protocol as the fixed workloads](#defining-target-performance) (with the only change being that only two target-setting training algorithms are used instead of four, to save computational resources) using the same training time budget as the fixed workload they are based on. +## Competition Period -Modifications could, for example, include changing the number of layers or units (drawn from an interval), swapping the activation function (drawn from a set of applicable functions), or using different data augmentations (drawn from a list of possible pre-processing steps). The sample space should be wide enough to discourage submitters from simply trying them all out, but at the same time should be restricted enough to produce realistic workloads with acceptable achievable performances. +The Competition begins at 12:01am (ET) on [date] and ends at 11:59pm (ET) on [date], all according to Sponsor’s time clock, which decisions are final (the “Competition Period”). There are several deadlines contained within the Competition Period: -In the first iteration of this benchmark, we manually designed three different workloads variants for each fixed workload. The variants are designed such that they achieve a comparable performance to the fixed workload and that they might require different hyperparameters to achieve this performance. 
After the submission deadline, one held-out workload will be sampled for each fixed workload. + • Intention to Submit. You must register your Intention to Submit no later than 11:59pm ET on [date]. + • Submission Period. You must complete your Submission and enter it no later than 11:59pm ET on [date]. + • Deadline for specifying the Submission batch sizes for held-out workloads. 11:59pm ET on [date] + • Deadline for self-reporting results: 11:59pm ET on [date] -Our scoring procedure uses the held-out workloads only to penalize submissions that can't handle the introduced modifications (see the [Scoring](#scoring) section for further details). +## Agreement to Official Rules -#### Qualification set +By participating, Teams agree to be fully and unconditionally bound by these Rules, and you represent and warrant that you meet the eligibility requirements set forth herein. In addition, you agree to accept the decisions of Sponsor as final and binding, and waive any right to claim ambiguity in the Competition or these Rules. -The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prize money](/SUBMISSION_PROCESS_RULES.md#awards-and-prize-money). +## How to Enter -The qualification set consists of the same [fixed workloads](#fixed-workloads) as mentioned above, except for both workloads on *ImageNet*, both workloads on *LibriSpeech*, and the *fastMRI* workload. The remaining three workloads (*WMT*, *Criteo 1TB*, and *OGBG*) form the qualification set. There are no [randomized workloads](#randomized-workloads) in the qualification set.
The qualification set of workloads aims to have a combined runtime of roughly 24 hours on the [benchmarking hardware](#benchmarking-hardware). +There are five (5) steps to a successful submission (“Submission”). -For the [external tuning ruleset](#external-tuning-ruleset), we will only use $1$ study instead of the proposed $5$, when evaluating on the qualification set. The [self-tuning ruleset](#self-tuning-ruleset) will use $5$ studies on the qualification set as well since it is computationally cheaper. +Register Intent to Submit. Registration of intent does not obligate you to enter a Submission, but you must register prior to entering your Submission. Click for the Intent Form. This is your “Team,” even if you are a single person. Please note that natural persons may not be on multiple teams, but each Team may enter multiple Submissions. -### Scoring +Develop your Submission. Develop your Submission according to the guidelines set forth in these rules, along with the links to various necessary information. Please note that all Submissions must be entered subject to the Apache 2.0 license. In order to develop your Submission, you must: -Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload. This target performance metric can be the same as the loss function but might also be a different workload-specific metric such as the error rate or BLEU score. The target performance was defined using four standard training algorithms, see the "[Defining target performance](#defining-target-performance)" section for more details. The training time of a submission includes the compilation times for computation graphs and ops that could happen just-in-time during training; all our benchmarks should be fast enough to compile so as not to dramatically impact overall performance. 
The overall ranking is then determined by summarizing the performances across all [fixed workloads](#fixed-workloads), using [performance profiles](#benchmark-score-using-performance-profiles), as explained below. +Fork the Benchmark Codebase. Begin by creating a (public or private) GitHub repository for your contest submission. Once you have submitted, this repository must be a clone of the frozen main branch of the benchmark codebase. Ensure that all elements of the original codebase remain unaltered, with the exception of the /submission directory. -The training time until the target performance on the test set was reached is not used in the scoring procedure but might be used for additional analysis of the competition results. +Preserve the Apache 2 License. You must maintain the same Apache 2 License for your repository as the benchmark codebase. This means you may not change the licensing terms. Submissions that change the terms or otherwise fail to maintain the license will be deemed ineligible submissions. -#### Benchmarking hardware +Define Software Dependencies. If your Submission will have any software dependencies, you must create a requirements.txt file in the /submission directory. This file must clearly list all software dependencies your Submission requires in order to be a valid Submission. The file must be "pip readable" (the dependencies listed can be installed via the pip install -r requirements.txt command). You may not modify the package versions of the software dependencies used by the benchmarking codebase, including using a different version of libraries such as PyTorch or JAX from those specified in the benchmark. -All scored runs have to be performed on the benchmarking hardware to allow for a fair comparison of training times. The benchmarking hardware has to be chosen to be easily accessible via common cloud computing providers.
The exact hardware specification will be specified in the call for submissions and will most likely change with each iteration of the benchmark. As a placeholder, we are currently planning with 8xV100 GPUs with 16GB of VRAM per card, e.g. the [p3.16xlarge instance on AWS](https://aws.amazon.com/ec2/instance-types/) or the [NVIDIA V100 8 GPUs instance on GCP](https://cloud.google.com/compute/docs/gpus#nvidia_v100_gpus). +Complete Your Submission Forms. Please complete the following agreements [links to tm/cla here] (“Agreements”). You will need to attach them to the email in which your Submission is sent to the working group chairs, who will process your Submission. Failure to complete the proper Submission Forms will results in disqualification of your Submission. -For self-reported results, it is acceptable to perform the tuning trials on hardware different from the benchmarking hardware, as long as the same hardware is used for all tuning trials. Once the best trial, i.e. the one that reached the *validation* target the fastest, was determined, this run has to be repeated on the competition hardware. For example, submitters can tune using their locally available hardware but have to use the benchmarking hardware, e.g. via cloud providers, for the $5$ scored runs. This allows for a fair comparison to the reported results of other submitters while allowing some flexibility in the hardware. +Submit Your Entry. During the Submission Period, once your Submission and your is complete, send an email to the working group chairs at [emailaddress] containing the URL of your GitHub repository, along with the name [BENCHMARK COMPETITION 2024] in the subject line, and attach all Agreements. At the close of the Submission Period, your GitHub repository must be public. [We might want to simplify this process by having them fill out another form (instead of asking them for an email). Would it be okay to change this after starting the competition? Yes.] 
-#### Defining target performance +Define the batch sizes for held-out workloads. Once the held-out workloads have been sampled, you have until the "Deadline for specifying the submission batch sizes for held-out workloads" to define the batch sizes for the held-out workloads via the get_batch_size function of your submission. -Target performances on the validation and test sets will be defined for each [workload](#workloads) separately. For the [fixed workloads](#fixed-workloads), we take the best performance achievable by one of four standard algorithms (AdamW, NadamW, Nesterov Momentum, and Heavy Ball Momentum). These target-setting algorithms will follow the general process of the external tuning ruleset, with a slightly larger tuning budget of $200$ trials to guarantee competitive performance. Once the best algorithm and its hyperparameters are determined, training is repeated $20$ times. The median of the best achieved validation errors across seeds is used as the *validation* target. Out of the $10$ repeated runs that achieved this validation target, we took the worst achieved test error across seeds as our *test* target. Taking the median validation performance after rerunning the best hyperparameter point prevents our procedure from selecting a lucky outlier. -To save computational resources, we only tuned two training algorithms instead of four, for the [randomized workloads](#randomized-workloads). For each workload variant, we used NadamW and the other best-performing training algorithm on the corresponding fixed workload the randomized workload is based on. +Report Results. Prior to the Deadline for Self-Reporting Results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by [ how? 
sending an email to emailaddress@xxx] Reported scores must include all unmodified logs that the benchmarking codebase automatically generates in a separate /results directory within the /submission folder. -Both [tuning rulesets](#tuning) will use the same target performances. The runtime of the target-setting algorithms on each workload will be chosen to match published results and is constrained by the overall time budget of roughly a single week for all fixed workloads. The `max_runtime` for submissions on each workload is $\frac{1}{3}$ longer than the runtime of the target-setting algorithms (this `max_runtime` will be three times as much for the self-tuning ruleset, see the [Self-tuning ruleset](#self-tuning-ruleset) section). +## Submission Conditions -#### Benchmark score using performance profiles +All Submissions must meet the requirements of the terms contained in these rules, including reliance on new algorithmic or mathematical ideas and concepts, and must not use software engineering approaches in order to increase primitive operations in PyTorch, JAX, their dependencies, the operating system(s), or the hardware. By entering, all Team members warrant that their Submission does not infringe any third party’s rights, and that Team members have obtained all necessary permissions from all relevant third parties to submit the Submission. If, in the sole discretion of Sponsor, any Submission constitutes copyright or other intellectual property infringement, the Submission will be disqualified. Team must hold all rights through license or ownership to the entire Submission. Team members agree to indemnify Sponsor against any and all claims of infringement from any third party for any use by Sponsor of a Submission.
Team members may not be: 1) represented under contract that would limit or impair Sponsor’s ability to use the Submission; or 2) are under any other contractual relationship, including but not limited to guild and/or union memberships, that may prohibit them from participating fully in this Competition, or from allowing Sponsor to use royalty-free, the Submission worldwide in all media in perpetuity. -We will aggregate the training times of a submission on all fixed workloads using [Performance Profiles](http://www.argmin.net/2018/03/26/performance-profiles/) (originally from [Dolan and Moré](https://arxiv.org/abs/cs/0102001)). Below we surface several relevant definitions from their work for easier readability, before explaining how we integrate the performance profiles to reach a scalar benchmark score that will be used for ranking submissions. +No Submission may depict any offensive or obscene subject matter as determined in Sponsor’s sole discretion. No Submission shall portray Sponsor in a negative light. The Submission will be deemed to be owned equally by all team members, regardless of any agreement between the team members, which will not be honored by Sponsor). A Submission may be disqualified by Sponsor, in its sole discretion, if they violate the spirit and goodwill of the rules, including without limitation, if Sponsor determines a Submission is a slavish copy or derivative work of a third party that was previously developed. Submissions will be disqualified if they circumvent any rules, or protocols, including circumventing the tuning rules by looking up the result of an offline computation performed ahead of time; computing any form of pairwise metrics between the fixed and held-out workloads. Submission may use public APIs in JAX and PyTorch from within the submission function APIs, but may not use APIs to optimize the internals of primitive operations and/or standard dependencies to benefit any Submission. 
-*Notation:* We have a set $\mathcal{S} = \{s_1, s_2, \dots, s_k\}$ of in total $k$ submissions that we evaluate on a set of $n$ fixed workloads: $\mathcal{W} = \{w_1, w_2, \dots, w_n\}$. For each submission $s$ and each workload $w$ we have a training time score $t_{s,w} \in [0,\infty)$. This is the time it took the submission to reach the validation target performance on this particular workload. +## Software Dependencies -##### Computing performance ratios +Submissions must use specific version of PyTorch and JAX, provided by Sponsor. Additional dependencies may be added, provided Teams include a description of the additions and their function. Submissions can include dependencies that support new algorithmic and mathematical ideas provided they do not circumvent the intention of the benchmark in any way that changes measurement of the training speeds. -For all workloads and submissions, we first compute their performance ratio $r$, which is defined for a particular submission $\bar{s}$ and a particular workload $\bar{w}$ to be: +## Scoring -$$r_{\bar{s},\bar{w}} = \frac{t_{\bar{s},\bar{w}}}{\min_{s \in \mathcal{S}} t_{s,\bar{w}}} \in [1,\infty)$$ +All otherwise qualified Submissions shall be scored. Submissions will be scored based on their required training time to reach the target performance on the test set of each workload, using measuring techniques designed to give all Submissions equal parity. In the event that no Submission receives a minimum training time set by judges of XXX, no prizes will be awarded. The Teams with the highest scores will be determined to be winners (“Selected Teams”). In the event of a tie the prize money will be split equally between the winners. -This performance ratio $r_{s,w}$ expresses the "time spent by submission $s$ on workload $w$" relative to the "time spent by the best submission on this workload". E.g. 
If a submission takes twice as long on a particular workload compared to the best submission on this workload it will have a performance ratio of $2$. Lower performance ratios are therefore better, with an optimal ratio of $1$ if the given submission is the fastest on this workload. +## Submissions -##### Building performance profiles +Teams may enter as many Submissions as they like during the Submission Period and all otherwise qualified Submissions will be scored. -Next, we compute how often a submission is within a factor $\tau \in [1,\infty)$ of the optimal submission. For this, we determine the following function for every submission $\bar{s}$: +## Optional -$$\rho_{\bar{s}}(\tau) = \left(\frac{1}{n}\right) \cdot \left[\text{number of workloads where}\, r_{\bar{s},w}\leq \tau\right]$$ +Team members may join the Algorithm mailing group, located here. This mailing group provides information to Teams regarding the status of the Competition. -In other words, we compute the fraction of workloads where a submission $\bar{s}$ is less than $\tau$ away from the optimal submission. The function $\rho_{\bar{s}}(\tau)$ is monotonically increasing with $\tau$ and bounded between $0$ and $1$. +## Physical Review -An example of a performance profiles plot is shown below, where we plot $\rho_{\bar{s}}(\tau)$ for seven "submissions": +All Submissions are subject to human review and testing to determine whether, in Sponsor’s sole and exclusive discretion, any Submission fails to comply with the spirit of the Competition, and is thus disqualified. Both the physical review team and other judges shall be qualified to judge the Competition. -![Example performance profile](.assets/performance_profiles.png) +## Notification -##### Integrating performance profiles for the benchmark score +On or about [date], the Selected Team with the best scores as determined by Sponsor will be notified that they are potential winners of the Competition.
The Selected Team will be notified by either phone or email at the sole discretion of Sponsor or Sponsor’s representative. Selected Team will be required to respond (as directed) to a phone and/or e-mail notification within 72 hours of attempted notification.  The failure to respond timely to the notification may result in forfeiture of the prize; and, in such case, Sponsor may choose the next highest scoring Submission from among the remaining eligible Submissions. Selected Team members will each be required to sign and return a Declaration (or affidavit, at Sponsor’s option) of Eligibility and Liability/Publicity Release (“Declaration”) and any other documents Sponsor or Sponsor’s representative may require within 72 hours of receipt of the Declaration. Failure to timely return a signed Declaration (or failure of a Team member to return it), or any other required documents or the return of any prize notification as undeliverable will result in Prize forfeiture. National and state income taxes may apply and are the sole responsibility of the winner. All expenses not specifically stated as being included are excluded, and are the responsibility of the Selected Teams. No assignment, transfer or substitution of Prize is permitted, however, Sponsor reserves the right to substitute a prize for one of comparable or greater value should Prize become impracticable to award or unavailable for any reason. -To get a scalar score that is usable for ranking submissions, we will integrate the performance profiles $\rho_{\bar{s}}(\tau)$ of all submissions to get their benchmark score $B_{\bar{s}}$, with +## Prizes -$$B_{\bar{s}} = \frac{1}{r_{\text{max}}-1} \int_{1}^{r_{\text{max}}} \rho_{\bar{s}}(\tau) \,d\tau \in [0, 1].$$ +There will be two prizes awarded, one per each ruleset. Prizes will be awarded in US Dollars. Prize will be awarded in cash, or as a gift card, at Sponsor’s option. 
In the event the prize is a gift card, Team will be required to accept the terms and conditions of gift card. Prizes will be divided evenly among enumerated Team members listed as of the date of the Submission. In the event Sponsor is unable to award the prize, as outlined herein, for any reason, Sponsor may substitute a prize of equal or greater value. +"Best Performance ‘external-tuning’" US $25,000 +"Best Performance ‘self- tuning’" US $25,000 -The upper integration limit will be set to $r_{\text{max}} = 4$ which also serves as the upper limit of the performance profile plot. -This means that any submission that requires more than four times the runtime of the fastest submission will not get any credit on this workload compared to a training algorithm that is unable to successfully train within the maximum allowed runtime budget. -The integral is normalized by the total integration area, with higher benchmark scores being better. +## Prize Conditions -##### Using held-out workloads in scoring +For all prizes, all national, state, province, and local taxes and other expenses in connection with the prize not expressly described herein as being awarded are the sole responsibility of the Selected Contestant. Selected Teams are solely responsible for any other unspecified expenses related to prize. Selected Teams cannot assign their prize to another person. No substitution of prize, provided however that Sponsor reserves the right to substitute a prize with another prize of equal or greater value. In the event of noncompliance with the foregoing requirements or if prize notification is returned as undeliverable, prize will be forfeited and, at Sponsor’s discretion, an alternate Selected Teams with the next highest score will be chosen. -For the benchmark score, we compute and integrate the performance profiles using the training times of only the fixed workloads. But we use the submission's performance on the held-out workloads to penalize submissions. 
Specifically, if a submission is unable to train a held-out workload, we score the submission on the corresponding fixed workload as if that submission did not reach the target. In other words, for a submission to receive a finite training time on a fixed workload, it needs to: +Competition is subject to these Official Rules. By participating, Teams agree: (i) to be bound by these complete Official Rules and the decisions of Sponsor which shall be final and binding; and (ii) to waive any right to claim ambiguity in the Competition or these Official Rules, except where prohibited by law. By participating in Competition or by accepting a prize, Selected Team agrees to release Sponsor, including its parent, subsidiary and affiliated entities together with the respective directors, employees, officers, licensees, licensors and agents, and respective advertising and promotion entities and any person or entity associated with the production, judging, or administration of the Competition (collectively, the “Releasees”) from any and all liability, loss or damage arising from or in connection with awarding, receipt and/or use or misuse of prize or participation in any prize-related activities. Releasees shall not be liable for: (i) telephone system, telephone or computer hardware, software or other technical or computer malfunctions, lost connections, disconnections, delays or transmission errors; (ii) data corruption, theft, destruction, unauthorized access to or alteration of entry or other materials; (iii) any injuries, losses or damages of any kind, including death, caused by the use of the prize money, or resulting from acceptance, possession or use of a prize, or from participation in the Competition; or (iv) any printing, typographical, administrative or technological errors in any materials associated with the Competition.  
Sponsor disclaims any liability for damage to any computer system resulting from participating in, or accessing or downloading information, including licenses and other information germane to the running of the Competition or otherwise in connection with this Competition.  Sponsor reserves the right to cancel or suspend the Competition, in its sole discretion, should it receive fewer than XX Submissions, or receive no Submissions that have a judged score above a threshold set by the Sponsor [INSERT LINK HERE], or due to circumstances beyond its control, including natural disasters, pandemic, computer virus, excessive cheating, or any other event that would undermine the fair play of the Competition. Submissions will not be returned and may be destroyed. -- Reach the validation target on the fixed workload within the maximum runtime. -- Reach the validation target fixed workload within 4x of the fastest submission. -- Reach the validation target on the held-out workload (corresponding to the fixed workload) within the maximum runtime. -- Reach the validation target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms. +## Jurisdiction -Only if all four requirements are met, does the submission get a finite score. Otherwise, a submission will receive a training time of infinity. +The internal laws of the State of California in the United States of America will govern disputes regarding these Official Rules and/or this Contest. All cases and claims pertaining to this Contest must be brought in a court of competent jurisdiction in the City of San Francisco. 
-This essentially means that being unable to successfully train a held-out workload can "disqualify" a submission from getting a good score on the fixed workload it is based on. In other words, we require submissions to be robust enough to handle workload variations. This protocol ensures that we prioritize the fixed workloads for scoring since they are the most relevant version of that workload in practice. However, we also protect our benchmark from egregious workload-specific tuning and penalize brittle methods that break with slight modifications of the workload. +## Cancellation and Modification -##### Alternative scores +Sponsor reserves the right, in its sole discretion, to cancel, modify or suspend the Competition should a virus, bug, computer problem, unauthorized intervention or other causes beyond Sponsor’s control, corrupt the administration, security or proper play of the Competition. Sponsor reserves the right to cancel the competition should it receive fewer than two (2) prize money-eligible submissions per ruleset, or which are not above a threshold score as noted in these rules. Sponsor may prohibit an entrant Team (or a single person) from participating in the Competition or winning prize if, in its sole discretion, it determines such entrant is attempting to undermine the legitimate operation of the Competition in any way by cheating, hacking, deception, or any other unfair practices, including intention to annoy, abuse, threaten or harass any other competitors or Sponsor representatives. Any attempts to circumvent safeguards and benchmarks will result in disqualification, including the relevant IP address becoming ineligible for the entire Competition. Caution: any attempt to deliberately damage or undermine the legitimate operation of the Competition may be in violation of criminal and civil laws and will result in disqualification from participation in the contest. 
Should such an attempt be made, Sponsor reserves the right to seek remedies and damages (including attorney fees) to the fullest extent of the law, including criminal prosecution. -Performance profiles and the benchmark score derived from them, take a bit of effort to explain. -However, we believe that they are fairer and well-supported by research in machine learning and the optimization community. To have some simpler to interpret numbers, e.g. for press releases, we will also release a series of alternative scores. +## Publicity -For a given workload $\bar{w}$, we define the "speedup of a submission $\bar{s}$ over the target-setting reference" as $\frac{t_{\text{ref}, \bar{w}}}{t_{\bar{s}, \bar{w}}}$. For example, if a submission was 2x faster than the target-setting reference, this would be equal to 2. In addition to the raw $t_{s,w}$ values, we will release the geometric mean of the speedups across all workloads, i.e. $\left(\prod_{w \in \mathcal{W}} \frac{t_{\text{ref}, w}}{t_{\bar{s}, w}}\right)^{\frac{1}{n}}$. +Except where prohibited, all entrants agree that Sponsor, its shareholders, agents and representatives, affiliates, subsidiaries, advertising, promotion and fulfillment agencies, and legal advisors are not responsible or liable for, and shall be released and held harmless from any and all losses, damages, rights, claims and actions of any kind in connection with or resulting from participation in the Contest, or acceptance of the prize, including without limitation, claims based on publicity rights, defamation, or invasion of privacy. 
Except where prohibited by law, Sponsor reserves the right to use the Submissions to the Competition, in whole or in part, for publicity purposes prior to, during, or after the Competition, in any media, and to use the name, likeness, hometown name, of any Contestant, including all or part of their Submission throughout the world, in perpetuity, without any compensation or prior review unless specifically prohibited by law. Except as outlined herein for winners, Teams and their members will not be paid for their Submissions or for granting Sponsor any of these rights. Should any Selected Team be unwilling or otherwise unable to provide permissions and or releases or otherwise cannot accept or receive the prize for any reason, the Selected Team with the next highest score will be chosen from the remaining entries until one who is able to meet all requirements can be selected -### Benchmark Procedure +## Privacy -For a description of how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Call for submissions](CALL_FOR_SUBMISSIONS.md), which details the entire competition process. +All personal information collected by Sponsor will be used for administration of the Competition. In addition, Team members may receive email correspondence from, or on behalf of Sponsor, via electronic communication relating to the Competition.   All personal information will be held on servers located in the United States.  Sponsor will use reasonable commercial efforts to comply with Federal CAN-SPAM guidelines and other privacy guidelines, and US residents may receive commercial communications, which they may subsequently opt-out of receiving further advertising emails by following the opt-out instructions contained in any email communications received. -## Model Track +## Official Rules and Winners List -🚧 **Coming soon!** 🚧 +For a copy of these Official Rules or of the winner(s) of this Competition, send your request via email to [email address]. 
The request must be received within 90 days of the Competition end date. Please allow a reasonable time for a response. From 2bfdc31d7c462349d4be46b45bbfac5e5b669567 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 14 Nov 2023 16:24:05 +0100 Subject: [PATCH 06/27] Modify rules --- RULES.md | 82 +++++++++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/RULES.md b/RULES.md index 0126c7ead..041a45400 100644 --- a/RULES.md +++ b/RULES.md @@ -25,64 +25,61 @@ ## Goal -To discover new training algorithms that can train general (not customized) neural networks faster. Sponsor will use an objective measuring program to allocate a score to each entry (“Submission”) and determine [xxx] winners, each of which will be eligible to win a prize. +To discover new training algorithms that can train general neural networks faster. Sponsor will use an objective measuring program to allocate a score to each entry ("Submission") and determine two winners (one in each ruleset), each of which will be eligible to win a prize. ## Sponsor -This Competition (“Competition”) is sponsored by [MLCommons and ___________], [public facing address]. +This Competition ("Competition") is sponsored by MLCommons (649 Mission Street, 5th Floor San Francisco, CA 94105, USA). ## Eligibility -The Competition is open to English-speaking individuals and teams (made of individuals), who are the age of majority as of the Competition start date, have internet access, a GitHub account in good standing, and can legally participate in this Competition (“Teams”). A Team may have unlimited participants, but all names must be entered. ML Commons Chairs and Sponsor’s associated institutions are not eligible for prizes, but may participate. No natural person can be on multiple teams. This Competition is void wherever such competitions are prohibited.
This Competition is subject to all applicable laws, including national, state, and local laws. +The Competition is open to English-speaking individuals and teams (made of individuals), who are the age of majority as of the Competition start date, have internet access, a GitHub account in good standing, and can legally participate in this Competition ("Teams"). A Team may have unlimited participants, but all names must be entered. MLCommons Chairs and Sponsor's associated institutions are not eligible for prizes, but may participate. No natural person can be on multiple teams. This Competition is void wherever such competitions are prohibited. This Competition is subject to all applicable laws, including national, state, and local laws. ## Competition Period -The Competition begins at 12:01am (ET) on [date] and ends at 11:59pm (ET) on [date], all according to Sponsor’s time clock, which decisions are final (the “Competition Period”). There are several deadlines contained within the Competition Period: +The Competition begins at 12:01am (ET) on November 21, 2023 and ends at 11:59pm (ET) on May 21, 2024, all according to Sponsor's time clock, which decisions are final (the "Competition Period"). There are several deadlines contained within the Competition Period: - • Intention to Submit. You must register your Intention to Submit no later than 11:59pm ET on [date]. - • Submission Period. You must complete your Submission and enter it no later than 11:59pm ET on [date]. - • Deadline for specifying the Submission batch sizes for held-out workloads. 11:59pm ET on [date] - • Deadline for self-reporting results: 11:59pm ET on [date] +- **Intention to Submit.** You must register your Intention to Submit no later than 11:59pm ET on January 21, 2024. +- **Submission Period.** You must complete your Submission and enter it after the Intention to Submit deadline, but no later than 11:59pm ET on March 21, 2024. 
+- **Deadline for specifying the Submission batch sizes for held-out workloads.** 11:59pm ET on April 4, 2024. +- **Deadline for self-reporting results.** 11:59pm ET on May 21, 2024. ## Agreement to Official Rules -By participating, Teams agree to be fully unconditionally bound by these Rules, and you represent and warrant that you meet the eligibility requirements set forth herein. In addition, you agree to accept the decisions of Sponsor, as final and binding, and waive any right to claim ambiguity in the Competition or these Rules. +By participating, Teams agree to be fully unconditionally bound by these Rules, and you represent and warrant that you meet the eligibility requirements set forth herein. In addition, you agree to accept the decisions of Sponsor, as final and binding, and waive any right to claim ambiguity in the Competition or these Rules. ## How to Enter -There are five (5) steps to a successful submission (“Submission”). +There are five (5) steps to a successful submission ("Submission"). -Register Intent to Submit. Registration of intent does not obligate you to enter a Submission, but you must register prior to entering your Submission. Click for the Intent Form. This is your “Team,” even if you are a single person. Please note that natural persons may not be on multiple teams, but each Team may enter multiple Submissions. +1. **Register Intent to Submit.** Registration of intent does not obligate you to enter a Submission, but you must register prior to entering your Submission. Click for the [Intent Form](https://forms.gle/K7ty8MaYdi2AxJ4N8). This is your "Team," even if you are a single person. Please note that natural persons may not be on multiple teams, but each Team may enter multiple Submissions. +2. **Develop your Submission.** Develop your Submission according to the guidelines set forth in these rules, along with the links to various necessary information. 
Please note that all Submissions must be entered subject to the Apache 2.0 license. In order to develop your Submission, you must: + - *Fork the Benchmark Codebase.* Begin by creating a (public or private) GitHub repository for your contest submission. Once you have submitted, this repository must be a clone of the frozen main branch of the benchmark codebase. Ensure that all elements of the original codebase remain unaltered, with the exception of the `/submission` directory. + - *Preserve the Apache 2 License.* You must maintain the same Apache 2 License for your repository as the benchmark codebase. This means you may not change the licensing terms. Submissions that change the terms or otherwise fail to maintain the license will be deemed ineligible submissions. + - *Define Software Dependencies.* If your Submission will have any software dependencies, you must create a `requirements.txt` file in the `/submission` directory. This file must clearly list all software dependencies your Submission requires in order to be a valid Submission. The file must be "pip readable" (the dependencies listed can be installed via the `pip install -r requirements.txt` command). You may not modify the package versions of the software dependencies used by the benchmarking codebase, including using a different version of libraries such as PyTorch or JAX from those specified in the benchmark. +3. **Submit Your Entry & Complete Your Submission Forms.** During the Submission Period, once your Submission is complete, submit it using the [Submission Form](https://forms.gle/yXQqwJ6Nm6sszPw49). The submission form must contain the URL of your submission's GitHub repository and the following agreements: + - A signed [Contributor License Agreement (CLA) "Corporate CLA"](https://mlcommons.org/en/policies/) of MLCommons. + - *Either* a membership in MLCommons *or* a signed [non-member test agreement](https://mlcommons.org/en/policies/).
+ - A signed trademark license agreement, either the member or the non-member version, as appropriate. These license agreements are available upon request to [support@mlcommons.org](mailto:support@mlcommons.org). -Develop your Submission. Develop your Submission according to the guidelines set forth in these rules, along with the links to various necessary information. Please note that all Submissions must be entered subject to the Apache 2.0 license. In order to develop your Submission, you must: + The form is sent to the working group chairs, who will process your Submission. Failure to complete the proper Submission Forms will result in disqualification of your Submission. At the close of the Submission Period, your GitHub repository must be public. -Fork the Benchmark Codebase. Begin by creating a (public or private) GitHub repository for your contest submission. Once you submitted, this repository must be a clone of the frozen main branch of the benchmark codebase. Ensure that all elements of the original codebase remain unaltered, with the exception of the /submission directory. - -Preserve the Apache 2 License. You must maintain the same Apache 2 License for your repository as the benchmark codebase. This means you may not change the licensing terms. Submissions that change the terms or otherwise fail to maintain the license, will be deemed ineligible submissions. - -Define Software Dependencies. If your Submission will have any software dependencies, you must create a requirements.txt file in the /submission directory. This file must clearly list all software dependencies your Submission requires in order to be a valid Submission. File must be "pip readable" (the dependencies listed can be installed via the pip install -r requirements.txt command). 
You may not modify the package versions of the software dependencies used by the benchmarking codebase, including using a different version of libraries such as PyTorch or JAX from those specified in the benchmark. - -Complete Your Submission Forms. Please complete the following agreements [links to tm/cla here] (“Agreements”). You will need to attach them to the email in which your Submission is sent to the working group chairs, who will process your Submission. Failure to complete the proper Submission Forms will results in disqualification of your Submission. - -Submit Your Entry. During the Submission Period, once your Submission and your is complete, send an email to the working group chairs at [emailaddress] containing the URL of your GitHub repository, along with the name [BENCHMARK COMPETITION 2024] in the subject line, and attach all Agreements. At the close of the Submission Period, your GitHub repository must be public. [We might want to simplify this process by having them fill out another form (instead of asking them for an email). Would it be okay to change this after starting the competition? Yes.] - -Define the batch sizes for held-out workloads. Once the held-out workloads have been sampled, you have until the "Deadline for specifying the submission batch sizes for held-out workloads" to define the batch sizes for the held-out workloads via the get_batch_size function of your submission. - -Report Results. Prior to the Deadline for Self-Reporting Results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by [ how? sending an email to emailaddress@xxx] Reported scores must include all unmodified logs that the benchmarking codebase automatically generates in a separate /results directory within the /submission folder. +4. 
**Define the batch sizes for held-out workloads.** Once the held-out workloads have been sampled, you have until the "Deadline for specifying the Submission batch sizes for held-out workloads" to define the batch sizes for the held-out workloads via the `get_batch_size` function of your submission. +5. **Report Results.** Prior to the Deadline for self-reporting results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by uploading all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder of your Submission's GitHub repository. ## Submission Conditions -All Submissions must meet the requirements of the terms contained in these rules, including reliance on new algorithmic or mathematical ideas and concepts, and must not use software engineering approaches in order to increase primitive operations in PyTorch, JAX, their dependencies, the operating system9s0, or the hardware. By entering, all Team members warrant that their Submission does not infringe any third party’s rights, and that Team members have obtained all necessary permissions from all relevant third parties to submit the Submission. If, in the sole discretion of Sponsor, any Submission constitutes copyright or other intellectual property infringement, the Submission will be disqualified. Team must hold all rights through license or ownership to the entire Submission. Team members agree to indemnify Sponsor against any and all claims of infringement from any third party for any use by Sponsor of a Submission. 
Team members may not be: 1) represented under contract that would limit or impair Sponsor’s ability to use the Submission; or 2) are under any other contractual relationship, including but not limited to guild and/or union memberships, that may prohibit them from participating fully in this Competition, or from allowing Sponsor to use royalty-free, the Submission worldwide in all media in perpetuity. +All Submissions must meet the requirements of the terms contained in these rules, including reliance on new algorithmic or mathematical ideas and concepts, and must not use software engineering approaches in order to increase primitive operations in PyTorch, JAX, their dependencies, the operating systems, or the hardware. By entering, all Team members warrant that their Submission does not infringe any third party's rights, and that Team members have obtained all necessary permissions from all relevant third parties to submit the Submission. If, in the sole discretion of Sponsor, any Submission constitutes copyright or other intellectual property infringement, the Submission will be disqualified. Team must hold all rights through license or ownership to the entire Submission. Team members agree to indemnify Sponsor against any and all claims of infringement from any third party for any use by Sponsor of a Submission. Team members may not be: 1) represented under contract that would limit or impair Sponsor's ability to use the Submission; or 2) are under any other contractual relationship, including but not limited to guild and/or union memberships, that may prohibit them from participating fully in this Competition, or from allowing Sponsor to use royalty-free, the Submission worldwide in all media in perpetuity. -No Submission may depict any offensive or obscene subject matter as determined in Sponsor’s sole discretion. No Submission shall portray Sponsor in a negative light. 
The Submission will be deemed to be owned equally by all team members, regardless of any agreement between the team members, which will not be honored by Sponsor). A Submission may be disqualified by Sponsor, in its sole discretion, if they violate the spirit and goodwill of the rules, including without limitation, if Sponsor determines a Submission is a slavish copy or derivative work of a third party that was previously developed. Submissions will be disqualified if they circumvent any rules, or protocols, including circumventing the tuning rules by looking up the result of an offline computation performed ahead of time; computing any form of pairwise metrics between the fixed and held-out workloads. Submission may use public APIs in JAX and PyTorch from within the submission function APIs, but may not use APIs to optimize the internals of primitive operations and/or standard dependencies to benefit any Submission. +No Submission may depict any offensive or obscene subject matter as determined in Sponsor's sole discretion. No Submission shall portray Sponsor in a negative light. The Submission will be deemed to be owned equally by all team members, regardless of any agreement between the team members, which will not be honored by Sponsor. A Submission may be disqualified by Sponsor, in its sole discretion, if they violate the spirit and goodwill of the rules, including without limitation, if Sponsor determines a Submission is a slavish copy or derivative work of a third party that was previously developed. Submissions will be disqualified if they circumvent any rules, or protocols, including circumventing the tuning rules by looking up the result of an offline computation performed ahead of time; computing any form of pairwise metrics between the fixed and held-out workloads. 
Submission may use public APIs in JAX and PyTorch from within the submission function APIs, but may not use APIs to optimize the internals of primitive operations and/or standard dependencies to benefit any Submission. ## Software Dependencies -Submissions must use specific version of PyTorch and JAX, provided by Sponsor. Additional dependencies may be added, provided Teams include a description of the additions and their function. Submissions can include dependencies that support new algorithmic and mathematical ideas provided they do not circumvent the intention of the benchmark in any way that changes measurement of the training speeds. +Submissions must use specific versions of PyTorch and JAX, provided by Sponsor. Additional dependencies may be added, provided Teams include a description of the additions and their function. Submissions can include dependencies that support new algorithmic and mathematical ideas provided they do not circumvent the intention of the benchmark in any way that changes measurement of the training speeds. ## Scoring -All otherwise qualified Submissions shall be scored. Submissions will be scored based on their required training time to reach the target performance on the test set of each workload, using measuring techniques designed to give all Submissions equal parity. In the event that no Submission receives a minimum training time set by judges of XXX, no prizes will be awarded. The Teams with the highest scores will be determined to be winners (“Selected Teams”). In the event of a tie the prize money will be split equally between the winners. +All otherwise qualified Submissions shall be scored. Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload, using measuring techniques designed to give all Submissions equal parity. 
In the event that no Submission receives a score exceeding that of the [NAdamW baseline](https://github.com/mlcommons/algorithmic-efficiency/tree/dev/baselines/nadamw), no prizes will be awarded. The Teams with the highest scores will be determined to be winners ("Selected Teams"). In the event of a tie the prize money will be split equally between the winners. ## Submissions @@ -90,44 +87,45 @@ Teams may enter as many Submissions as they like during the Submission Period an ## Optional -Team members may join the Algorithm mailing group, located here. This mailing group provides information to Teams regarding the status of the Competition. +Team members may join the Algorithm mailing group, located [here](https://groups.google.com/u/4/a/mlcommons.org/g/algorithms). This mailing group provides information to Teams regarding the status of the Competition. ## Physical Review -All Submission are subject to human review and testing to determine whether, in Sponsor’s sole and exclusive discretion, any Submission fails to comply with the spirit of the Competition, and is thus disqualified. Both physical review team and other judges shall be qualified to judge the Competition. +All Submission are subject to human review and testing to determine whether, in Sponsor's sole and exclusive discretion, any Submission fails to comply with the spirit of the Competition, and is thus disqualified. Both physical review team and other judges shall be qualified to judge the Competition. ## Notification -On or about [date], the Selected Team with the best scores as determined by Sponsor will be notified that they are potential winners of the Competition. The Selected Team will be notified by either phone or email at the sole discretion of Sponsor or Sponsor’s representative. Selected Team will be required to respond (as directed) to a phone and/or e-mail notification within 72 hours of attempted notification.  
The failure to respond timely to the notification may result in forfeiture of the prize; and, in such case, Sponsor may choose the next highest scoring Submission from among the remaining eligible Submissions. Selected Team members will each be required to sign and return a Declaration (or affidavit, at Sponsor’s option) of Eligibility and Liability/Publicity Release (“Declaration”) and any other documents Sponsor or Sponsor’s representative may require within 72 hours of receipt of the Declaration. Failure to timely return a signed Declaration (or failure of a Team member to return it), or any other required documents or the return of any prize notification as undeliverable will result in Prize forfeiture. National and state income taxes may apply and are the sole responsibility of the winner. All expenses not specifically stated as being included are excluded, and are the responsibility of the Selected Teams. No assignment, transfer or substitution of Prize is permitted, however, Sponsor reserves the right to substitute a prize for one of comparable or greater value should Prize become impracticable to award or unavailable for any reason. +On or about June 30, 2024, the Selected Team with the best scores as determined by Sponsor will be notified that they are potential winners of the Competition. The Selected Team will be notified by either phone or email at the sole discretion of Sponsor or Sponsor's representative. Selected Team will be required to respond (as directed) to a phone and/or e-mail notification within 72 hours of attempted notification. The failure to respond timely to the notification may result in forfeiture of the prize; and, in such case, Sponsor may choose the next highest scoring Submission from among the remaining eligible Submissions. 
Selected Team members will each be required to sign and return a Declaration (or affidavit, at Sponsor's option) of Eligibility and Liability/Publicity Release ("Declaration") and any other documents Sponsor or Sponsor's representative may require within 72 hours of receipt of the Declaration. Failure to timely return a signed Declaration (or failure of a Team member to return it), or any other required documents or the return of any prize notification as undeliverable will result in Prize forfeiture. National and state income taxes may apply and are the sole responsibility of the winner. All expenses not specifically stated as being included are excluded, and are the responsibility of the Selected Teams. No assignment, transfer or substitution of Prize is permitted, however, Sponsor reserves the right to substitute a prize for one of comparable or greater value should Prize become impracticable to award or unavailable for any reason. ## Prizes -There will be two prizes awarded, one per each ruleset. Prizes will be awarded in US Dollars. Prize will be awarded in cash, or as a gift card, at Sponsor’s option. In the event the prize is a gift card, Team will be required to accept the terms and conditions of gift card. Prizes will be divided evenly among enumerated Team members listed as of the date of the Submission. In the event Sponsor is unable to award the prize, as outlined herein, for any reason, Sponsor may substitute a prize of equal or greater value. -"Best Performance ‘external-tuning’" US $25,000 -"Best Performance ‘self- tuning’" US $25,000 +There will be two prizes awarded, one per each ruleset. Prizes will be awarded in US Dollars. Prize will be awarded in cash, or as a gift card, at Sponsor's option. In the event the prize is a gift card, Team will be required to accept the terms and conditions of gift card. Prizes will be divided evenly among enumerated Team members listed as of the date of the Submission. 
In the event Sponsor is unable to award the prize, as outlined herein, for any reason, Sponsor may substitute a prize of equal or greater value. + +- "Best Performance '*external-tuning*'": US $25,000 +- "Best Performance '*self-tuning*'": US $25,000 ## Prize Conditions -For all prizes, all national, state, province, and local taxes and other expenses in connection with the prize not expressly described herein as being awarded are the sole responsibility of the Selected Contestant. Selected Teams are solely responsible for any other unspecified expenses related to prize. Selected Teams cannot assign their prize to another person. No substitution of prize, provided however that Sponsor reserves the right to substitute a prize with another prize of equal or greater value. In the event of noncompliance with the foregoing requirements or if prize notification is returned as undeliverable, prize will be forfeited and, at Sponsor’s discretion, an alternate Selected Teams with the next highest score will be chosen. +For all prizes, all national, state, province, and local taxes and other expenses in connection with the prize not expressly described herein as being awarded are the sole responsibility of the Selected Contestant. Selected Teams are solely responsible for any other unspecified expenses related to prize. Selected Teams cannot assign their prize to another person. No substitution of prize, provided however that Sponsor reserves the right to substitute a prize with another prize of equal or greater value. In the event of noncompliance with the foregoing requirements or if prize notification is returned as undeliverable, prize will be forfeited and, at Sponsor's discretion, an alternate Selected Team with the next highest score will be chosen. -Competition is subject to these Official Rules. 
By participating, Teams agree: (i) to be bound by these complete Official Rules and the decisions of Sponsor which shall be final and binding; and (ii) to waive any right to claim ambiguity in the Competition or these Official Rules, except where prohibited by law. By participating in Competition or by accepting a prize, Selected Team agrees to release Sponsor, including its parent, subsidiary and affiliated entities together with the respective directors, employees, officers, licensees, licensors and agents, and respective advertising and promotion entities and any person or entity associated with the production, judging, or administration of the Competition (collectively, the “Releasees”) from any and all liability, loss or damage arising from or in connection with awarding, receipt and/or use or misuse of prize or participation in any prize-related activities. Releases shall not be liable for: (i) telephone system, telephone or computer hardware, software or other technical or computer malfunctions, lost connections, disconnections, delays or transmission errors; (ii) data corruption, theft, destruction, unauthorized access to or alteration of entry or other materials; (iii) any injuries, losses or damages of any kind, including death, caused by the use of the prize money, or resulting from acceptance, possession or use of a prize, or from participation in the Competition; or (iv) any printing, typographical, administrative or technological errors in any materials associated with the Competition.  Sponsor disclaims any liability for damage to any computer system resulting from participating in, or accessing or downloading information, including licenses and other information germane to the running of the Competition or otherwise in connection with this Competition.  
Sponsor reserves the right to cancel or suspend the Competition, in its sole discretion, should it receive fewer than XX Submissions, or receive no Submissions that have a judged score above a threshold set by the Sponsor [INSERT LINK HERE], or due to circumstances beyond its control, including natural disasters, pandemic, computer virus, excessive cheating, or any other event that would undermine the fair play of the Competition. Submissions will not be returned and may be destroyed. +Competition is subject to these Official Rules. By participating, Teams agree: (i) to be bound by these complete Official Rules and the decisions of Sponsor which shall be final and binding; and (ii) to waive any right to claim ambiguity in the Competition or these Official Rules, except where prohibited by law. By participating in Competition or by accepting a prize, Selected Team agrees to release Sponsor, including its parent, subsidiary and affiliated entities together with the respective directors, employees, officers, licensees, licensors and agents, and respective advertising and promotion entities and any person or entity associated with the production, judging, or administration of the Competition (collectively, the "Releasees") from any and all liability, loss or damage arising from or in connection with awarding, receipt and/or use or misuse of prize or participation in any prize-related activities. 
Releasees shall not be liable for: (i) telephone system, telephone or computer hardware, software or other technical or computer malfunctions, lost connections, disconnections, delays or transmission errors; (ii) data corruption, theft, destruction, unauthorized access to or alteration of entry or other materials; (iii) any injuries, losses or damages of any kind, including death, caused by the use of the prize money, or resulting from acceptance, possession or use of a prize, or from participation in the Competition; or (iv) any printing, typographical, administrative or technological errors in any materials associated with the Competition. Sponsor disclaims any liability for damage to any computer system resulting from participating in, or accessing or downloading information, including licenses and other information germane to the running of the Competition or otherwise in connection with this Competition. Sponsor reserves the right to cancel or suspend the Competition, in its sole discretion, should it receive fewer than two (2) prize money-eligible Submissions per ruleset, or receive no Submissions that have a judged score above a threshold set by the Sponsor, or due to circumstances beyond its control, including natural disasters, pandemic, computer virus, excessive cheating, or any other event that would undermine the fair play of the Competition. Submissions will not be returned and may be destroyed. ## Jurisdiction -The internal laws of the State of California in the United States of America will govern disputes regarding these Official Rules and/or this Contest. All cases and claims pertaining to this Contest must be brought in a court of competent jurisdiction in the City of San Francisco. +The internal laws of the State of California in the United States of America will govern disputes regarding these Official Rules and/or this Contest. 
All cases and claims pertaining to this Contest must be brought in a court of competent jurisdiction in the City of San Francisco. ## Cancellation and Modification -Sponsor reserves the right, in its sole discretion, to cancel, modify or suspend the Competition should a virus, bug, computer problem, unauthorized intervention or other causes beyond Sponsor’s control, corrupt the administration, security or proper play of the Competition. Sponsor reserves the right to cancel the competition should it receive fewer than two (2) prize money-eligible submissions per ruleset, or which are not above a threshold score as noted in these rules. Sponsor may prohibit an entrant Team (or a single person) from participating in the Competition or winning prize if, in its sole discretion, it determines such entrant is attempting to undermine the legitimate operation of the Competition in any way by cheating, hacking, deception, or any other unfair practices, including intention to annoy, abuse, threaten or harass any other competitors or Sponsor representatives. Any attempts to circumvent safeguards and benchmarks will result in disqualification, including the relevant IP address becoming ineligible for the entire Competition. Caution: any attempt to deliberately damage or undermine the legitimate operation of the Competition may be in violation of criminal and civil laws and will result in disqualification from participation in the contest. Should such an attempt be made, Sponsor reserves the right to seek remedies and damages (including attorney fees) to the fullest extent of the law, including criminal prosecution. +Sponsor reserves the right, in its sole discretion, to cancel, modify or suspend the Competition should a virus, bug, computer problem, unauthorized intervention or other causes beyond Sponsor's control, corrupt the administration, security or proper play of the Competition. 
Sponsor reserves the right to cancel the competition should it receive fewer than two (2) prize money-eligible submissions per ruleset, or which are not above a threshold score as noted in these rules. Sponsor may prohibit an entrant Team (or a single person) from participating in the Competition or winning prize if, in its sole discretion, it determines such entrant is attempting to undermine the legitimate operation of the Competition in any way by cheating, hacking, deception, or any other unfair practices, including intention to annoy, abuse, threaten or harass any other competitors or Sponsor representatives. Any attempts to circumvent safeguards and benchmarks will result in disqualification, including the relevant IP address becoming ineligible for the entire Competition. Caution: any attempt to deliberately damage or undermine the legitimate operation of the Competition may be in violation of criminal and civil laws and will result in disqualification from participation in the contest. Should such an attempt be made, Sponsor reserves the right to seek remedies and damages (including attorney fees) to the fullest extent of the law, including criminal prosecution. ## Publicity -Except where prohibited, all entrants agree that Sponsor, its shareholders, agents and representatives, affiliates, subsidiaries, advertising, promotion and fulfillment agencies, and legal advisors are not responsible or liable for, and shall be released and held harmless from any and all losses, damages, rights, claims and actions of any kind in connection with or resulting from participation in the Contest, or acceptance of the prize, including without limitation, claims based on publicity rights, defamation, or invasion of privacy. 
Except where prohibited by law, Sponsor reserves the right to use the Submissions to the Competition, in whole or in part, for publicity purposes prior to, during, or after the Competition, in any media, and to use the name, likeness, hometown name, of any Contestant, including all or part of their Submission throughout the world, in perpetuity, without any compensation or prior review unless specifically prohibited by law. Except as outlined herein for winners, Teams and their members will not be paid for their Submissions or for granting Sponsor any of these rights. Should any Selected Team be unwilling or otherwise unable to provide permissions and or releases or otherwise cannot accept or receive the prize for any reason, the Selected Team with the next highest score will be chosen from the remaining entries until one who is able to meet all requirements can be selected +Except where prohibited, all entrants agree that Sponsor, its shareholders, agents and representatives, affiliates, subsidiaries, advertising, promotion and fulfillment agencies, and legal advisors are not responsible or liable for, and shall be released and held harmless from any and all losses, damages, rights, claims and actions of any kind in connection with or resulting from participation in the Contest, or acceptance of the prize, including without limitation, claims based on publicity rights, defamation, or invasion of privacy. Except where prohibited by law, Sponsor reserves the right to use the Submissions to the Competition, in whole or in part, for publicity purposes prior to, during, or after the Competition, in any media, and to use the name, likeness, hometown name, of any Contestant, including all or part of their Submission throughout the world, in perpetuity, without any compensation or prior review unless specifically prohibited by law. 
Except as outlined herein for winners, Teams and their members will not be paid for their Submissions or for granting Sponsor any of these rights. Should any Selected Team be unwilling or otherwise unable to provide permissions and or releases or otherwise cannot accept or receive the prize for any reason, the Selected Team with the next highest score will be chosen from the remaining entries until one who is able to meet all requirements can be selected ## Privacy -All personal information collected by Sponsor will be used for administration of the Competition. In addition, Team members may receive email correspondence from, or on behalf of Sponsor, via electronic communication relating to the Competition.   All personal information will be held on servers located in the United States.  Sponsor will use reasonable commercial efforts to comply with Federal CAN-SPAM guidelines and other privacy guidelines, and US residents may receive commercial communications, which they may subsequently opt-out of receiving further advertising emails by following the opt-out instructions contained in any email communications received. +All personal information collected by Sponsor will be used for administration of the Competition. In addition, Team members may receive email correspondence from, or on behalf of Sponsor, via electronic communication relating to the Competition. All personal information will be held on servers located in the United States. Sponsor will use reasonable commercial efforts to comply with Federal CAN-SPAM guidelines and other privacy guidelines, and US residents may receive commercial communications, which they may subsequently opt-out of receiving further advertising emails by following the opt-out instructions contained in any email communications received. ## Official Rules and Winners List -For a copy of these Official Rules or of the winner(s) of this Competition, send your request via email to [email address]. 
The Request and the request must be received within 90 days of the Competition end date. Please allow a reasonable time for a response. +For a copy of these Official Rules or of the winner(s) of this Competition, send your request via email to [algorithms-chairs@mlcommons.org](mailto:algorithms-chairs@mlcommons.org). The request must be received within 90 days of the Competition end date. Please allow a reasonable time for a response. From b93daac1f67aae00ab1f03c2bd955380cc2b4da9 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 14 Nov 2023 17:46:10 +0100 Subject: [PATCH 07/27] Reorganize technical documentation --- CALL_FOR_SUBMISSIONS.md | 3 - CHANGELOG.md | 6 +- CONTRIBUTING.md | 48 +-- DOCUMENTATION.md | 586 ++++++++++++++++++++++++++++++++++++ GETTING_STARTED.md | 248 ++++++++++++--- README.md | 311 +++---------------- RULES.md | 2 + SUBMISSION_PROCESS_RULES.md | 171 ----------- 8 files changed, 860 insertions(+), 515 deletions(-) delete mode 100644 CALL_FOR_SUBMISSIONS.md create mode 100644 DOCUMENTATION.md delete mode 100644 SUBMISSION_PROCESS_RULES.md diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md deleted file mode 100644 index ecc7840e7..000000000 --- a/CALL_FOR_SUBMISSIONS.md +++ /dev/null @@ -1,3 +0,0 @@ -# MLCommons™ AlgoPerf: Call for Submissions - -🚧 **Coming soon!** 🚧 diff --git a/CHANGELOG.md b/CHANGELOG.md index b71e42e01..f8c3db0e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ -# Change log +# Change Log -## TODO: algorithmic-efficiency 0.1.0 +## [0.1.0] - 2023-11-21 -First release of AlgoPerf benchmarking code. +First release of the AlgoPerf: Training algorithms benchmarking code. 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 025cb6d30..b22cb5f3a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,15 +2,16 @@ ## Table of Contents -- [Setup](#setup) +- [Contributing to MLCommons](#contributing-to-mlcommons) +- [Setup for Contributing](#setup-for-contributing) - [Setting up a Linux VM on GCP](#setting-up-a-linux-vm-on-gcp) - [Installing GPU Drivers](#installing-gpu-drivers) - [Authentication for Google Cloud Container Registry](#authentication-for-google-cloud-container-registry) - [Installation](#installation) -- [Docker workflows](#docker-workflows) +- [Docker Workflows](#docker-workflows) - [Pre-built Images on Google Cloud Container Registry](#pre-built-images-on-google-cloud-container-registry) - - [Trigger rebuild and push of maintained images](#trigger-rebuild-and-push-of-maintained-images) - - [Trigger build and push of images on other branch](#trigger-build-and-push-of-images-on-other-branch) + - [Trigger Rebuild and Push of Maintained Images](#trigger-rebuild-and-push-of-maintained-images) + - [Trigger Build and Push of Images on Other Branch](#trigger-build-and-push-of-images-on-other-branch) - [GCP Data and Experiment Integration](#gcp-data-and-experiment-integration) - [Downloading Data from GCP](#downloading-data-from-gcp) - [Saving Experiments to GCP](#saving-experiments-to-gcp) @@ -19,10 +20,12 @@ - [Submitting PRs](#submitting-prs) - [Testing](#testing) - [Style Testing](#style-testing) - - [Unit and integration tests](#unit-and-integration-tests) - - [Regression tests](#regression-tests) + - [Unit and Integration Tests](#unit-and-integration-tests) + - [Regression Tests](#regression-tests) -We invite everyone to look through our rules and codebase and submit issues and pull requests, e.g. for rules changes, clarifications, or any bugs you might encounter. 
If you are interested in contributing to the work of the working group and influence the benchmark's design decisions, please [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/) and consider becoming a member of the working group. +## Contributing to MLCommons + +We invite everyone to look through our technical documentation and codebase and submit issues and pull requests, e.g. for changes, clarifications, or any bugs you might encounter. If you are interested in contributing to the work of the working group and influence the benchmark's design decisions, please [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/) and consider becoming a member of the working group. The best way to contribute to the MLCommons is to get involved with one of our many project communities. You find more information about getting involved with MLCommons [here](https://mlcommons.org/en/get-involved/#getting-started). @@ -32,7 +35,7 @@ To get started contributing code, you or your organization needs to sign the MLC MLCommons project work is tracked with issue trackers and pull requests. Modify the project in your own fork and issue a pull request once you want other developers to take a look at what you have done and discuss the proposed changes. Ensure that cla-bot and other checks pass for your Pull requests. -## Setup +## Setup for Contributing ### Setting up a Linux VM on GCP @@ -51,7 +54,7 @@ Use the gcloud credential helper as documented [here](https://cloud.google.com/a ## Installation -If you have not installed the package and dependencies yet see [Installation](./README.md#installation). +If you have not installed the package and dependencies yet see [Installation](/README.md#installation). To use the development tools such as `pytest` or `pylint` use the `dev` option: @@ -62,14 +65,14 @@ pre-commit install To get an installation with the requirements for all workloads and development, use the argument `[full_dev]`. 
-## Docker workflows +## Docker Workflows We recommend developing in our Docker image to ensure a consistent environment between developing, testing and scoring submissions. To get started see also: -- [Installation with Docker](./README.md#docker) -- [Running a submission inside a Docker Container](./getting_started.md#run-your-submission-in-a-docker-container) +- [Installation with Docker](/GETTING_STARTED.md#docker) +- [Running a submission inside a Docker Container](/GETTING_STARTED.md#run-your-submission-in-a-docker-container) ### Pre-built Images on Google Cloud Container Registry @@ -100,7 +103,7 @@ Currently maintained images on the repository are: To reference the pulled image you will have to use the full `image_path`, e.g. `us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main`. -### Trigger rebuild and push of maintained images +### Trigger Rebuild and Push of Maintained Images To build and push all images (`pytorch`, `jax`, `both`) on maintained branches (`dev`, `main`). @@ -108,7 +111,7 @@ To build and push all images (`pytorch`, `jax`, `both`) on maintained branches ( bash docker/build_docker_images.sh -b ``` -#### Trigger build and push of images on other branch +#### Trigger Build and Push of Images on Other Branch You can also use the above script to build images from a different branch. @@ -121,9 +124,7 @@ You can also use the above script to build images from a different branch. ### GCP Data and Experiment Integration -The Docker entrypoint script can transfer data to and from -our GCP buckets on our internal GCP project. If -you are an approved contributor you can get access to these resources to automatically download the datasets and upload experiment results. +The Docker entrypoint script can transfer data to and from our GCP buckets on our internal GCP project. 
If you are an approved contributor you can get access to these resources to automatically download the datasets and upload experiment results. You can use these features by setting the `--internal_contributor` flag to 'true' for the Docker entrypoint script. ### Downloading Data from GCP @@ -216,7 +217,7 @@ New PRs will be merged on the dev branch by default, given that they pass the pr ## Testing -We run tests with GitHub Actions, configured in the [.github/workflows](https://github.com/mlcommons/algorithmic-efficiency/tree/main/.github/workflows) folder. +We run tests with GitHub Actions, configured in the [.github/workflows](.github/workflows/) folder. ### Style Testing @@ -253,14 +254,15 @@ pylint submission_runner.py pylint tests ``` -## Unit and integration tests -We run unit tests and integration tests as part of the of github actions as well. +### Unit and Integration Tests + +We run unit tests and integration tests as part of the GitHub Actions as well. You can also use `python tests/reference_algorithm_tests.py` to run a single model update and two model evals for each workload using the reference algorithm in `reference_algorithms/target_setting_algorithms/`. -### Regression tests +### Regression Tests -We also have regression tests available in [.github/workflows/regression_tests.yml](https://github.com/mlcommons/algorithmic-efficiency/tree/main/.github/workflows/regression_tests.yml) that can be run semi-automatically. -The regression tests are shorter end-to-end submissions run in a containerized environment across all 8 workloads, in both the jax and pytorch frameworks. +We also have regression tests available in [.github/workflows/regression_tests.yml](.github/workflows/regression_tests.yml) that can be run semi-automatically. +The regression tests are shorter end-to-end submissions run in a containerized environment across all 8 workloads, in both the JAX and PyTorch frameworks.
The regression tests run on self-hosted runners and are triggered for pull requests that target the main branch. Typically these PRs will be from the `dev` branch so the tests will run containers based on images build from the `dev` branch. To run a regression test: diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md new file mode 100644 index 000000000..b6b48849f --- /dev/null +++ b/DOCUMENTATION.md @@ -0,0 +1,586 @@ +# MLCommons™ AlgoPerf: Technical Documentation & FAQs + +**Version:** 0.0.19 *(Last updated November 14, 2023)* + +> **TL;DR** New training algorithms and models can make neural net training faster. +> We need a rigorous training time benchmark that measures time to result given a fixed hardware configuration and stimulates algorithmic progress. We propose a *Training Algorithm Track* and a *Model Track* in order to help disentangle optimizer improvements and model architecture improvements. This two-track structure lets us enforce a requirement that new optimizers work well on multiple models and that new models aren't highly specific to particular training hacks. The following is the technical documentation for the Training Algorithm Track. 
+ +## Table of Contents + +- [Introduction](#introduction) +- [Technical Documentation of the Training Algorithm Track](#technical-documentation-of-the-training-algorithm-track) + - [Competition Rules](#competition-rules) + - [Submissions](#submissions) + - [Specification](#specification) + - [Evaluation during training](#evaluation-during-training) + - [Valid submissions](#valid-submissions) + - [Tuning](#tuning) + - [External tuning ruleset](#external-tuning-ruleset) + - [Self-tuning ruleset](#self-tuning-ruleset) + - [Workloads](#workloads) + - [Fixed workloads](#fixed-workloads) + - [Randomized workloads](#randomized-workloads) + - [Qualification set](#qualification-set) + - [Scoring](#scoring) + - [Benchmarking hardware](#benchmarking-hardware) + - [Defining target performance](#defining-target-performance) + - [Benchmark score using performance profiles](#benchmark-score-using-performance-profiles) + - [Version freeze](#version-freeze) +- [FAQs](#faqs) + - [Setup and Platform](#setup-and-platform) + - [My machine only has one GPU. 
How can I use this repo?](#my-machine-only-has-one-gpu-how-can-i-use-this-repo) + - [How do I run this on my SLURM cluster?](#how-do-i-run-this-on-my-slurm-cluster) + - [How can I run this on my AWS/GCP/Azure cloud project?](#how-can-i-run-this-on-my-awsgcpazure-cloud-project) + - [Submissions](#submissions-1) + - [Can I submit multiple times to the benchmark competition?](#can-i-submit-multiple-times-to-the-benchmark-competition) + - [Can my submission be structured using multiple files?](#can-my-submission-be-structured-using-multiple-files) + - [Can I install custom dependencies?](#can-i-install-custom-dependencies) + - [How can I know if my code can be run on benchmarking hardware?](#how-can-i-know-if-my-code-can-be-run-on-benchmarking-hardware) + - [Are we allowed to use our own hardware to self-report the results?](#are-we-allowed-to-use-our-own-hardware-to-self-report-the-results) + - [What can I do if running the benchmark is too expensive for me?](#what-can-i-do-if-running-the-benchmark-is-too-expensive-for-me) + - [Can I submit existing (i.e. published) training algorithms as submissions?](#can-i-submit-existing-ie-published-training-algorithms-as-submissions) +- [Disclaimers](#disclaimers) + - [Shared Data Pipelines between JAX and PyTorch](#shared-data-pipelines-between-jax-and-pytorch) + - [Pytorch Conformer CUDA OOM](#pytorch-conformer-cuda-oom) + +## Introduction + +We need a more scientifically sound methodology for evaluating training speedups due to new algorithms, including both new optimizers and new model architectures. Cutting edge machine learning (ML) models are exceeding the compute budgets of many researchers, and ML compute is becoming a larger and larger cost in industry. To reduce the compute and potentially environmental cost of ML research and practice, we need rigorous benchmarking of efficiency. 
Such benchmarks will guide us in selecting the best directions to evolve existing techniques and ultimately enable progress toward models that produce not only better results, but better results **at lower cost**. + +MLCommons' mission is to build fair and useful benchmarks for measuring training and inference performance of ML hardware, software, and services. Improvements in training speed can come from better hardware, better software stacks, and better algorithms. +To date, the Closed Division of the MLPerf™ Training benchmark has been extremely successful in driving systems innovation by requiring mathematical equivalence to a reference implementation, while still allowing submissions on different hardware. Although the Open Division allows new models and training algorithms, it has several issues that make it inappropriate as a benchmark for progress in training algorithms. By allowing arbitrary hardware, it is impossible to isolate improvements due to algorithms or due to extra computation. Unrestricted hardware makes the benchmark only accessible to the most well-funded organizations, even if many academic labs and others have interesting algorithms to measure. Finally, even if we could isolate improvements due to particular algorithmic changes and make the benchmark more broadly accessible, there is still no incentive to avoid hyper-specific changes that only help the particular benchmark workload. + +In order to drive innovation in machine learning algorithms that reduce the time needed to create useful models, we propose a new set of benchmarks called **AlgoPerf** to evaluate the training time for different algorithms (models, optimizers, preprocessing, etc.) on a **fixed hardware configuration** (future iterations can adopt new hardware configurations as needed). Our proposal includes two tracks: (1) the **Training Algorithm Track** and (2) the **Model Track**. The goal of the Training Algorithm Track is to find training algorithms (optimizers, etc.) 
that train benchmark models to reach the goal out-of-sample error rate as fast as possible. However, to incentivize practically useful algorithms, in the Training Algorithm Track we require that a single training algorithm simultaneously performs well across all benchmark models and datasets. Similarly, the goal of the Model Track is to find models that can be trained to achieve the target solution quality (out-of-sample error) in the least amount of time on each benchmark dataset. Although submissions in the Model Track will be inherently dataset-specific, we sharply constrain what parts of the training program can be modified in the Model Track and require submitted models to be easily trainable using standard optimizers. Thus the two-track structure discourages overly specific solutions that aren't generally useful to practitioners and will hopefully produce evidence on the relative returns of speeding up training by finding new models or by developing new training algorithms. + +In the following, we will focus on the **Training Algorithm Track** of the *AlgoPerf benchmark*. + +## Technical Documentation of the Training Algorithm Track + +The goal of the **AlgoPerf: Training Algorithm Track** is to reach the same results faster ("time to result") by using better optimizers, data ordering/weighting schemes, and weight update strategies while producing techniques that work well on a wide variety of models and datasets. We hope to encourage generally useful training algorithms that are not specific to only a small number of particular workloads. + +In general, submissions to the Training Algorithm Track will replace specific pieces of a reference implementation in order to produce a training program that reaches the same results faster on as many workloads as possible. 
The training program has a fixed, high-level structure and competitors are allowed to replace a particular set of functions in the program (the [**submission functions**](#submission-functions)), but must leave all other pieces ([**fixed functions**](#fixed-functions) and high-level structure) of the reference implementation unchanged. The submitted code must perform well on multiple datasets and models simultaneously (a model and dataset pair constitute a [workload](#workloads) for the purposes of this track). + +Submissions to the Training Algorithm Track can be entered under two separate rulesets, named [external tuning ruleset](#external-tuning-ruleset) and [self-tuning ruleset](#self-tuning-ruleset), with it being possible to submit to both rulesets. The main difference is that the external tuning ruleset allows moderate, automatic, parallel tuning of the optimizer's hyperparameters on each workload, using the submitted workload-agnostic search space. This allows the training algorithm to adapt to a particular task while ensuring that it is not too difficult to tune automatically. Under the self-tuning ruleset, there is no external tuning and submissions need to adapt to a particular task autonomously within a single optimization run. Unless otherwise specified, the rules in this section apply to both rulesets (see, for example, the [Tuning](#tuning) section for the most substantial difference between the rulesets). + +The intention is that a training algorithm submission will be broadly applicable and useful without customization to the specific [workload](#workloads) (model, dataset, loss function). We want to discourage detecting the particular workload and doing something highly specific that isn't generally useful. In order to further discourage submissions that overfit to the particular [fixed benchmark workloads](#fixed-workloads), submissions will also be evaluated on [held-out workloads](#randomized-workloads) specified after the submission deadline. 
+ +### Competition Rules + +For a description of the competition rules and how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Competition Rules](/RULES.md), which details the entire competition process. + +### Submissions + +A valid submission is a piece of code that defines all of the submission functions and is able to train all benchmark workloads on the [benchmarking hardware](#benchmarking-hardware) (defined in the [Scoring](#scoring) section). Both the validation set and the test set performance will be checked regularly during training (see the [Evaluation during training](#evaluation-during-training) section); however, only the validation performance is relevant for scoring. Training halts when the workload-specific [target performances](#defining-target-performance) for the validation and test sets have been reached. For each workload, only the training time to reach the *validation* set target error is used as input to the [scoring process](#scoring) for the submission. Submissions using [external tuning](#external-tuning-ruleset) will be tuned independently for each workload using a single workload-agnostic search space for their specified hyperparameters. The tuning trials are selected based on the time to reach the *validation* target. Submissions under either tuning ruleset may always self-tune while on the clock. + +#### Specification + +Any function defined in the reference implementations that isn't a [submission function](#submission-functions) is a [fixed function](#fixed-functions) for the Training Algorithm Track. No submitted code is run to compute the evaluation metrics in the Training Algorithm Track. We just use the final model parameters and the fixed functions from this track at test time. + +In principle, submissions are allowed to use the available hardware systems in any data- or model-parallel manner they desire, within the constraints of the submission function APIs.
However, in practice, model-parallelism may not be possible with the API. Submissions are allowed to access any framework-specific device information necessary to exploit the hardware. + +Submissions provide a [per-workload batch size](#batch-size-getter) to use. Specification of the batch size for each workload is necessary to avoid running out of memory for different workloads. Therefore, submitters can determine this batch size in advance and specify it as part of the submission. Submitters may also provide per-workload batch sizes for all [randomized workloads](#randomized-workloads). If no such batch size is provided for a randomized workload, by default, submissions will then use the batch size of the most similar [fixed workload](#fixed-workloads) (for example, if there is an ImageNet fixed workload and also a randomized workload with a similarly sized model on similarly sized images, the ImageNet batch size will be used for held-out workloads generated from this randomized workload). + +The **submission functions** are the *batch size getter*, *optimizer state initializer*, *variable update*, and *data selection functions*. The *fixed functions* are the *data augmentation/preprocessing*, *model initialization*, *forward pass*, and *loss function*. The trained model will be evaluated in a separate step that does not call any of the submitted code. + +##### Fixed functions + +With the exception of `_build_input_queue`, submitters can call any of these functions (along with any public function in the provided `Workload` instance) at any time in their submitted functions. + +```python +@property +def step_hint(self) -> int +``` + +- The `step_hint` function gives the number of global steps the baseline algorithm was allowed to use to reach the targets for a workload. Note that the baseline algorithms may have reached the target in fewer steps than this, but this was the maximum number of steps the baseline algorithms used for their learning rate schedules.
Submitters can use this to help specify learning rate (or other) schedules. + +###### Data augmentation and preprocessing + +```python +def _build_input_queue( + self, + data_rng: RandomState, + split: str, + data_dir: str, + global_batch_size: int) -> Iterator[Dict[str, Tensor]]: +``` + +- The `_build_input_queue` function will be called to produce the iterator over batches that the submitted data selection function consumes. It is responsible for all data reading, shuffling, repeating, preprocessing, and batching. Note that for JAX this should return an iterator over tensors of shape `(num_devices, per_device_batch_size, ...)`, and for PyTorch this should return tensors of shape `(per_device_batch_size, ...)` (assuming PyTorch's [DDP](https://pytorch.org/docs/stable/notes/ddp.html) is used). + +###### Model initialization + +```python +def init_model_fn( + self, + rng: RandomState, + dropout_rate: Optional[float] = None, + aux_dropout_rate: Optional[float] = None +) -> initial model parameters +``` + +- Unlike in the *Model Track*, this function, which initializes the parameters of the model, is fixed. While it can be called by the submission (e.g. to restart the model after a failed training effort), it cannot be changed. + +###### Forward pass + +```python +def model_fn( + self, + params: ParameterContainer, + augmented_and_preprocessed_input_batch: Tensor, + model_state: ModelAuxiliaryState, + mode: ForwardPassMode, # mode \in {train, eval} + rng: RandomState, + hyperparameters: Hyperparameters, + update_batch_norm: bool +) -> (logits_output_batch, new_model_state): Tuple[Tensor, ModelAuxiliaryState] +``` + +- `params` is whatever the structure is that contains the (`float32`) model parameters. The naming is overloaded due to having to handle the more object-oriented `PyTorch` style and the functional `JAX` style of development.
In the `Flax` library (written in `JAX`), this is typically a nested dictionary of `JAX`/`numpy` arrays, but in `PyTorch` this is the `torch.nn.Module`. +- It is possible that `params` will be endowed with additional information about the kind of each parameter, e.g. "weights" or "bias" or "batch norm". Although `model_fn` does not really need that information, we might use the same nested structure elsewhere. +- `logits_output_batch` is before the output activation +- `new_model_state` is for batch norm or similar side effects and will only be updated if `update_batch_norm` is set +- `hyperparameters` will contain only dropout rates, which will be used in the models that support them. These can be tuned or will default to documented model-specific values. Note that adding additional dropout would be considered changing the model, which is not allowed, but the tuning of dropout in existing dropout layers can be considered a regularizer, so we allow it. There should be at most two dropout rates in a model (if there are more than two we will reuse the same values). + +###### Loss function + +```python +def loss_fn( + self, + # Dense or one-hot labels, or a tuple of (tensor, padding) for speech. + label_batch: Union[Tuple[Tensor, Tensor], Tensor], + logits_batch: Union[Tuple[Tensor, Tensor], Tensor], + mask_batch: Optional[Tensor] = None, + label_smoothing: float = 0.0) -> Dict[str, Tensor] # differentiable +``` + +- Unlike in the *Model Track*, we will specify the loss function name in order to let training algorithms depend on the loss function. It will be one of {**mean squared error**, **cross-entropy**, **CTC**, or **L1 reconstruction error**}. + - The optimizer must work with all values of the enum, which will be provided via a property on the workload object that is provided to all submission functions. +- The loss function does **not** include regularization. Instead, regularization can be added by the submissions in the `update_params` function.
+- The loss function returns a dict {'summed': scalar summed loss, 'n_valid_examples': scalar number of valid examples in batch, 'per_example': 1-d array of per-example losses}. + Note that the returned quantities are not synced across devices; this can be done by the user in the `update_params` function. + +##### Submission functions + +###### Batch size getter + +```python +def get_batch_size(workload_name: str) -> int +``` + +- Submitters define a specific batch size for each [workload](#workloads). +- For example, in advance, they can determine the largest batch size without running out of memory for each workload. +- For the [held-out workloads](#randomized-workloads), submitters may provide a batch size once the submission code is frozen and the held-out workloads are sampled from the randomized workloads. By default, this function will use the `workload_name` of the fixed workload it is based on. + +###### Optimizer state initializer + +```python +def init_optimizer_state( + workload: Workload, + model_params: ParameterContainer, + model_state: ModelAuxiliaryState, + hyperparameters: Hyperparameters, + rng: RandomState +) -> initial_optimizer_state +``` + +- Allowed to create state for the optimizer +- Does not involve the initialization for the model parameters, which in the Training Algorithm Track, is considered a fixed function, see [Model initialization](#model-initialization). +- The optimizer state is a dictionary (`Dict[str, Any]`). For a PyTorch submission, any value in this dictionary which is a class instance with internal state has to have a `state_dict()` method implemented to be stored correctly at the training checkpoints. 
+ +###### Variable update function + +```python +def update_params( + workload: Workload, + current_param_container: ParameterContainer, + current_params_types: ParameterTypeTree, + model_state: ModelAuxiliaryState, + hyperparameters: Hyperparameters, + batch: Dict[str, Tensor], + loss_type: LossType, + optimizer_state: OptimizerState, + eval_results: List[Tuple[int, float]], + global_step: int, + rng: RandomState +) -> (updated_optimizer_state, updated_variables, updated_model_state) +``` + +- `current_param_container` is the same kind of nested structure as used by `model_fn` which constitutes a nested collection of `float32` arrays, each endowed with information about what kind of parameter that array represents stored in a parallel structure of `current_params_types`. + - Parameter kind is one of {"weights", "biases", "embeddings", "conv", "batch norm"}. +- `model_state` holds auxiliary state necessary for some models, such as the current batch norm statistics. +- The loss function will be one of a small set of known possibilities and the update function is allowed to branch on the `loss_type` enum/name. +- The `loss_fn` produces a loss per example and a summed loss (both only for one device), which both can be used. +- Allowed to update state for the optimizer. +- Uses the `model_fn` of the `workload` in order to decouple the loss from the model so that model outputs (forward passes) can be reused (by storing them in the optimizer state). +- The submission can access the target evaluation metric via the `workload` variable. +- **A call to this function will be considered a step** + - The time between a call to this function and the next call to this function will be considered the per-step time. +- Cannot modify the given hyperparameters in a workload-conditional way (please see the [Valid submission](#valid-submissions) section). 
This rule is intended to prohibit circumventing the tuning rules by looking up a pre-tuned optimal set of hyperparameters for each workload. It is not intended to prohibit line searches and other similar techniques. + - This will be checked by the spirit jury. +- The fixed `init_model_fn` can optionally be called during training, for example, to reinitialize the model after a failed training effort. +- Cannot replace the model parameters with pre-trained ones. + - This will be checked by the spirit jury. +- This API supports Polyak averaging and similar methods that implement moving averages of model parameters. +- Batch norm should work here because the `model_fn` will return updated batch norm moving averages when it is told to with `update_batch_norm`. + +###### Data selection + +```python +def data_selection( + workload: Workload, + input_queue: Iterator[Tuple[Tensor, Tensor]], + optimizer_state: OptimizerState, + current_param_container: ParameterContainer, + hyperparameters: Hyperparameters, + global_step: int, + rng: RandomState +) -> Dict[str, Tensor] +``` + +- `input_queue` can yield up to the number of elements in the training dataset +- Want to allow for submitters to construct their own data batches from the dataset +- Submissions are allowed to arbitrarily modify the input examples, as long as the modifications are sufficiently generic to be applicable to any workload +- This is only called on the training inputs. 
**No submitted code will be called at eval in the training track.** +- This allows for any of the following methods: + - Data echoing + - Curriculum learning + - Bootstrapping + - Biased sampling (based on loss values, so need to store the forward pass in the `optimizer_state`, potentially forward pass of a cheaper proxy model) + - Submissions need batching control + +#### Evaluation during training + +In general, with noisy, non-deterministic training, evaluation frequency can affect training time measurements as more "bites of the apple" potentially allows the training code to exploit instability. We also want to discourage submissions from complicated and unrealistic logic that attempts to guess when training is close to complete and increases the evaluation rate, while not producing a well-sampled training curve at the start of training. Simply allowing submissions complete freedom over evaluation frequency encourages competitors to work to minimize the number of evaluations, which distracts from the primary goal of finding better training algorithms. + +Submissions are eligible for an untimed eval every `eval_period` seconds, run as soon as the current call of `update_params` completes. Any additional evaluations performed by the submission code count against the runtime for scoring. The harness that runs the submission code will attempt to eval every `eval_period` seconds by checking between each submission step (call of `update_params`) whether it has been at least `eval_period` seconds since that last eval and, if so, pausing the clock and running an eval. This means that if calls to `update_params` typically take a lot more than `eval_period` seconds, such submissions will not receive as many untimed evals as a submission that had an `update_params` function that took less time. However, for appropriate settings of `eval_period`, we expect this to be quite rare. 
Submissions are always free to restructure their `update_params` code to split work into two subsequent steps to regain the potential benefits of these untimed model evaluations. For each workload, the `eval_period` will be set such that the total evaluation time is roughly between 10% and 20% of the total training time for the target-setting runs. + +#### Valid submissions + +The intention of this benchmark is to identify training algorithm submissions that will be broadly applicable and effective in practical scenarios without customization to the specific [workload](#workloads) (model, dataset, and loss function). Generally useful training algorithms can train models faster and thus require less compute resources, decreasing the cost of machine learning. We want to discourage all submissions that sidestep the purpose of this benchmark. + +We reserve the right to disqualify submissions if they clearly violate this spirit of the benchmark, even if those submissions perform well in our benchmark. Unfortunately, we can't easily write rules that make it completely clear if a submission is circumventing the spirit of the benchmark in a way that would encompass all possible cases. Instead, we will have to prohibit these activities in the abstract and defer rulings about specific submissions to a **"spirit [of the rules] jury"** that can hear the justifications of the submitters, inspect the code, and ultimately decide if the spirit of the rules has been violated. The jury might also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. + +We want to state clearly that we welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions, however, in some cases, routines that would be allowed in principle might not be practically feasible in the provided framework. 
The spirit jury, however, will only be invoked for submissions that aim to bypass the core premise of this benchmark since submissions like this would also be irrelevant in practice. + +In order to help clarify which submissions are [allowed](#allowed-submissions) and [disallowed](#disallowed-submissions), we described a few examples below. Two essential questions can help provide a general guideline for whether a submission is allowed or not: + +1. What **information** is being used by the submission? +2. What **action** is the submission code taking based on this information? + +In general, both parts are needed to decide if a particular piece of code is within the spirit of the rules. For example, it is fine to use the shape information of the model parameters to switch between a low-memory and a high-memory approximation, but it isn't allowed to use this shape as a "fingerprint" to uniquely identify a workload and then use pre-computed hyperparameters for this specific workload. As a rule of thumb, submissions are allowed if it is reasonable to assume that the method will work comparably well on unseen workloads automatically without requiring human engineering labor. + +##### Allowed submissions + +Submissions are allowed to use the provided model parameter information, e.g. the shapes and types of the layers, if the resulting action works on generic workloads. + +
+Examples: + +- Using shape information of the parameters to switch between low-memory and high-memory routines is allowed. +- Using shape information of the parameters to conditionally construct variables to avoid running out of memory, e.g. by approximating larger matrices, is allowed. +- Using the ordering of the parameters to train deeper layers differently, e.g. training them sequentially, is allowed. +- Submissions are allowed to use the layer type to change the update rules, e.g. use a different update rule for all batch normalization layers, or use different sub-routines for each layer type, e.g. compute variances for convolutional layers but not for batch normalization layers. + +
+
+ +Automatic methods for determining or dynamically setting hyperparameters are allowed if they function on generic workloads. + +
+Examples: + +- Submissions are allowed to use automatic procedures for setting hyperparameters, e.g. automated learning rate range tests. +- Inner-loop tuning methods for setting hyperparameters, e.g. line searches, are allowed. +- Changing the batch size dynamically during training. + +
+
+ +Submissions can also be based on learned training algorithms. + +
+Examples: + +- Submissions are allowed to learn the update rule of the training method. +- In the [self-tuning ruleset](#self-tuning-ruleset), submissions could try out a learned list of hyperparameters. + +
+
+ +Submissions can use additional software dependencies provided they have the intention of supporting new algorithmic and mathematical ideas. The procedure for adding dependencies is described in more detail in the [Software dependencies](#software-dependencies) section. + +
+Examples: + +- [`BackPACK`](https://docs.backpack.pt/en/master/index.html) is a `pip` package that hooks into `PyTorch` to extract additional information from the backward pass. An allowed use of `BackPACK` would be to compute batch statistics (e.g. within-batch gradient variances, etc.) to calibrate or auto-tune training algorithms. + +
+ +##### Disallowed submissions + +Submissions are not allowed to circumvent the tuning rules by looking up the result of an offline computation that was performed ahead of time. + +
+Examples: + +- Submissions are not allowed to look up (pre-trained) model parameters. +- Computing the optimal hyperparameters for every fixed workload offline and having the submission look up those pre-computed values (and finding the closest fixed workload for a held-out workload) is not allowed. In contrast, finding and hard-coding a single good setting of the hyperparameters that works well across all the workloads simultaneously would be allowed. +- Submissions are not allowed to adjust the hyperparameter search spaces for the external tuning ruleset, such that it differs between the workloads. + +
+
+ +Submissions are not allowed to detect the particular workload (irrespective of which information they use to this end) in order to use settings that are specified for individual workloads. This would result in highly specific behavior that isn't generally useful. This also extends to learned approaches that ultimately detect specific workloads. In general, all else being equal, if some submission was written that was extremely effective on a small set of the workloads (and far worse on the rest) and another submission with the opposite performance pattern, we would prefer both submissions to be submitted and tested on **all** workloads. + +
+Examples: + +- A hard-coded switching of the update rule based on the workload is not allowed, e.g. using Adam for RNNs and SGD with momentum on CNNs. Although submissions can specialize for certain layer types in generic ways, they should not uniquely identify a model or dataset. In other words, if there are two workloads A and B that both have convolutional layers and fully connected layers the submission shouldn't detect whether it is dealing with A or B specifically and choose Adam for one and SGD with momentum for the other. However, if the updates for all parameters of convolutional layers always used SGD with momentum and the updates for all other layers always used Adam and a workload with both types of layers had mixed updates, that would be fine. +It is also allowed to make the update rule part of the (external) hyperparameter tuning or determine the optimal update rule during the run, i.e. while "on-the-clock". +- Submissions are not allowed to look up learning rate schedules that are only utilized for specific subsets of the workloads. It is allowed to use one general learning rate schedule or dynamically adapt the learning rate based on general information such as curvature. + +
+
+ +It is not allowed to compute any kind of pairwise metrics between the fixed workloads and the held-out workloads. + +
+Examples: + +- On a held-out workload, submissions are not allowed to find the nearest neighbor among the fixed workloads to set any hyperparameter. + +
+
+ +Valid submissions must rely on new algorithmic or mathematical ideas and should not use software engineering approaches to speed up primitive operations in `PyTorch`, `JAX`, their dependencies, the operating system, or the hardware. We recognize that the way a method is implemented will impact its performance in the benchmark. It is generally acceptable to make clever, judicious, and efficient use of public APIs in `JAX` and/or `PyTorch` from within the submission function APIs. It is not acceptable to use these APIs to optimize the internals of primitive operations and standard dependencies in ways that could generally benefit any submission. + +
+Examples: + +- Submissions are allowed to use `CUDA` streams to schedule operations, e.g., transferring data between CPU and GPU, or among GPUs, while performing other computations. +- Submissions are not allowed to use `CUDA` streams or asynchronous operations (e.g., spawning additional threads) to perform additional computations that run during the [untimed evaluations](#evaluation-during-training). +- Submissions are not allowed to use faster GPU kernels than other submitters by writing their own, using `TVM`, or using a different version of `cuDNN`/`cuBLAS`. +- Submissions are not allowed to skip or reduce system or framework overhead, such as modifying `JAX` to skip internal steps like pytree flattening/unflattening. +- Submissions are not allowed to introduce new compiler optimizations, such as modifying `XLA` to perform more or less kernel fusion. + +
+ +##### Software dependencies + +We require submissions to use specific versions of `PyTorch`/`JAX` as well as additional dependencies in order to facilitate fair comparisons. Submitters must build on top of these provided software packages, which might be provided as a `Docker` container. Additional dependencies can be added as long as they include a comment describing what was added and why. Submitters are free to add dependencies that support new algorithmic and mathematical ideas but they should not circumvent the intention of the benchmark to measure training speedups due to new training methods. For example, software engineering techniques that lead to faster implementations of existing software, e.g. using newer versions of `PyTorch` or `JAX`, are not allowed and these are described in more detail in the [Disallowed submissions](#disallowed-submissions) section. In case of doubts, these additional dependencies will be judged by the spirit jury. + +### Tuning + +Tuning will be substantially different for the [external](#external-tuning-ruleset) and the [self-tuning ruleset](#self-tuning-ruleset) and the individual specifications for each will be described in the following. + +#### External tuning ruleset + +For each workload, the hyperparameters are tuned using $O=20$ tuning **trials**. To estimate the variance of the results, this tuning will be repeated for $S=5$ **studies**, for a total of $S\cdot O = 100$ different hyperparameter settings. The submitters will provide a workload-agnostic search space and the working group will then return $100$ hyperparameters settings obtained using [(quasi)random search](https://arxiv.org/abs/1706.03200). The working group will also randomly partition these $100$ trials into $5$ studies of $20$ trials each. In lieu of independent samples from a search space, submissions can instead supply a fixed list of $20$ hyper-parameter points that will be sampled without replacement. 
+ +In each study, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=20$ hyperparameter settings. For scoring, we use this required training time to reach the *validation targets* of those $5$ selected runs. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. + +#### Self-tuning ruleset + +Submissions to this ruleset are not allowed to have user-defined hyperparameters. This ruleset allows both submissions that use the same hyperparameters for all workloads, including the randomized ones (e.g. Adam with default parameters), as well as submissions that perform inner-loop tuning during their training run (e.g. SGD with line searches). + +Submissions will run on one instance of the [benchmarking hardware](#benchmarking-hardware). As always, submissions are allowed to perform inner-loop tuning (e.g. for their learning rate) but the tuning efforts will be part of their score. A submission will run *S=5* times and its score will be the median time to reach the target evaluation metric value on the validation set. To account for the lack of external tuning, submissions have a longer time budget to reach the target performance. Compared to the [external tuning ruleset](#external-tuning-ruleset), the `max_runtime` is tripled. Runs that do not reach the target performance of the evaluation metric within this allotted time budget have an infinite time. + +### Workloads + +For the purposes of the Training Algorithm Track, we consider a workload the combination of a `dataset`, `model`, `loss_fn`, along with a target that is defined over some evaluation metric.
E.g., ResNet50 on ImageNet using the cross-entropy loss until a target error of 22.6% on the validation set has been reached, would constitute a workload. The evaluation metric, in this example the misclassification error rate, is directly implied by the dataset/task. + +Submissions will be scored based on their performance on the [fixed workload](#fixed-workloads). However, additionally submissions must also perform reasonably well on a set of [held-out workloads](#randomized-workloads) in order for their score on the fixed workload to count (for full details see the [Scoring](#scoring) section). These held-out workloads will be generated after the submission deadline, but their randomized generating process is publicly available (see "[Randomized workloads](#randomized-workloads)" section). + +Furthermore, a less computationally expensive subset of the fixed workloads is collected with the [qualification set](#qualification-set). Submitters without enough compute resources to self-report on the full set of fixed and held-out workloads can instead self-report on this smaller qualification set. Well-performing submissions can thereby qualify for computational resources provided by sponsors of the benchmark to be scored on the full benchmark set. + +#### Fixed workloads + +The fixed workloads are fully specified with the call for submissions. They contain a diverse set of tasks such as image classification, machine translation, speech recognition, or other typical machine learning tasks. For a single task there might be multiple models and therefore multiple fixed workloads. The entire set of fixed workloads should have a combined runtime of roughly 100 hours on the [benchmarking hardware](#benchmarking-hardware). + +The currently eight fixed workloads are: + +| | **Task** | **Dataset** | **Model** | **Loss** | **Metric** | Validation
**Target** | Test
**Target** | Maximum
**Runtime**
(in secs) | +|------------|-------------------------------|-------------|-------------------------|----------|------------|--------------------------|----------------------|------------------------| +| **1** | Clickthrough rate prediction | Criteo 1TB | DLRMsmall | CE | CE | 0.123649 | 0.126060 | 21,600 | +| **2** | MRI reconstruction | fastMRI | U-Net | L1 | SSIM | 0.7344 | 0.741652 | 10,800 | +| **3
4** | Image classification | ImageNet | ResNet-50
ViT | CE | ER | 0.22569
0.22691 | 0.3440
0.3481 | 111,600
111,600 | +| **5
6** | Speech recognition | LibriSpeech | Conformer
DeepSpeech | CTC | WER | 0.078477
0.1162 | 0.046973
0.068093 |
72,000 | +| **7** | Molecular property prediction | OGBG | GNN | CE | mAP | 0.28098 | 0.268729 | 12,000 | +| **8** | Translation | WMT | Transformer | CE | BLEU | 30.8491 | 30.7219 | 80,000 | + +#### Randomized workloads + +In addition to the [fixed and known workloads](#fixed-workloads), there will also be randomized workloads in our benchmark. These randomized workloads will introduce minor modifications to a fixed workload (e.g. small model changes). The exact instances of these randomized workloads will only be created after the submission deadline and are thus unknown to both the submitters as well as the benchmark organizers. The instructions for creating them, i.e. providing a set or distribution of workloads to sample from, will be defined by this working group and made public with the call for submissions, to allow the members of this working group to submit as well as ensure that they do not possess any additional information compared to other submitters. We will refer to the unspecific workloads as *randomized workloads*, e.g. the set or distribution. The specific instance of such a randomized workload we call a *held-out workload*. That is, a held-out workload is a specific sample of a randomized workload that is used for one iteration of the benchmark. While we may reuse randomized workloads between iterations of the benchmark, new held-out workloads will be sampled for each new benchmark iteration. + +The held-out workloads function similarly to a holdout test set discouraging submissions that overfit to the [fixed and known workloads](#fixed-workloads). After the submission deadline, a third party will draw samples from the randomized workloads (e.g. from the set or the distribution) to generate a specific set of held-out workloads. 
The validation and test targets on each held-out workload will be defined using the [same protocol as the fixed workloads](#defining-target-performance) (with the only change being that only two target-setting training algorithms are used instead of four, to save computational resources) using the same training time budget as the fixed workload they are based on. + +Modifications could, for example, include changing the number of layers or units (drawn from an interval), swapping the activation function (drawn from a set of applicable functions), or using different data augmentations (drawn from a list of possible pre-processing steps). The sample space should be wide enough to discourage submitters from simply trying them all out, but at the same time should be restricted enough to produce realistic workloads with acceptable achievable performances. + +In the first iteration of this benchmark, we manually designed three different workload variants for each fixed workload. The variants are designed such that they achieve a comparable performance to the fixed workload and that they might require different hyperparameters to achieve this performance. After the submission deadline, one held-out workload will be sampled for each fixed workload. + +Our scoring procedure uses the held-out workloads only to penalize submissions that can't handle the introduced modifications (see the [Scoring](#scoring) section for further details). + +#### Qualification set + +The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prizes](/RULES.md#prizes).
+ +The qualification set consists of the same [fixed workloads](#fixed-workloads) as mentioned above, except for both workloads on *ImageNet*, both workloads on *LibriSpeech*, and the *fastMRI* workload. The remaining three workloads (*WMT*, *Criteo 1TB*, and *OGBG*) form the qualification set. There are no [randomized workloads](#randomized-workloads) in the qualification set. The qualification set of workloads aims to have a combined runtime of roughly 24 hours on the [benchmarking hardware](#benchmarking-hardware). + +For the [external tuning ruleset](#external-tuning-ruleset), we will only use $1$ study instead of the proposed $5$, when evaluating on the qualification set. The [self-tuning ruleset](#self-tuning-ruleset) will use $5$ studies on the qualification set as well since it is computationally cheaper. + +### Scoring + +Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload. This target performance metric can be the same as the loss function but might also be a different workload-specific metric such as the error rate or BLEU score. The target performance was defined using four standard training algorithms, see the "[Defining target performance](#defining-target-performance)" section for more details. The training time of a submission includes the compilation times for computation graphs and ops that could happen just-in-time during training; all our benchmarks should be fast enough to compile so as not to dramatically impact overall performance. The overall ranking is then determined by summarizing the performances across all [fixed workloads](#fixed-workloads), using [performance profiles](#benchmark-score-using-performance-profiles), as explained below. + +The training time until the target performance on the test set was reached is not used in the scoring procedure but might be used for additional analysis of the competition results. 
+ +#### Benchmarking hardware + +All scored runs have to be performed on the benchmarking hardware to allow for a fair comparison of training times. The benchmarking hardware has to be chosen to be easily accessible via common cloud computing providers. The exact hardware specification will most likely change with each iteration of the benchmark. The specs of the benchmarking hardware for this iteration of the benchmark are: + +- 8xV100 GPUs +- 240 GB in RAM +- 2 TB in storage (for datasets). + +For self-reported results, it is acceptable to perform the tuning trials on hardware different from the benchmarking hardware, as long as the same hardware is used for all tuning trials. Once the best trial, i.e. the one that reached the *validation* target the fastest, was determined, this run has to be repeated on the competition hardware. For example, submitters can tune using their locally available hardware but have to use the benchmarking hardware, e.g. via cloud providers, for the $5$ scored runs. This allows for a fair comparison to the reported results of other submitters while allowing some flexibility in the hardware. + +#### Defining target performance + +Target performances on the validation and test sets will be defined for each [workload](#workloads) separately. For the [fixed workloads](#fixed-workloads), we take the best performance achievable by one of four standard algorithms (AdamW, NadamW, Nesterov Momentum, and Heavy Ball Momentum). These target-setting algorithms will follow the general process of the external tuning ruleset, with a slightly larger tuning budget of $200$ trials to guarantee competitive performance. Once the best algorithm and its hyperparameters are determined, training is repeated $20$ times. The median of the best achieved validation errors across seeds is used as the *validation* target. Out of the $10$ repeated runs that achieved this validation target, we took the worst achieved test error across seeds as our *test* target. 
Taking the median validation performance after rerunning the best hyperparameter point prevents our procedure from selecting a lucky outlier. +To save computational resources, we only tuned two training algorithms instead of four, for the [randomized workloads](#randomized-workloads). For each workload variant, we used NadamW and the other best-performing training algorithm on the corresponding fixed workload the randomized workload is based on. + +Both [tuning rulesets](#tuning) will use the same target performances. The runtime of the target-setting algorithms on each workload will be chosen to match published results and is constrained by the overall time budget of roughly a single week for all fixed workloads. The `max_runtime` for submissions on each workload is $\frac{1}{3}$ longer than the runtime of the target-setting algorithms (this `max_runtime` will be three times as much for the self-tuning ruleset, see the [Self-tuning ruleset](#self-tuning-ruleset) section). + +#### Benchmark score using performance profiles + +We will aggregate the training times of a submission on all fixed workloads using [Performance Profiles](http://www.argmin.net/2018/03/26/performance-profiles/) (originally from [Dolan and Moré](https://arxiv.org/abs/cs/0102001)). Below we surface several relevant definitions from their work for easier readability, before explaining how we integrate the performance profiles to reach a scalar benchmark score that will be used for ranking submissions. + +*Notation:* We have a set $\mathcal{S} = \{s_1, s_2, \dots, s_k\}$ of in total $k$ submissions that we evaluate on a set of $n$ fixed workloads: $\mathcal{W} = \{w_1, w_2, \dots, w_n\}$. For each submission $s$ and each workload $w$ we have a training time score $t_{s,w} \in [0,\infty)$. This is the time it took the submission to reach the validation target performance on this particular workload. 
+ +##### Computing performance ratios + +For all workloads and submissions, we first compute their performance ratio $r$, which is defined for a particular submission $\bar{s}$ and a particular workload $\bar{w}$ to be: + +$$r_{\bar{s},\bar{w}} = \frac{t_{\bar{s},\bar{w}}}{\min_{s \in \mathcal{S}} t_{s,\bar{w}}} \in [1,\infty)$$ + +This performance ratio $r_{s,w}$ expresses the "time spent by submission $s$ on workload $w$" relative to the "time spent by the best submission on this workload". E.g. If a submission takes twice as long on a particular workload compared to the best submission on this workload it will have a performance ratio of $2$. Lower performance ratios are therefore better, with an optimal ratio of $1$ if the given submission is the fastest on this workload. + +##### Building performance profiles + +Next, we compute how often a submission is within a factor $\tau \in [1,\infty)$ of the optimal submission. For this, we determine the following function for every submission $\bar{s}$: + +$$\rho_{\bar{s}}(\tau) = \left(\frac{1}{n}\right) \cdot \left[\text{number of workloads where}\, r_{\bar{s},w}\leq \tau\right]$$ + +In other words, we compute the fraction of workloads where a submission $\bar{s}$ is less than $\tau$ away from the optimal submission. The function $\rho_{\bar{s}}(\tau)$ is monotonically increasing with $\tau$ and bounded between $0$ and $1$. 
+ +An example of a performance profile plot is shown below, where we plot $\rho_{\bar{s}}(\tau)$ for seven "submissions": + +![Example performance profile](.assets/performance_profiles.png) + +##### Integrating performance profiles for the benchmark score + +To get a scalar score that is usable for ranking submissions, we will integrate the performance profiles $\rho_{\bar{s}}(\tau)$ of all submissions to get their benchmark score $B_{\bar{s}}$, with + +$$B_{\bar{s}} = \frac{1}{r_{\text{max}}-1} \int_{1}^{r_{\text{max}}} \rho_{\bar{s}}(\tau) \,d\tau \in [0, 1].$$ + +The upper integration limit will be set to $r_{\text{max}} = 4$ which also serves as the upper limit of the performance profile plot. +This means that any submission that requires more than four times the runtime of the fastest submission will not get any credit on this workload compared to a training algorithm that is unable to successfully train within the maximum allowed runtime budget. +The integral is normalized by the total integration area, with higher benchmark scores being better. + +##### Using held-out workloads in scoring + +For the benchmark score, we compute and integrate the performance profiles using the training times of only the fixed workloads. But we use the submission's performance on the held-out workloads to penalize submissions. Specifically, if a submission is unable to train a held-out workload, we score the submission on the corresponding fixed workload as if that submission did not reach the target. In other words, for a submission to receive a finite training time on a fixed workload, it needs to: + +- Reach the validation target on the fixed workload within the maximum runtime. +- Reach the validation target on the fixed workload within 4x of the fastest submission. +- Reach the validation target on the held-out workload (corresponding to the fixed workload) within the maximum runtime.
+- Reach the validation target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms. + +Only if all four requirements are met, does the submission get a finite score. Otherwise, a submission will receive a training time of infinity. + +This essentially means that being unable to successfully train a held-out workload can "disqualify" a submission from getting a good score on the fixed workload it is based on. In other words, we require submissions to be robust enough to handle workload variations. This protocol ensures that we prioritize the fixed workloads for scoring since they are the most relevant version of that workload in practice. However, we also protect our benchmark from egregious workload-specific tuning and penalize brittle methods that break with slight modifications of the workload. + +##### Alternative scores + +Performance profiles and the benchmark score derived from them, take a bit of effort to explain. +However, we believe that they are fairer and well-supported by research in machine learning and the optimization community. To have some simpler to interpret numbers, e.g. for press releases, we will also release a series of alternative scores. + +For a given workload $\bar{w}$, we define the "speedup of a submission $\bar{s}$ over the target-setting reference" as $\frac{t_{\text{ref}, \bar{w}}}{t_{\bar{s}, \bar{w}}}$. For example, if a submission was 2x faster than the target-setting reference, this would be equal to 2. In addition to the raw $t_{s,w}$ values, we will release the geometric mean of the speedups across all workloads, i.e. 
$\left(\prod_{w \in \mathcal{W}} \frac{t_{\text{ref}, w}}{t_{\bar{s}, w}}\right)^{\frac{1}{n}}$. + +##### Verifying scores + +The working group will independently verify the scores of the highest-scoring submissions in each ruleset. Results that have been verified by the working group will be clearly marked on the leaderboard. + +### Version freeze + +The benchmark code base is subject to change after the call for submissions is published. For example, while interacting with the codebase, if submitters encounter bugs or API limitations, they have the option to issue a bug report. This might lead to modifications of the benchmark codebase even after the publication of the call for submissions. + +To ensure that all submitters can develop their submissions based on the same code that will be utilized for scoring, we will freeze the package versions of the codebase dependencies before the submission deadline. By doing so, we level the playing field for everyone involved, ensuring fairness and consistency in the assessment of submissions. We will also try to minimize changes to the benchmark codebase as best as possible. + +## FAQs + +### Setup and Platform + +#### My machine only has one GPU. How can I use this repo? + +You can run this repo on a machine with an arbitrary number of GPUs. However, the default batch sizes in our reference algorithms `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms` are tuned for a machine with 8 16GB V100 GPUs. You may run into OOMs if you run these algorithms with fewer than 8 GPUs. If you run into these issues because you are using a machine with less total GPU memory, please reduce the batch sizes for the submission. Note that your final submission must 'fit' on the benchmarking hardware, so if you are using fewer +GPUs with higher per GPU memory, please monitor your memory usage to make sure it will fit on 8xV100 GPUs with 16GB of VRAM per card. + +#### How do I run this on my SLURM cluster?
+ +You may run into issues with `sudo` and `docker` on a SLURM cluster. To run the workloads in a SLURM cluster you can use Apptainer (previously Singularity), see this [section](/GETTING_STARTED.md#using-singularityapptainer-instead-of-docker). + +#### How can I run this on my AWS/GCP/Azure cloud project? + +Depending on your virtual machine, you may have to install the correct GPU drivers and the NVIDIA Docker toolkit. For example, in GCP you will have to do the following. + +1. If you don't have a VM instance yet, we recommend creating a +new Compute Instance with the "Deep Learning on Linux" Image in Boot disk options. +2. To install the NVIDIA Docker toolkit, you can use `scripts/cloud-startup.sh` as a startup script for the VM. This will automate the installation of the NVIDIA GPU Drivers and NVIDIA Docker toolkit. + +### Submissions + +#### Can I submit multiple times to the benchmark competition? + +Our benchmark allows multiple submissions by the same team of submitters. However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. + +Submitters may submit algorithms marked as *baselines*. These might include existing algorithms with different search spaces or learning rate schedules. These baseline algorithms are not eligible for winning the competition or prize money but they are also not required to be "substantially different" from other submissions by the same submitters. + +#### Can my submission be structured using multiple files? + +Yes, your submission can be structured using multiple files. + +#### Can I install custom dependencies? + +You may use custom dependencies as long as they do not conflict with any of the pinned packages in `algorithmic-efficiency/setup.cfg`. 
+To include your custom dependencies in your submission, please include them in a requirements.txt file. Please refer to the [Software dependencies](#software-dependencies) section of our rules. + +#### How can I know if my code can be run on benchmarking hardware? + +The benchmarking hardware specifications are documented in the [Benchmarking Hardware Section](#benchmarking-hardware). We recommend monitoring your submission's memory usage so that it does not exceed the available memory +on the benchmarking hardware. We also recommend to do a dry run using a cloud instance. + +#### Are we allowed to use our own hardware to self-report the results? + +You only have to use the benchmarking hardware for runs that are directly involved in the scoring procedure. This includes all runs for the self-tuning ruleset, but only the runs of the best hyperparameter configuration in each study for the external tuning ruleset. For example, you could use your own (different) hardware to tune your submission and identify the best hyperparameter configuration (in each study) and then only run this configuration (i.e. 5 runs, one for each study) on the benchmarking hardware. + +#### What can I do if running the benchmark is too expensive for me? + +Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set) that excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. + +#### Can I submit existing (i.e. published) training algorithms as submissions? 
+ +Yes, you may, as long as it isn't an exact copy of an existing submission. +For example, you may submit the Adam optimizer with your hyperparameters or hyperparameter search spaces, as this constitutes a different training algorithm. + +## Disclaimers + +### Shared Data Pipelines between JAX and PyTorch + +The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT workloads are using the same TensorFlow input pipelines. Due to differences in how JAX and PyTorch distribute computations across devices, the PyTorch workloads have an additional overhead for these workloads. + +Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details. +While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example. + +### Pytorch Conformer CUDA OOM + +The Conformer PyTorch workload may run out of memory in its current state. Please set the `submission_runner.py` flag `reduce_pytorch_max_split_size` to `True` as a temporary workaround if you encounter this issue. This will set `max_split_size_mb:256`.
Note that this will adversely impact the performance of the submission on this workload. See [tracking issue](https://github.com/mlcommons/algorithmic-efficiency/issues/497). diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index 1369f5cc7..d9f2a7051 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -2,79 +2,231 @@ ## Table of Contents -- [Set up and installation](#set-up-and-installation) -- [Download the data](#download-the-data) -- [Develop your submission](#develop-your-submission) - - [Set up your directory structure (Optional)](#set-up-your-directory-structure-optional) - - [Coding your submission](#coding-your-submission) -- [Run your submission](#run-your-submission) +- [Set Up and Installation](#set-up-and-installation) + - [Python Virtual Environment](#python-virtual-environment) + - [Docker](#docker) + - [Building Docker Image](#building-docker-image) + - [Running Docker Container (Interactive)](#running-docker-container-interactive) + - [Using Singularity/Apptainer instead of Docker](#using-singularityapptainer-instead-of-docker) +- [Download the Data](#download-the-data) +- [Develop your Submission](#develop-your-submission) + - [Set Up Your Directory Structure (Optional)](#set-up-your-directory-structure-optional) + - [Coding your Submission](#coding-your-submission) +- [Run your Submission](#run-your-submission) - [Pytorch DDP](#pytorch-ddp) - - [Run your submission in a Docker container](#run-your-submission-in-a-docker-container) + - [Run your Submission in a Docker Container](#run-your-submission-in-a-docker-container) - [Docker Tips](#docker-tips) -- [Score your submission](#score-your-submission) -- [Good Luck](#good-luck) +- [Score your Submission](#score-your-submission) -## Set up and installation +## Set Up and Installation To get started you will have to make a few decisions and install the repository along with its dependencies. Specifically: -1. Decide if you would like to develop your submission in either Pytorch or Jax. 
-2. Set up your workstation or VM. We recommend to use a setup similar to the [benchmarking hardware](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#benchmarking-hardware). +1. Decide if you would like to develop your submission in either PyTorch or JAX. +2. Set up your workstation or VM. We recommend using a setup similar to the [benchmarking hardware](/DOCUMENTATION.md#benchmarking-hardware). The specs on the benchmarking machines are: - - 8 V100 GPUs + - 8xV100 GPUs - 240 GB in RAM - 2 TB in storage (for datasets). +3. Install the algorithmic package and dependencies either in a [Python virtual environment](#python-virtual-environment) or use a [Docker](#docker) (recommended) or [Singularity/Apptainer container](#using-singularityapptainer-instead-of-docker). -3. Install the algorithmic package and dependencies, see [Installation](./README.md#installation). -## Download the data +### Python Virtual Environment -The workloads in this benchmark use 6 different datasets across 8 workloads. You may choose to download some or all of the datasets as you are developing your submission, but your submission will be scored across all 8 workloads. For instructions on obtaining and setting up the datasets see [datasets/README](https://github.com/mlcommons/algorithmic-efficiency/blob/main/datasets/README.md#dataset-setup). +> **Prerequisites:** +> +> - Python minimum requirement >= 3.8 +> - CUDA 11.8 +> - NVIDIA Driver version 535.104.05 -## Develop your submission +To set up a virtual environment and install this repository -To develop a submission you will write a python module containing your optimizer algorithm. Your optimizer must implement a set of predefined API methods for the initialization and update steps. +1. Create new environment, e.g. via `conda` or `virtualenv` + ```bash + sudo apt-get install python3-venv + python3 -m venv env + source env/bin/activate + ``` + +2.
Clone this repository + + ```bash + git clone https://github.com/mlcommons/algorithmic-efficiency.git + cd algorithmic-efficiency + ``` + +3. Run the following pip3 install commands based on your chosen framework to install `algorithmic_efficiency` and its dependencies. + + For **JAX**: + + ```bash + pip3 install -e '.[pytorch_cpu]' + pip3 install -e '.[jax_gpu]' -f 'https://storage.googleapis.com/jax-releases/jax_cuda_releases.html' + pip3 install -e '.[full]' + ``` + + For **PyTorch** + + ```bash + pip3 install -e '.[jax_cpu]' + pip3 install -e '.[pytorch_gpu]' -f 'https://download.pytorch.org/whl/torch_stable.html' + pip3 install -e '.[full]' + ``` + +
+ +Per workload installations + +You can also install the requirements for individual workloads, e.g. via + +```bash +pip3 install -e '.[librispeech]' +``` + +or all workloads at once via + +```bash +pip3 install -e '.[full]' +``` + +
+ +### Docker + +We recommend using a Docker container to ensure a similar environment to our scoring and testing environments. Alternatively, a Singularity/Apptainer container can also be used (see instructions below). + +> **Prerequisites:** +> +> - NVIDIA Driver version 535.104.05 +> - NVIDIA Container Toolkit so that the containers can locate the NVIDIA drivers and GPUs. See instructions [here](https://github.com/NVIDIA/nvidia-docker). + +#### Building Docker Image + +1. Clone this repository + + ```bash + cd ~ && git clone https://github.com/mlcommons/algorithmic-efficiency.git + ``` + +2. Build Docker image + + ```bash + cd algorithmic-efficiency/docker + docker build -t <docker_image_name> . --build-arg framework=<framework> + ``` + + The `framework` flag can be either `pytorch`, `jax` or `both`. Specifying the framework will install the framework specific dependencies. + The `docker_image_name` is arbitrary. + +#### Running Docker Container (Interactive) -### Set up your directory structure (Optional) +To use the Docker container as an interactive virtual environment, you can run a container mounted to your local data and code directories and execute the `bash` program. This may be useful if you are in the process of developing a submission. + +1. Run detached Docker container. The `container_id` will be printed if the container is run successfully. + + ```bash + docker run -t -d \ + -v $HOME/data/:/data/ \ + -v $HOME/experiment_runs/:/experiment_runs \ + -v $HOME/experiment_runs/logs:/logs \ + -v $HOME/algorithmic-efficiency:/algorithmic-efficiency \ + --gpus all \ + --ipc=host \ + <docker_image_name> \ + --keep_container_alive true + ``` + + > Note: You may have to use double quotes around `algorithmic-efficiency` [path] in the mounting `-v` flag. If the above command fails try replacing the following line: + > + > ```bash + > -v $HOME/algorithmic-efficiency:/algorithmic-efficiency \ + > ``` + > + > with + > + > ```bash + > -v $HOME"/algorithmic-efficiency:/algorithmic-efficiency" \ + > ``` + +2.
Open a bash terminal + + ```bash + docker exec -it <container_id> /bin/bash + ``` + +### Using Singularity/Apptainer instead of Docker + +Since many compute clusters don't allow the usage of Docker due to security concerns and instead encourage the use of [Singularity/Apptainer](https://github.com/apptainer/apptainer) (formerly Singularity, now called Apptainer), we also provide instructions on how to build an Apptainer container based on the Dockerfile provided here. + +To convert the Dockerfile into an Apptainer definition file, we will use [spython](https://github.com/singularityhub/singularity-cli): + +```bash +pip3 install spython +cd algorithmic-efficiency/docker +spython recipe Dockerfile &> Singularity.def +``` + +Now we can build the Apptainer image by running + +```bash +singularity build --fakeroot <singularity_image_name>.sif Singularity.def +``` + +To start a shell session with GPU support (by using the `--nv` flag), we can run + +```bash +singularity shell --nv <singularity_image_name>.sif +``` + +Similarly to Docker, Apptainer allows you to bind specific paths on the host system and the container by specifying the `--bind` flag, as explained [here](https://docs.sylabs.io/guides/3.7/user-guide/bind_paths_and_mounts.html). + +## Download the Data + +The workloads in this benchmark use 6 different datasets across 8 workloads. You may choose to download some or all of the datasets as you are developing your submission, but your submission will be scored across all 8 workloads. For instructions on obtaining and setting up the datasets see [datasets/README](/datasets/README.md#dataset-setup). + +## Develop your Submission + +To develop a submission you will write a Python module containing your training algorithm. Your training algorithm must implement a set of predefined API methods for the initialization and update steps. + +### Set Up Your Directory Structure (Optional) Make a submissions subdirectory to store your submission modules e.g. `algorithmic-effiency/submissions/my_submissions`.
-### Coding your submission +### Coding your Submission You can find examples of sumbission modules under `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms`. \ A submission for the external ruleset will consist of a submission module and a tuning search space definition. 1. Copy the template submission module `submissions/template/submission.py` into your submissions directory e.g. in `algorithmic-efficiency/my_submissions`. -2. Implement at least the methods in the template submission module. Feel free to use helper functions and/or modules as you see fit. Make sure you adhere to to the competition rules. Check out the guidelines for [allowed submissions](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#disallowed-submissions), [disallowed submissions](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#disallowed-submissions) and pay special attention to the [software dependencies rule](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#software-dependencies). +2. Implement at least the methods in the template submission module. Feel free to use helper functions and/or modules as you see fit. Make sure you adhere to the competition rules. Check out the guidelines for [allowed submissions](/DOCUMENTATION.md#allowed-submissions), [disallowed submissions](/DOCUMENTATION.md#disallowed-submissions) and pay special attention to the [software dependencies rule](/DOCUMENTATION.md#software-dependencies). 3. Add a tuning configuration e.g. `tuning_search_space.json` file to your submission directory. For the tuning search space you can either: 1.
Define the set of feasible points by defining a value for "feasible_points" for the hyperparameters: - ```JSON - { - "learning_rate": { - "feasible_points": 0.999 - }, - } - ``` + ```JSON + { + "learning_rate": { + "feasible_points": 0.999 + }, + } + ``` - For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json). + For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json). - 2. Define a range of values for quasirandom sampling by specifing a `min`, `max` and `scaling` - keys for the hyperparameter: + 2. Define a range of values for quasirandom sampling by specifying the `min`, `max` and `scaling` keys for the hyperparameter: - ```JSON - { - "weight_decay": { - "min": 5e-3, - "max": 1.0, - "scaling": "log", - } - } - ``` + ```JSON + { + "weight_decay": { + "min": 5e-3, + "max": 1.0, + "scaling": "log", + } + } + ``` - For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/baselines/nadamw/tuning_search_space.json). + For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/baselines/nadamw/tuning_search_space.json). -## Run your submission +## Run your Submission From your virtual environment or interactively running Docker container run your submission with `submission_runner.py`: @@ -104,14 +256,14 @@ python3 submission_runner.py \ ### Pytorch DDP -We recommend using PyTorch's [Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) -when using multiple GPUs on a single node. You can initialize ddp with torchrun.
-For example, on single host with 8 GPUs simply replace `python3` in the above command by: +We recommend using PyTorch's [Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) when using multiple GPUs on a single node. You can initialize ddp with torchrun. For example, on single host with 8 GPUs simply replace `python3` in the above command by: ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=N_GPUS ``` +where `N_GPUS` is the number of available GPUs on the node. + So the complete command is: ```bash @@ -128,7 +280,7 @@ torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 \ --tuning_search_space= ``` -### Run your submission in a Docker container +### Run your Submission in a Docker Container The container entrypoint script provides the following flags: @@ -182,7 +334,7 @@ To enter a bash session in the container docker exec -it /bin/bash ``` -## Score your submission +## Score your Submission To produce performance profile and performance table: @@ -192,4 +344,4 @@ python3 scoring/score_submission.py --experiment_path= - We provide the scores and performance profiles for the baseline algorithms in the "Baseline Results" section in [Benchmarking Neural Network Training Algorithms](https://arxiv.org/abs/2306.07179). -## Good Luck +**Good Luck!** diff --git a/README.md b/README.md index 071198036..bbd5f1c66 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# MLCommons™ Algorithmic Efficiency +# MLCommons™ AlgoPerf: Training Algorithms Benchmark

@@ -8,9 +8,11 @@

Paper (arXiv)Installation • - Rules • - Contributing • - License + Getting Started • + Competition Rules • + Documentation • + Contributing • + License

[![CI](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/CI.yml/badge.svg)](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/CI.yml) @@ -20,203 +22,45 @@ --- -[MLCommons Algorithmic Efficiency](https://mlcommons.org/en/groups/research-algorithms/) is a benchmark and competition measuring neural network training speedups due to algorithmic improvements in both training algorithms and models. This repository holds the [competition rules](RULES.md) and the benchmark code to run it. For a detailed description of the benchmark design, see our [paper](https://arxiv.org/abs/2306.07179). +> *AlgoPerf* is a suite of benchmarks and competitions to measure neural network training speedups due to algorithmic improvements in both training algorithms and models. This is the repository for the *AlgoPerf: Training algorithms benchmark*. It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). This repository holds the [**competition rules**](/RULES.md), the [**technical documentation**](/DOCUMENTATION.md), [**getting started guides**](/GETTING_STARTED.md), and the benchmark code to run it. For a detailed description of the benchmark design, see our [**paper**](https://arxiv.org/abs/2306.07179). 
-## Table of Contents +## Table of Contents -- [Table of Contents](#table-of-contents) - [Installation](#installation) - - [Python virtual environment](#python-virtual-environment) - - [Docker](#docker) - - [Building Docker Image](#building-docker-image) - - [Running Docker Container (Interactive)](#running-docker-container-interactive) - - [Running Docker Container (End-to-end)](#running-docker-container-end-to-end) - - [Using Singularity/Apptainer instead of Docker](#using-singularityapptainer-instead-of-docker) - [Getting Started](#getting-started) - - [Running a workload](#running-a-workload) - - [JAX](#jax) - - [Pytorch](#pytorch) -- [Rules](#rules) +- [Competition Rules](#competition-rules) +- [Technical Documentation \& FAQs](#technical-documentation--faqs) - [Contributing](#contributing) -- [Shared data pipelines between JAX and PyTorch](#shared-data-pipelines-between-jax-and-pytorch) -- [Setup and Platform](#setup-and-platform) - - [My machine only has one GPU. How can I use this repo?](#my-machine-only-has-one-gpu-how-can-i-use-this-repo) - - [How do I run this on my SLURM cluster?](#how-do-i-run-this-on-my-slurm-cluster) - - [How can I run this on my AWS/GCP/Azure cloud project?](#how-can-i-run-this-on-my-awsgcpazure-cloud-project) -- [Submissions](#submissions) - - [Can submission be structured using multiple files?](#can-submission-be-structured-using-multiple-files) - - [Can I install custom dependencies?](#can-i-install-custom-dependencies) - - [How can I know if my code can be run on benchmarking hardware?](#how-can-i-know-if-my-code-can-be-run-on-benchmarking-hardware) - - [Are we allowed to use our own hardware to self-report the results?](#are-we-allowed-to-use-our-own-hardware-to-self-report-the-results) - - - +- [License](#license) +- [Paper and Citing the AlgoPerf Benchmark](#paper-and-citing-the-algoperf-benchmark) ## Installation -You can install this package and dependences in a [python virtual environment](#virtual-environment) or use a 
[Docker/Singularity/Apptainer container](#install-in-docker) (recommended). - - *TL;DR to install the Jax version for GPU run:* - - ```bash - pip3 install -e '.[pytorch_cpu]' - pip3 install -e '.[jax_gpu]' -f 'https://storage.googleapis.com/jax-releases/jax_cuda_releases.html' - pip3 install -e '.[full]' - ``` - - *TL;DR to install the PyTorch version for GPU run:* - - ```bash - pip3 install -e '.[jax_cpu]' - pip3 install -e '.[pytorch_gpu]' -f 'https://download.pytorch.org/whl/torch_stable.html' - pip3 install -e '.[full]' - ``` - -### Python virtual environment - -#### Prerequisites -- Python minimum requirement >= 3.8 -- CUDA 11.8 -- NVIDIA Driver version 535.104.05 - -To set up a virtual enviornment and install this repository - -1. Create new environment, e.g. via `conda` or `virtualenv` - - ```bash - sudo apt-get install python3-venv - python3 -m venv env - source env/bin/activate - ``` - -2. Clone this repository - - ```bash - git clone https://github.com/mlcommons/algorithmic-efficiency.git - cd algorithmic-efficiency - ``` - -3. Run pip3 install commands above to install `algorithmic_efficiency`. - -
- -Per workload installations - -You can also install the requirements for individual workloads, e.g. via - -```bash -pip3 install -e '.[librispeech]' -``` +You can install this package and dependencies in a [Python virtual environment](/GETTING_STARTED.md#python-virtual-environment) or use a [Docker/Singularity/Apptainer container](/GETTING_STARTED.md#docker) (recommended). +We recommend using a Docker container (or alternatively, a Singularity/Apptainer container) to ensure a similar environment to our scoring and testing environments. +Both options are described in detail in the [**Getting Started**](/GETTING_STARTED.md) document. -or all workloads at once via +*TL;DR to install the Jax version for GPU run:* ```bash +pip3 install -e '.[pytorch_cpu]' +pip3 install -e '.[jax_gpu]' -f 'https://storage.googleapis.com/jax-releases/jax_cuda_releases.html' pip3 install -e '.[full]' ``` -
- -### Docker - -We recommend using a Docker container to ensure a similar environment to our scoring and testing environments. -Alternatively, a Singularity/Apptainer container can also be used (see instructions below). - -We recommend using a Docker container to ensure a similar environment to our scoring and testing environments. - -#### Prerequisites -- NVIDIA Driver version 535.104.05 -- NVIDIA Container Toolkit so that the containers can locate the NVIDIA drivers and GPUs. -See instructions [here](https://github.com/NVIDIA/nvidia-docker). - -#### Building Docker Image - -1. Clone this repository - - ```bash - cd ~ && git clone https://github.com/mlcommons/algorithmic-efficiency.git - ``` - -2. Build Docker Image - - ```bash - cd algorithmic-efficiency/docker - docker build -t . --build-arg framework= - ``` - - The `framework` flag can be either `pytorch`, `jax` or `both`. Specifying the framework will install the framework specific dependencies. - The `docker_image_name` is arbitrary. - -#### Running Docker Container (Interactive) - -To use the Docker container as an interactive virtual environment, you can run a container mounted to your local data and code directories and execute the `bash` program. This may be useful if you are in the process of developing a submission. - -1. Run detached Docker Container. The container_id will be printed if the container is run successfully. - - ```bash - docker run -t -d \ - -v $HOME/data/:/data/ \ - -v $HOME/experiment_runs/:/experiment_runs \ - -v $HOME/experiment_runs/logs:/logs \ - -v $HOME/algorithmic-efficiency:/algorithmic-efficiency \ - --gpus all \ - --ipc=host \ - \ - --keep_container_alive true - ``` - Note: You may have to use double quotes around `algorithmic-efficiency` [path] in the mounting `-v` flag. 
If the above command fails try replacing the following line: - ```bash - -v $HOME/algorithmic-efficiency:/algorithmic-efficiency2 \ - ``` - with - ``` - -v $HOME"/algorithmic-efficiency:/algorithmic-efficiency" \ - ``` - - Open a bash terminal - ```bash - docker exec -it /bin/bash - ``` - -#### Running Docker Container (End-to-end) - -To run a submission end-to-end in a containerized environment see [Getting Started Document](./getting_started.md#run-your-submission-in-a-docker-container). - -### Using Singularity/Apptainer instead of Docker - -Since many compute clusters don't allow the usage of Docker due to securtiy concerns and instead encourage the use of [Singularity/Apptainer](https://github.com/apptainer/apptainer) (formerly Singularity, now called Apptainer), we also provide instructions on how to build an Apptainer container based on the here provided Dockerfile. - -To convert the Dockerfile into an Apptainer definition file, we will use [spython](https://github.com/singularityhub/singularity-cli): - -```bash -pip3 install spython -cd algorithmic-efficiency/docker -spython recipe Dockerfile &> Singularity.def -``` - -Now we can build the Apptainer image by running - -```bash -singularity build --fakeroot .sif Singularity.def -``` - -To start a shell session with GPU support (by using the `--nv` flag), we can run +*TL;DR to install the PyTorch version for GPU run:* ```bash -singularity shell --nv .sif +pip3 install -e '.[jax_cpu]' +pip3 install -e '.[pytorch_gpu]' -f 'https://download.pytorch.org/whl/torch_stable.html' +pip3 install -e '.[full]' ``` -Similarly to Docker, Apptainer allows you to bind specific paths on the host system and the container by specifying the `--bind` flag, as explained [here](https://docs.sylabs.io/guides/3.7/user-guide/bind_paths_and_mounts.html). - ## Getting Started -For instructions on developing and scoring your own algorithm in the benchmark see [Getting Started Document](./getting_started.md). 
+For detailed instructions on developing and scoring your own algorithm in the benchmark see the [Getting Started](/GETTING_STARTED.md) document. -### Running a workload - -To run a submission directly by running a Docker container, see [Getting Started Document](./getting_started.md#run-your-submission-in-a-docker-container). - -From your virtual environment or interactively running Docker container run: - -#### JAX +*TL;DR running a JAX workload:* ```bash python3 submission_runner.py \ @@ -228,7 +72,7 @@ python3 submission_runner.py \ --tuning_search_space=baselines/adamw/tuning_search_space.json ``` -#### Pytorch +*TL;DR running a PyTorch workload:* ```bash python3 submission_runner.py \ @@ -240,107 +84,40 @@ python3 submission_runner.py \ --tuning_search_space=baselines/adamw/tuning_search_space.json ``` -
- -Using Pytorch DDP (Recommended) - - -When using multiple GPUs on a single node it is recommended to use PyTorch's [distributed data parallel](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). -To do so, simply replace `python3` by - -```bash -torchrun --standalone --nnodes=1 --nproc_per_node=N_GPUS -``` - -where `N_GPUS` is the number of available GPUs on the node. To only see output from the first process, you can run the following to redirect the output from processes 1-7 to a log file: - -```bash -torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=8 - ``` - -So the complete command is for example: - -```bash -torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=8 \ -submission_runner.py \ - --framework=pytorch \ - --workload=mnist \ - --experiment_dir=$HOME/experiments \ - --experiment_name=baseline \ - --submission_path=baselines/adamw/jax/submission.py \ - --tuning_search_space=baselines/adamw/tuning_search_space.json -``` +## Competition Rules -
+The competition rules for the *AlgoPerf: Training algorithms* benchmark competition can be found in the separate [**Competition Rules**](/RULES.md) document. -## Rules +## Technical Documentation & FAQs -The rules for the MLCommons Algorithmic Efficency benchmark can be found in the seperate [rules document](RULES.md). Suggestions, clarifications and questions can be raised via pull requests. +We provide additional technical documentation and answer frequently asked questions in a separate [**Documentation**](/DOCUMENTATION.md) page. Suggestions, clarifications and questions can be raised via pull requests, creating an issue, or by sending an email to the [working group](mailto:algorithms@mlcommons.org). ## Contributing -If you are interested in contributing to the work of the working group, feel free to [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/), open issues. See our [CONTRIBUTING.md](CONTRIBUTING.md) for MLCommons contributing guidelines and setup and workflow instructions. +We invite everyone to look through our rules, documentation, and codebase and submit issues and pull requests, e.g. for rules changes, clarifications, or any bugs you might encounter. If you are interested in contributing to the work of the working group and influencing the benchmark's design decisions, please [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/) and consider becoming a member of the working group. +Our [**Contributing**](/CONTRIBUTING.md) document provides further MLCommons contributing guidelines and additional setup and workflow instructions. -# Disclaimers +## License -## Shared data pipelines between JAX and PyTorch +The *AlgoPerf* codebase is licensed under the [Apache License 2.0](/LICENSE.md). -The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT workloads are using the same TensorFlow input pipelines.
Due to differences in how Jax and PyTorch distribute computations across devices, the PyTorch workloads have an additional overhead for these workloads. +## Paper and Citing the AlgoPerf Benchmark -Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details. -While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example. +In our paper ["Benchmarking Neural Network Training Algorithms"](http://arxiv.org/abs/2306.07179) we motivate, describe, and justify the *AlgoPerf: Training algorithms* benchmark. -## Pytorch Conformer CUDA OOM +If you are using the *AlgoPerf benchmark*, its codebase, baselines, or workloads, please consider citing our paper: -The conformer pytorch workload may run out of memory in current state. Please set the `submission_runner.py` flag `reduce_pytorch_max_split_size` to `True` as a temporary workaround if you encounter this issue. This will set 'max_split_size_mb:256'. Note that this will adversely impact the performance of the submission on this workload. 
See [tracking issue](https://github.com/mlcommons/algorithmic-efficiency/issues/497). - - -# FAQS - -## Setup and Platform - -### My machine only has one GPU. How can I use this repo? -You can run this repo on a machine with an arbitrary number of GPUs. However, the default batch sizes in our reference algorithms `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms` are tuned for a machine with 8 16GB V100 GPUs. You may run into OOMs if you run these algorithms with fewer than 8 GPUs. If you run into these issues because you are using a machine with less total GPU memory, please reduce the batch sizes for the submission. Note that your final submission must 'fit' -on the benchmarking hardware, so if you are using fewer -GPUs with higher per GPU memory, please monitor your memory usage -to make make sure it will fit on 8xV100 GPUs with 16GB of VRAM per card. - -### How do I run this on my SLURM cluster? -You may run into issues with `sudo` and `docker` on a SLURM cluster. To run the workloads in a SLURM cluster you can use Apptainer (previously Singularity), see this [section](using-singularity/apptainer-instead-of-docker). -### How can I run this on my AWS/GCP/Azure cloud project? - Depending on your virtual machine, you may have to install the correct GPU drivers and the NVIDIA Docker toolkit. For example, in GCP you will have to do the following. -1. If you don't have a VM instance yet, we recommend creating a -new Compute Instance with the "Deep Learning on Linux" Image in Boot disk options. -2. To install the NVIDIA Docker toolkit, you can use `scripts/cloud-startup.sh` as a startup script for the VM. This will automate the installation of the NVIDIA GPU Drivers and NVIDIA Docker toolkit. - -## Submissions -### Can submission be structured using multiple files? -Yes, your submission can be structured using multiple files. -### Can I install custom dependencies? 
-You may use custom dependencies as long as they do not conflict with any of the pinned packages in `algorithmic-efficiency/setup.cfg`. -To include your custom dependencies in your submission, please include them in a requirements.txt file. Please refer to the [Software dependencies](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#software-dependencies) section of our rules. -### How can I know if my code can be run on benchmarking hardware? -The benchmarking hardware specifications are documented in the [Getting Started Document](./getting_started.md). -We recommend monitoring your submission's memory usage so that it does not exceed the available memory -on the competition hardware. We also recommend to do a dry run using a cloud instance. -### Are we allowed to use our own hardware to self-report the results? -You only have to use the competition hardware for runs that are directly involved in the scoring procedure. This includes all runs for the self-tuning ruleset, but only the runs of the best hyperparameter configuration in each study for the external tuning ruleset. For example, you could use your own (different) hardware to tune your submission and identify the best hyperparameter configuration (in each study) and then only run this configuration (i.e. 5 runs, one for each study) on the competition hardware. - -# Citing AlgoPerf Benchmark -If you use the **AlgoPerf** Benchmark in your work, please consider citing: - -> [George E. Dahl, Frank Schneider, Zachary Nado, et al.
+> [Dahl, Schneider, Nado, et al.
> **Benchmarking Neural Network Training Algorithms**
> *arXiv 2306.07179*](http://arxiv.org/abs/2306.07179) ```bibtex -@misc{dahl2023algoperf, - title={{Benchmarking Neural Network Training Algorithms}}, - author={Dahl, George E. and Schneider, Frank and Nado, Zachary and Agarwal, Naman and Sastry, Chandramouli Shama and Hennig, Philipp and Medapati, Sourabh and Eschenhagen, Runa and Kasimbeg, Priya and Suo, Daniel and Bae, Juhan and Gilmer, Justin and Peirson, Abel L. and Khan, Bilal and Anil, Rohan and Rabbat, Mike and Krishnan, Shankar and Snider, Daniel and Amid, Ehsan and Chen, Kongtao and Maddison, Chris J. and Vasudev, Rakshith and Badura, Michal and Garg, Ankush and Mattson, Peter}, - year={2023}, - eprint={2306.07179}, - archivePrefix={arXiv}, - primaryClass={cs.LG} +@Misc{Dahl2023AlgoPerf, + title = {{Benchmarking Neural Network Training Algorithms}}, + author = {Dahl, George E. and Schneider, Frank and Nado, Zachary and Agarwal, Naman and Sastry, Chandramouli Shama and Hennig, Philipp and Medapati, Sourabh and Eschenhagen, Runa and Kasimbeg, Priya and Suo, Daniel and Bae, Juhan and Gilmer, Justin and Peirson, Abel L. and Khan, Bilal and Anil, Rohan and Rabbat, Mike and Krishnan, Shankar and Snider, Daniel and Amid, Ehsan and Chen, Kongtao and Maddison, Chris J. 
and Vasudev, Rakshith and Badura, Michal and Garg, Ankush and Mattson, Peter}, + year = {2023}, + archiveprefix = {arXiv}, + eprint = {2306.07179}, } -``` \ No newline at end of file +``` diff --git a/RULES.md b/RULES.md index 041a45400..1f132688b 100644 --- a/RULES.md +++ b/RULES.md @@ -1,5 +1,7 @@ # MLCommons™ AlgoPerf: Competition Rules +**Version:** 0.0.1 *(Last updated November 14, 2023)* + ## Table of Contents - [Goal](#goal) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md deleted file mode 100644 index 227d6128b..000000000 --- a/SUBMISSION_PROCESS_RULES.md +++ /dev/null @@ -1,171 +0,0 @@ -# MLCommons™ AlgoPerf: Submission Process Rules - -**Version:** 0.0.3 *(Last updated 10 Oktober 2023)* - -## Table of Contents - -- [Basics](#basics) -- [Schedule](#schedule) - - [Dates](#dates) - - [Version freeze](#version-freeze) - - [Submission deadline](#submission-deadline) -- [Submission](#submission) - - [Register a submission](#register-a-submission) - - [How to submit](#how-to-submit) - - [Submission repository](#submission-repository) - - [Licensing](#licensing) - - [Multiple Submission](#multiple-submission) -- [Scoring](#scoring) - - [Self-reporting scores](#self-reporting-scores) - - [Verifying scores](#verifying-scores) - - [Sampling held-out workloads and hyperparameters](#sampling-held-out-workloads-and-hyperparameters) - - [Leaderboard](#leaderboard) -- [Sprit jury \& challenging submissions](#sprit-jury--challenging-submissions) -- [Awards and prize money](#awards-and-prize-money) - - [Awards committee](#awards-committee) -- [Ineligibility and conflict of interest](#ineligibility-and-conflict-of-interest) - -## Basics - -This is the submission process rules for the AlgoPerf: Training Algorithms Benchmark. It describes the process of submitting a new training algorithm and details how it will be scored. 
This process applies to both the external tuning ruleset and the self-tuning ruleset although, for all intents and purposes, they are two separate competitions, with separate leaderboards. - -Three additional documents complement this document: - -- [**Benchmark rules**](RULES.md): While the submission process rules detail the *logistical* aspects of submitting to the AlgoPerf: Training Algorithms Benchmark, the [rules document](RULES.md) describes the *scientific* rules of the competition. This includes, for example, how tuning is performed in each ruleset, what types of submissions are allowed, or how the benchmark score is computed. -- [**AlgoPerf paper**](https://arxiv.org/abs/2306.07179): The paper titled ["Benchmarking Neural Network Training Algorithms"](https://arxiv.org/abs/2306.07179) motivates the need for the benchmark, explains the rules, and justifies the specific design choices of the AlgoPerf: Training Algorithms Benchmark. Additionally, it evaluates baseline submissions, constructed using various optimizers like Adam, Shampoo, or SAM, on the benchmark, demonstrating the feasibility but also the difficulty of the benchmark. -- [**Benchmark codebase**](https://github.com/mlcommons/algorithmic-efficiency): The codebase implements the rules, provides exact specifications of the workloads, and it will ultimately be used to score submissions. - -## Schedule - -### Dates - -- **Publication of the call for submission: 17. October 2023 (08:00 AM UTC)** -- Registration deadline to express non-binding intent to submit: 15. December 2023 (08:00 AM UTC) -- Version freeze for the benchmark codebase: 17. January 2024 (08:00 AM UTC) -- **Submission deadline: 15. February 2024 (08:00 AM UTC)** -- Sampling the held-out workloads and hyperparameters: 16. February 2024 (08:00 AM UTC) -- Deadline for specifying the submission batch sizes for held-out workloads: 28. February 2024 (08:00 AM UTC) -- Deadline for self-reporting results: 10. 
April 2024 (08:00 AM UTC) -- **[extra tentative] Announcement of all results: 22. May 2024 (08:00 AM UTC)** - -The presented dates are subject to change and adjustments may be made by the [MLCommmons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). - -### Version freeze - -The benchmark code base is subject to change after the call for submissions is published. For example, while interacting with the codebase, if submitters encounter bugs or API limitations, they have the option to issue a bug report. This might lead to modifications of the benchmark codebase even after the publication of the call for submissions. - -To ensure that all submitters can develop their submissions based on the same code that will be utilized for scoring, we will freeze the package versions of the codebase dependencies before the submission deadline. By doing so, we level the playing field for everyone involved, ensuring fairness and consistency in the assessment of submissions. We will also try to minimize changes to the benchmark codebase as best as possible. - -### Submission deadline - -With the submission deadline, all submissions need to be available as a *public* repository with the appropriate license (see the [Licensing section](#licensing)). No changes to the submission code are allowed after the submission deadline (with the notable exception of specifying the batch size for the - at that point unknown - held-out workloads). Once the submission deadline has passed, the working group will publish a list of all submitted algorithms, along with their associated repositories. Anyone has the right to challenge a submission, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition, see the [Spirit jury section](#sprit-jury--challenging-submissions). - -Directly after the submission deadline, all randomized aspects of the competition are fixed. 
This includes sampling the held-out workloads from the set of randomized workloads, as well as, sampling the hyperparameters for each submission in the external tuning ruleset (for more details see the [Sampling held-out workloads and hyperparameters section](#sampling-held-out-workloads-and-hyperparameters)). After that, submitters can now ascertain the appropriate batch size of their submission on each held-out workload and self-report scores on either the qualification set or the full benchmarking set of workloads including both fixed and held-out workloads (see the [Self-reporting scores section](#self-reporting-scores)). - -## Submission - -For a guide on the technical steps and details on how to write a submission, please refer to the [**Getting started document**](GETTING_STARTED.md). Additionally, the folders [/reference_algorithms](/reference_algorithms/) and [/baselines](/baselines/) provide example submissions that can serve as a template for creating new submissions. - -In the following, we describe the logistical steps required to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark. - -### Register an intent to submit - -All submitters need to register an intent to submit before the submission registration deadline. This registration is mandatory, i.e. required for all submissions, but not binding, i.e. you don't have to submit a registered submission. This registration is necessary, to estimate the number of submissions and provide support for potential submitters. - -To register an intent to submission, please fill out this [online form](https://forms.gle/iY1bUhwSjj1JZ4fa9) with the following information - -- Name of the submission (e.g. name of the algorithm, or any other arbitrary identifier). -- Ruleset under which the submission will be scored. -- Name, email, and affiliations of all submitters associated with this submission. -- Interest in compute support. 
- -The submission will be issued a unique **submission ID** that will be used throughout the submission process. - -### How to submit - -Submitters have the flexibility to submit their training algorithm anytime between the registration of the submission and the submission deadline. To submit a submission, please write an email to with the subject "[Submission] *submission_ID*" and the following information: - -- Submission ID. -- URL of the associated *public* GitHub repository. -- If applicable, a list of all changes to the names, emails, or affiliations compared to the registration of the submission. -- A digital version of all relevant licensing documents (see the [Licensing section](#licensing)). - -#### Submission repository - -The *public* GitHub repository needs to be a clone of the frozen `main` branch of the [benchmark codebase](https://github.com/mlcommons/algorithmic-efficiency). All elements of the original codebase, except for the `/submission` directory need to be unaltered from the original benchmark code. In particular, the repository must use the same [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0) as the benchmark codebase. Once the submission deadline has passed, modifications of the submission repository's code are generally prohibited. The sole exception to this rule is the definition of the batch sizes for the held-out workloads. - -Any software dependencies required for the submission need to be defined in a `requirements.txt` file within the `/submission` directory. This file needs to be `pip` readable, i.e. installable via `pip install -r requirements.txt`. In order to comply with the rules, submissions are not allowed to modify the used package version of the software dependencies of the benchmarking codebase, e.g. by using a different version of PyTorch or JAX (see [](RULES.md#disallowed-submissions)). 
- -#### Licensing - -Submitting to the AlgoPerf: Training Algorithms Benchmark requires the following legal considerations: - -- A signed [Contributor License Agreement (CLA) "Corporate CLA"](https://mlcommons.org/en/policies/) of MLCommons. -- *Either* a membership in MLCommons *or* a signed [non-member test agreement](https://mlcommons.org/en/policies/). -- A signed trademark license agreement, either the member or the non-member version, as appropriate. These license agreements are available upon request to [support@mlcommons.org](mailto:support@mlcommons.org). - -We furthermore require all submissions to be made available open source on the submission deadline under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0). - -### Multiple Submission - -Our benchmark allows multiple submissions by the same submitter(s). However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. - -Submitters may submit algorithms marked as *baselines*. These might include existing algorithms with different search spaces or learning rate schedules. These baseline algorithms are not eligible for winning the competition or prize money but they are also not required to be "substantially different" from other submissions by the same submitters. - -## Scoring - -### Self-reporting scores - -Submitters are expected to self-report scores on the full benchmark set before the deadline for self-reporting results. Reporting the scores involves providing all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder. 
For submissions competing in the external tuning ruleset, this includes all the logs of the tuning trials using the [hyperparameter samples provided by the working group](#sampling-held-out-workloads-and-hyperparameters). Note, that while the tuning runs can be performed on non-competition hardware, they still need to show that the "winning hyperparameter configuration" in each study was selected according to the [tuning rules](/RULES.md#external-tuning-ruleset), i.e. the fastest hyperparameter to reach the validation target. Additionally, the logs of the "winning hyperparameter configuration" (or each trial, in the self-tuning ruleset) in each of the five studies need to be computed on the competition hardware, to allow wall-clock runtime comparisons. - -Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set) that excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. - -#### Verifying scores - -The working group will independently verify the scores of the highest-scoring submissions in each ruleset. Results that have been verified by the working group will be clearly marked on the leaderboard. - -### Sampling held-out workloads and hyperparameters - -After the submission deadline has passed and all submission code is frozen, the working group will sample a specific instance of held-out workloads from the set of randomized workloads. Additionally, every submission in the external tuning ruleset will receive its specific set of 5x20 hyperparameter values grouped by study. 
This set of hyperparameter values is sampled from the search space provided by the submitters. - -The sampling code for the held-out workloads and the hyperparameters is publicly available (**TODO link to both functions!**). Both sampling functions take as input a random seed, which will be provided by a trusted third party after the submission deadline. - -### Leaderboard - -The announcement of the results will contain two separate leaderboards, one for the self-tuning and one for the external tuning ruleset. All valid submissions will be ranked by the benchmark score, taking into account all workloads, including the held-out ones. The leaderboard will clearly mark scores that were verified by the working group. - -## Sprit jury & challenging submissions - -The spirit jury, consisting of selected active members of the working group, will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters with specific concerns about a particular submission can request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. This request must be made reasonably in advance of the results announcement deadline to allow the Spirit Jury sufficient time to conduct a thorough review. - -The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Example cases that might be reviewed by the spirit jury are cases of multiple similar submissions by the same submitter or extensive workload-specific tuning. 
- -## Awards and prize money - -An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as a "*Innovative Submission Award*". The prize for the best-performing submission will take into account the [benchmark score](RULES.md#benchmark-score-using-performance-profiles) on the full benchmark. The "*Innovative Submission Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc. - -The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Innovative Submission Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions. - -If a submission is ineligible to win prize money it can still win an award. The prize money will then go to the highest-ranking eligible submission. - -### Awards committee - -The awards committee will be responsible for awarding prize money to submissions. The committee will try to reach a consensus on how to award prize money and settle disagreements by majority vote, if necessary. - -**TODO Who is on the Awards committee?** - -## Ineligibility and conflict of interest - -To ensure a fair process and avoid conflicts of interest, some individuals and institutions are ineligible to win prize money. This includes: - -- The chairs of the MLCommons Algorithms Working Group (presently *George Dahl* and *Frank Schneider*) and their associated institutions (currently *Google Inc.* and the *University of Tübingen*) -- All individuals serving on the awards committee and their associated institutions. - -A submission with at least one participating ineligible entity may still win an award, but the prize money will then be given to the top-ranked submission that does not contain ineligible entities. 
- -Additionally, we require members of the spirit jury to abstain from being involved in a review if: - -- They are part of the reviewed submission. -- The reviewed submission contains individuals from their institution. - -The spirit jury can still take a decision if at least one member of the jury is without a conflict of interest. From da11c6766c119ad248eb03d0530cbbed3e342a4d Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 14 Nov 2023 17:54:42 +0100 Subject: [PATCH 08/27] Clarify tuning for held-out workloads --- DOCUMENTATION.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index b6b48849f..e8957f9c6 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -31,7 +31,7 @@ - [My machine only has one GPU. How can I use this repo?](#my-machine-only-has-one-gpu-how-can-i-use-this-repo) - [How do I run this on my SLURM cluster?](#how-do-i-run-this-on-my-slurm-cluster) - [How can I run this on my AWS/GCP/Azure cloud project?](#how-can-i-run-this-on-my-awsgcpazure-cloud-project) - - [Submissions](#submissions-1) + - [Submitting](#submitting) - [Can I submit multiple times to the benchmark competition?](#can-i-submit-multiple-times-to-the-benchmark-competition) - [Can my submission be structured using multiple files?](#can-my-submission-be-structured-using-multiple-files) - [Can I install custom dependencies?](#can-i-install-custom-dependencies) @@ -495,7 +495,7 @@ For the benchmark score, we compute and integrate the performance profiles using - Reach the validation target on the held-out workload (corresponding to the fixed workload) within the maximum runtime. - Reach the validation target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. 
This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms. -Only if all four requirements are met, does the submission get a finite score. Otherwise, a submission will receive a training time of infinity. +Only if all four requirements are met, does the submission get a finite score. Otherwise, a submission will receive a training time of infinity. Note that the tuning process works the same for held-out workloads as for the fixed workloads, i.e. in the external tuning ruleset there are multiple tuning trials and only the fastest trial per study is relevant for scoring. This essentially means that being unable to successfully train a held-out workload can "disqualify" a submission from getting a good score on the fixed workload it is based on. In other words, we require submissions to be robust enough to handle workload variations. This protocol ensures that we prioritize the fixed workloads for scoring since they are the most relevant version of that workload in practice. However, we also protect our benchmark from egregious workload-specific tuning and penalize brittle methods that break with slight modifications of the workload. @@ -537,7 +537,7 @@ Depending on your virtual machine, you may have to install the correct GPU drive new Compute Instance with the "Deep Learning on Linux" Image in Boot disk options. 2. To install the NVIDIA Docker toolkit, you can use `scripts/cloud-startup.sh` as a startup script for the VM. This will automate the installation of the NVIDIA GPU Drivers and NVIDIA Docker toolkit. -### Submissions +### Submitting #### Can I submit multiple times to the benchmark competition? 
From f182e32490e0506a8e17758bfff643628d40e647 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 14 Nov 2023 17:56:15 +0100 Subject: [PATCH 09/27] Add numerical description as well --- RULES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RULES.md b/RULES.md index 1f132688b..f89ada3d8 100644 --- a/RULES.md +++ b/RULES.md @@ -27,7 +27,7 @@ ## Goal -To discover new training algorithms that can train general neural networks faster. Sponsor will use an objective measuring program to allocate a score to each entry ("Submission") and determine two winners (one in each ruleset), each of which will be eligible to win a prize. +To discover new training algorithms that can train general neural networks faster. Sponsor will use an objective measuring program to allocate a score to each entry ("Submission") and determine two (2) winners (one in each ruleset), each of which will be eligible to win a prize. ## Sponsor From a87b36cd37db221a0a08df247426ed3137ffd87b Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 16 Nov 2023 10:39:40 +0100 Subject: [PATCH 10/27] Rename rules -> competition rules --- RULES.md => COMPETITION_RULES.md | 0 DOCUMENTATION.md | 6 +++--- README.md | 6 +++--- scoring/performance_profile.py | 4 ++-- submissions/template/submission.py | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) rename RULES.md => COMPETITION_RULES.md (100%) diff --git a/RULES.md b/COMPETITION_RULES.md similarity index 100% rename from RULES.md rename to COMPETITION_RULES.md diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index e8957f9c6..1b209fadc 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -66,7 +66,7 @@ The intention is that a training algorithm submission will be broadly applicable ### Competition Rules -For a description of the competition rules and how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Competition Rules](/RULES.md), which details the entire competition process. 
+For a description of the competition rules and how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Competition Rules](/COMPETITION_RULES.md), which details the entire competition process. ### Submissions @@ -421,7 +421,7 @@ Our scoring procedure uses the held-out workloads only to penalize submissions t #### Qualification set -The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prizes](/RULES.md#prizes). +The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prizes](/COMPETITION_RULES.md#prizes). The qualification set consists of the same [fixed workloads](#fixed-workloads) as mentioned above, except for both workloads on *ImageNet*, both workloads on *LibriSpeech*, and the *fastMRI* workload. The remaining three workloads (*WMT*, *Criteo 1TB*, and *OGBG*) form the qualification set. There are no [randomized workloads](#randomized-workloads) in the qualification set. The qualification set of workloads aims to have a combined runtime of roughly 24 hours on the [benchmarking hardware](#benchmarking-hardware). 
@@ -565,7 +565,7 @@ You only have to use the benchmarking hardware for runs that are directly involv #### What can I do if running the benchmark is too expensive for me? -Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set) that excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. +Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/COMPETITION_RULES.md#qualification-set) that excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. #### Can I submit existing (i.e. published) training algorithms as submissions? diff --git a/README.md b/README.md index bbd5f1c66..dae868260 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Paper (arXiv)InstallationGetting Started • - Competition Rules • + Competition RulesDocumentationContributingLicense @@ -22,7 +22,7 @@ --- -> *AlgoPerf* is a suite of benchmarks and competitions to measure neural network training speedups due to algorithmic improvements in both training algorithms and models. This is the repository for the *AlgoPerf: Training algorithms benchmark*. 
It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). This repository holds the [**competition rules**](/RULES.md), the [**technical documentation**](/DOCUMENTATION.md), [**getting started guides**](/GETTING_STARTED.md), and the benchmark code to run it. For a detailed description of the benchmark design, see our [**paper**](https://arxiv.org/abs/2306.07179). +> *AlgoPerf* is a suite of benchmarks and competitions to measure neural network training speedups due to algorithmic improvements in both training algorithms and models. This is the repository for the *AlgoPerf: Training algorithms benchmark*. It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). This repository holds the [**competition rules**](/COMPETITION_RULES.md), the [**technical documentation**](/DOCUMENTATION.md) of the benchmark, [**getting started guides**](/GETTING_STARTED.md), and the benchmark code. For a detailed description of the benchmark design, see our [**paper**](https://arxiv.org/abs/2306.07179). ## Table of Contents @@ -86,7 +86,7 @@ python3 submission_runner.py \ ## Competition Rules -The competition rules for the *AlgoPerf: Training algorithms* benchmark competition can be found in the seperate [**Competition Rules**](/RULES.md) document. +The competition rules for the *AlgoPerf: Training algorithms* benchmark competition can be found in the seperate [**Competition Rules**](/COMPETITION_RULES.md) document. 
## Technical Documentation & FAQs diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py index e62e8e18e..84788c7ae 100644 --- a/scoring/performance_profile.py +++ b/scoring/performance_profile.py @@ -2,8 +2,8 @@ The three primary methods exposed by the `scoring` module are: - `compute_performance_profiles`: generates performance profiles for a set of - submissions over all workloads as defined in the scoring rules: - https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md + submissions over all workloads as defined in the scoring section: + https://github.com/mlcommons/algorithmic-efficiency/blob/main/DOCUMENTATION.md - `compute_leaderboard_score`: computes final scores from performance profiles. - `plot_performance_profiles`: plot performance profiles for a set of submissions. diff --git a/submissions/template/submission.py b/submissions/template/submission.py index 83297a7d9..f8089b571 100644 --- a/submissions/template/submission.py +++ b/submissions/template/submission.py @@ -1,8 +1,8 @@ """Template submission module. -See https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#allowed-submissions -and https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#disallowed-submissions -for guidelines. +See https://github.com/mlcommons/algorithmic-efficiency/blob/main/DOCUMENTATION.md#allowed-submissions +and https://github.com/mlcommons/algorithmic-efficiency/blob/main/DOCUMENTATION.md#disallowed-submissions +for guidelines. 
""" From 38cfe242ee43e3b9c714cfa15f7940dcd55fb07e Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 16 Nov 2023 10:44:30 +0100 Subject: [PATCH 11/27] Change tuning budget to 5 trials (instead of 20) --- DOCUMENTATION.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index 1b209fadc..94c82d9a9 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -374,9 +374,9 @@ Tuning will be substantially different for the [external](#external-tuning-rules #### External tuning ruleset -For each workload, the hyperparameters are tuned using $O=20$ tuning **trials**. To estimate the variance of the results, this tuning will be repeated for $S=5$ **studies**, for a total of $S\cdot O = 100$ different hyperparameter settings. The submitters will provide a workload-agnostic search space and the working group will then return $100$ hyperparameters settings obtained using [(quasi)random search](https://arxiv.org/abs/1706.03200). The working group will also randomly partition these $100$ trials into $5$ studies of $20$ trials each. In lieu of independent samples from a search space, submissions can instead supply a fixed list of $20$ hyper-parameter points that will be sampled without replacement. +For each workload, the hyperparameters are tuned using $O=5$ tuning **trials**. To estimate the variance of the results, this tuning will be repeated for $S=5$ **studies**, for a total of $S\cdot O = 25$ different hyperparameter settings. The submitters will provide a workload-agnostic search space and the working group will then return $25$ hyperparameters settings obtained using [(quasi)random search](https://arxiv.org/abs/1706.03200). The working group will also randomly partition these $25$ trials into $5$ studies of $5$ trials each. In lieu of independent samples from a search space, submissions can instead supply a fixed list of $5$ hyper-parameter points that will be sampled without replacement. 
-In each trial, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=20$ hyperparameter settings. For scoring, we use this required training time to reach the *validation targets* of those $5$ selected runs. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. +In each study, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=5$ hyperparameter settings. For scoring, we use this required training time to reach the *validation targets* of those $5$ selected runs. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. #### Self-tuning ruleset @@ -445,7 +445,7 @@ For self-reported results, it is acceptable to perform the tuning trials on hard #### Defining target performance -Target performances on the validation and test sets will be defined for each [workload](#workloads) separately. For the [fixed workloads](#fixed-workloads), we take the best performance achievable by one of four standard algorithms (AdamW, NadamW, Nesterov Momentum, and Heavy Ball Momentum). These target-setting algorithms will follow the general process of the external tuning ruleset, with a slightly larger tuning budget of $200$ trials to guarantee competitive performance.
Once the best algorithm and its hyperparameters are determined, training is repeated $20$ times. The median of the best achieved validation errors across seeds is used as the *validation* target. Out of the $10$ repeated runs that achieved this validation target, we took the worst achieved test error across seeds as our *test* target. Taking the median validation performance after rerunning the best hyperparameter point prevents our procedure from selecting a lucky outlier. +Target performances on the validation and test sets will be defined for each [workload](#workloads) separately. For the [fixed workloads](#fixed-workloads), we take the best performance achievable by one of four standard algorithms (AdamW, NadamW, Nesterov Momentum, and Heavy Ball Momentum). These target-setting algorithms will follow the general process of the external tuning ruleset, with a significantly larger tuning budget of $200$ trials to guarantee competitive performance. Once the best algorithm and its hyperparameters are determined, training is repeated $20$ times. The median of the best achieved validation errors across seeds is used as the *validation* target. Out of the $10$ repeated runs that achieved this validation target, we took the worst achieved test error across seeds as our *test* target. Taking the median validation performance after rerunning the best hyperparameter point prevents our procedure from selecting a lucky outlier. To save computational resources, we only tuned two training algorithms instead of four, for the [randomized workloads](#randomized-workloads). For each workload variant, we used NadamW and the other best-performing training algorithm on the corresponding fixed workload the randomized workload is based on. Both [tuning rulesets](#tuning) will use the same target performances. 
The runtime of the target-setting algorithms on each workload will be chosen to match published results and is constrained by the overall time budget of roughly a single week for all fixed workloads. The `max_runtime` for submissions on each workload is $\frac{1}{3}$ longer than the runtime of the target-setting algorithms (this `max_runtime` will be three times as much for the self-tuning ruleset, see the [Self-tuning ruleset](#self-tuning-ruleset) section). From 7c152a48ff4bf85d68a2b287af24cdaff865c26b Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 16 Nov 2023 13:07:05 +0100 Subject: [PATCH 12/27] Add CfS. Separate Benchmark and Competition --- CALL_FOR_SUBMISSIONS.md | 30 ++++++++++++++++++++++++++++++ README.md | 26 +++++++++++++++----------- 2 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 CALL_FOR_SUBMISSIONS.md diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md new file mode 100644 index 000000000..bb771ad1c --- /dev/null +++ b/CALL_FOR_SUBMISSIONS.md @@ -0,0 +1,30 @@ +# MLCommons™ AlgoPerf: Call for Submissions + +## Announcing the AlgoPerf: Training Algorithms Benchmark Competition + +Neural networks must be trained to be useful. However, training is a resource-intensive task, often demanding extensive compute and energy resources. +To promote faster training algorithms, the [MLCommons® Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/) is delighted to present the **AlgoPerf: Training Algorithms** benchmark. This benchmark competition is designed to measure neural network training speedups due to *algorithmic improvements*. 
We welcome submissions that implement both novel and existing training algorithms, including, but not limited to: + +- Optimizer update rules +- Hyperparameter tuning protocols, search spaces, or schedules +- Data sampling strategies + +Submissions can compete under two hyperparameter tuning rulesets (with separate prizes and awards): an external tuning ruleset meant to simulate tuning with a fixed amount of parallel resources, or a self-tuning ruleset for hyperparameter-free algorithms. + +## Dates + +- **Call for submissions: October 17th, 2023** +- Registration deadline to express non-binding intent to submit: December 15th, 2023 +- **Submission deadline: February 15th, 2024** +- Deadline for self-reporting preliminary results: April 10th, 2024 +- **[tentative] Announcement of all results: late May 2024** + +For a detailed and up-to-date timeline see the [Competition Rules](/COMPETITION_RULES.md). + +## Participation + +For details on how to participate in the competition, please refer to our [Competition Rules](/COMPETITION_RULES.md). To learn more about the benchmark, see our [technical documentation](/DOCUMENTATION.md). The benchmark is further motivated, explained, and justified in the accompanying [paper](https://arxiv.org/abs/2306.07179). We require all submissions to be provided under the open-source [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). + +## Prize Money & Funding + +MLCommons has provided a total of $50,000 in prize money for eligible winning submissions. We would also like to express our gratitude to Google for their generous support in providing computational resources to score the top submissions, and resources to help score some promising submissions from submitters with more limited resources. diff --git a/README.md b/README.md index dae868260..3e888a0a9 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,11 @@

Paper (arXiv) • - Installation • + Call for SubmissionsGetting StartedCompetition RulesDocumentation • - Contributing • - License + Contributing

[![CI](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/CI.yml/badge.svg)](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/CI.yml) @@ -22,14 +21,15 @@ --- -> *AlgoPerf* is a suite of benchmarks and competitions to measure neural network training speedups due to algorithmic improvements in both training algorithms and models. This is the repository for the *AlgoPerf: Training algorithms benchmark*. It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). This repository holds the [**competition rules**](/COMPETITION_RULES.md), the [**technical documentation**](/DOCUMENTATION.md) of the benchmark, [**getting started guides**](/GETTING_STARTED.md), and the benchmark code. For a detailed description of the benchmark design, see our [**paper**](https://arxiv.org/abs/2306.07179). +> *AlgoPerf* is a suite of benchmarks and competitions to measure neural network training speedups due to algorithmic improvements in both training algorithms and models. This is the repository for the *AlgoPerf: Training Algorithms benchmark* and its associated competition. It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). This repository holds the [**competition rules**](/COMPETITION_RULES.md), the [**technical documentation**](/DOCUMENTATION.md) of the benchmark, [**getting started guides**](/GETTING_STARTED.md), and the benchmark code. For a detailed description of the benchmark design, see our [**paper**](https://arxiv.org/abs/2306.07179). 
## Table of Contents - [Installation](#installation) - [Getting Started](#getting-started) -- [Competition Rules](#competition-rules) -- [Technical Documentation \& FAQs](#technical-documentation--faqs) +- [Call for Submissions](#call-for-submissions) + - [Competition Rules](#competition-rules) + - [Technical Documentation of the Benchmark \& FAQs](#technical-documentation-of-the-benchmark--faqs) - [Contributing](#contributing) - [License](#license) - [Paper and Citing the AlgoPerf Benchmark](#paper-and-citing-the-algoperf-benchmark) @@ -84,13 +84,17 @@ python3 submission_runner.py \ --tuning_search_space=baselines/adamw/tuning_search_space.json ``` -## Competition Rules +## Call for Submissions -The competition rules for the *AlgoPerf: Training algorithms* benchmark competition can be found in the seperate [**Competition Rules**](/COMPETITION_RULES.md) document. +The [Call for Submissions](/CALL_FOR_SUBMISSIONS.md) announces the first iteration of the AlgoPerf: Training Algorithms competition based on the benchmark by the same name. -## Technical Documentation & FAQs +### Competition Rules -We provide additional technical documentation and answer frequently asked questions in a separate [**Documentation**](/DOCUMENTATION.md) page. Suggestions, clarifications and questions can be raised via pull requests, creating an issue, or by sending an email to the [working group](mailto:algorithms@mlcommons.org). +The competition rules for the *AlgoPerf: Training Algorithms* competition can be found in the separate [**Competition Rules**](/COMPETITION_RULES.md) document. + +### Technical Documentation of the Benchmark & FAQs + +We provide additional technical documentation of the benchmark and answer frequently asked questions in a separate [**Documentation**](/DOCUMENTATION.md) page. Suggestions, clarifications and questions can be raised via pull requests, creating an issue, or by sending an email to the [working group](mailto:algorithms@mlcommons.org). 
## Contributing @@ -104,7 +108,7 @@ The *AlgoPerf* codebase is licensed under the [Apache License 2.0](/LICENSE.md). ## Paper and Citing the AlgoPerf Benchmark -In our paper ["Benchmarking Neural Network Training Algorithms"](http://arxiv.org/abs/2306.07179) we motivate, describe, and justify the *AlgoPerf: Training algorithms* benchmark. +In our paper ["Benchmarking Neural Network Training Algorithms"](http://arxiv.org/abs/2306.07179) we motivate, describe, and justify the *AlgoPerf: Training Algorithms* benchmark. If you are using the *AlgoPerf benchmark*, its codebase, baselines, or workloads, please consider citing our paper: From b4ad421159334c5f03ef9eed4c36139d8e44c5cf Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 16 Nov 2023 13:15:38 +0100 Subject: [PATCH 13/27] Remove the extra deadline for held-out batch sizes --- CALL_FOR_SUBMISSIONS.md | 4 ++-- COMPETITION_RULES.md | 6 ++---- DOCUMENTATION.md | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md index bb771ad1c..76b2d21b6 100644 --- a/CALL_FOR_SUBMISSIONS.md +++ b/CALL_FOR_SUBMISSIONS.md @@ -16,8 +16,8 @@ Submissions can compete under two hyperparameter tuning rulesets (with separate - **Call for submissions: October 17th, 2023** - Registration deadline to express non-binding intent to submit: December 15th, 2023 - **Submission deadline: February 15th, 2024** -- Deadline for self-reporting preliminary results: April 10th, 2024 -- **[tentative] Announcement of all results: late May 2024** +- **Deadline for self-reporting preliminary results: April 10th, 2024** +- [tentative] Announcement of all results: late May 2024 For a detailed and up-to-date timeline see the [Competition Rules](/COMPETITION_RULES.md). 
diff --git a/COMPETITION_RULES.md b/COMPETITION_RULES.md index f89ada3d8..8428232b5 100644 --- a/COMPETITION_RULES.md +++ b/COMPETITION_RULES.md @@ -43,7 +43,6 @@ The Competition begins at 12:01am (ET) on November 21, 2023 and ends at 11:59pm - **Intention to Submit.** You must register your Intention to Submit no later than 11:59pm ET on January 21, 2024. - **Submission Period.** You must complete your Submission and enter it after the Intention to Submit deadline, but no later than 11:59pm ET on March 21, 2024. -- **Deadline for specifying the Submission batch sizes for held-out workloads.** 11:59pm ET on April 4, 2024. - **Deadline for self-reporting results.** 11:59pm ET on May 21, 2024. ## Agreement to Official Rules @@ -52,7 +51,7 @@ By participating, Teams agree to be fully unconditionally bound by these Rules, ## How to Enter -There are five (5) steps to a successful submission ("Submission"). +There are four (4) steps to a successful submission ("Submission"). 1. **Register Intent to Submit.** Registration of intent does not obligate you to enter a Submission, but you must register prior to entering your Submission. Click for the [Intent Form](https://forms.gle/K7ty8MaYdi2AxJ4N8). This is your "Team," even if you are a single person. Please note that natural persons may not be on multiple teams, but each Team may enter multiple Submissions. 2. **Develop your Submission.** Develop your Submission according to the guidelines set forth in these rules, along with the links to various necessary information. Please note that all Submissions must be entered subject to the Apache 2.0 license. In order to develop your Submission, you must: @@ -66,8 +65,7 @@ There are five (5) steps to a successful submission ("Submission"). The form is sent to the working group chairs, who will process your Submission. Failure to complete the proper Submission Forms will results in disqualification of your Submission. 
At the close of the Submission Period, your GitHub repository must be public. -4. **Define the batch sizes for held-out workloads.** Once the held-out workloads have been sampled, you have until the "Deadline for specifying the Submission batch sizes for held-out workloads" to define the batch sizes for the held-out workloads via the `get_batch_size` function of your submission. -5. **Report Results.** Prior to the Deadline for self-reporting results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by uploading all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder f your Submission's GitHub repository. +4. **Report Results.** Prior to the Deadline for self-reporting results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by uploading all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder f your Submission's GitHub repository. ## Submission Conditions diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index 94c82d9a9..25f94e902 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -78,7 +78,7 @@ Any function defined in the reference implementations that isn't a [submission f In principle, submissions are allowed to use the available hardware systems in any data- or model-parallel manner they desire, within the constraints of the submission function APIs. However, in practice, model-parallelism may not be possible with the API. They are allowed to access any framework-specific device information necessary to exploit the hardware. -Submissions provide a [per-workload batch size](#batch-size-getter) to use. Specification of the batch size for each workload is necessary to avoid running out of memory for different workloads. 
Therefore, submitters can determine this batch size in advance and specify it as part of the submission. Submitters may also provide per-workload batch sizes for all [randomized workloads](#randomized-workloads). If no such batch size is provided for a randomized workload, by default, submissions will then use the batch size of the most similar [fixed workload](#fixed-workloads) (for example, if there is an ImageNet fixed workload and also a randomized workload with a similarly sized model on similarly sized images, the ImageNet batch size will be used for held-out workloads generated from this randomized workload). +Submissions provide a [per-workload batch size](#batch-size-getter) to use. Specification of the batch size for each workload is necessary to avoid running out of memory for different workloads. Therefore, submitters can determine this batch size in advance and specify it as part of the submission. Submitters may also provide per-workload batch sizes for all [randomized workloads](#randomized-workloads). If no such batch size is provided for a randomized workload, by default, submissions will then use the batch size of the most similar [fixed workload](#fixed-workloads) (for example, if there is an ImageNet fixed workload and also a randomized workload with a similarly sized model on similarly sized images, the ImageNet batch size will be used for held-out workloads generated from this randomized workload).1`` The **submission functions** are the *batch size getter*, *optimizer state initializer*, *variable update*, and *data selection functions*. The *fixed functions* are the *data augmentation/preprocessing*, *model initialization*, *forward pass*, and *loss function*. The trained model will be evaluated in a separate step that does not call any of the submitted code. @@ -168,7 +168,7 @@ def get_batch_size(workload_name: str) -> int - Submitters define a specific batch size for each [workload](#workloads). 
- For example, in advance, they can determine the largest batch size without running out of memory for each workload. -- For the [held-out workloads](#randomized-workloads), submitters may provide a batch size once the submission code is frozen and the held-out workloads are sampled from the randomized workloads. By default, this function will use the `workload_name` of the fixed workload it is based on. +- For the [held-out workloads](#randomized-workloads), by default, this function will use the `workload_name` of the fixed workload it is based on. ###### Optimizer state initializer From 8b14658ec3eb4ecb0fe6eec931b334fa75b2e1f2 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 16 Nov 2023 13:27:21 +0100 Subject: [PATCH 14/27] Remove spirit jury from technical documentation --- DOCUMENTATION.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index 25f94e902..de1a78e72 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -215,10 +215,8 @@ def update_params( - **A call to this function will be considered a step** - The time between a call to this function and the next call to this function will be considered the per-step time. - Cannot modify the given hyperparameters in a workload-conditional way (please see the [Valid submission](#valid-submissions) section). This rule is intended to prohibit circumventing the tuning rules by looking up a pre-tuned optimal set of hyperparameters for each workload. It is not intended to prohibit line searches and other similar techniques. - - This will be checked by the spirit jury. - The fixed `init_model_fn` can optionally be called during training, for example, to reinitialize the model after a failed training effort. - Cannot replace the model parameters with pre-trained ones. - - This will be checked by the spirit jury. - This API supports Polyak averaging and similar methods that implement moving averages of model parameters. 
- Batch norm should work here because the `model_fn` will return updated batch norm moving averages when it is told to with `update_batch_norm`. @@ -257,9 +255,7 @@ Submissions are eligible for an untimed eval every `eval_period` seconds, run as The intention of this benchmark is to identify training algorithm submissions that will be broadly applicable and effective in practical scenarios without customization to the specific [workload](#workloads) (model, dataset, and loss function). Generally useful training algorithms can train models faster and thus require less compute resources, decreasing the cost of machine learning. We want to discourage all submissions that sidestep the purpose of this benchmark. -We reserve the right to disqualify submissions if they clearly violate this spirit of the benchmark, even if those submissions perform well in our benchmark. Unfortunately, we can't easily write rules that make it completely clear if a submission is circumventing the spirit of the benchmark in a way that would encompass all possible cases. Instead, we will have to prohibit these activities in the abstract and defer rulings about specific submissions to a **"spirit [of the rules] jury"** that can hear the justifications of the submitters, inspect the code, and ultimately decide if the spirit of the rules has been violated. The jury might also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. - -We want to state clearly that we welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions, however, in some cases, routines that would be allowed in principle might not be practically feasible in the provided framework. The spirit jury, however, will only be invoked for submissions that aim to bypass the core premise of this benchmark since submissions like this would also be irrelevant in practice. 
+Unfortunately, we can't easily write rules that make it completely clear if a submission is circumventing the spirit of the benchmark in a way that would encompass all possible cases. We want to state clearly that we welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions; however, in some cases, routines that would be allowed in principle might not be practically feasible in the provided framework. In order to help clarify which submissions are [allowed](#allowed-submissions) and [disallowed](#disallowed-submissions), we described a few examples below. Two essential questions can help provide a general guideline for whether a submission is allowed or not: @@ -366,7 +362,7 @@ Valid submissions must rely on new algorithmic or mathematical ideas and should ##### Software dependencies -We require submissions to use specific versions of `PyTorch`/`JAX` as well as additional dependencies in order to facilitate fair comparisons. Submitters must build on top of these provided software packages, which might be provided as a `Docker` container. Additional dependencies can be added as long as they include a comment describing what was added and why. Submitters are free to add dependencies that support new algorithmic and mathematical ideas but they should not circumvent the intention of the benchmark to measure training speedups due to new training methods. For example, software engineering techniques that lead to faster implementations of existing software, e.g. using newer versions of `PyTorch` or `JAX`, are not allowed and these are described in more detail in the [Disallowed submissions](#disallowed-submissions) section. In case of doubts, these additional dependencies will be judged by the spirit jury. +We require submissions to use specific versions of `PyTorch`/`JAX` as well as additional dependencies in order to facilitate fair comparisons.
Submitters must build on top of these provided software packages, which might be provided as a `Docker` container. Additional dependencies can be added as long as they include a comment describing what was added and why. Submitters are free to add dependencies that support new algorithmic and mathematical ideas but they should not circumvent the intention of the benchmark to measure training speedups due to new training methods. For example, software engineering techniques that lead to faster implementations of existing software, e.g. using newer versions of `PyTorch` or `JAX`, are not allowed and these are described in more detail in the [Disallowed submissions](#disallowed-submissions) section. ### Tuning From 6fd1b2fb5383e467a28c94c71f3e1f5b742554cb Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 16 Nov 2023 14:20:48 +0100 Subject: [PATCH 15/27] Typo --- COMPETITION_RULES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COMPETITION_RULES.md b/COMPETITION_RULES.md index 8428232b5..ca470ea7e 100644 --- a/COMPETITION_RULES.md +++ b/COMPETITION_RULES.md @@ -65,7 +65,7 @@ There are four (4) steps to a successful submission ("Submission"). The form is sent to the working group chairs, who will process your Submission. Failure to complete the proper Submission Forms will results in disqualification of your Submission. At the close of the Submission Period, your GitHub repository must be public. -4. **Report Results.** Prior to the Deadline for self-reporting results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by uploading all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder f your Submission's GitHub repository. +4. 
**Report Results.** Prior to the Deadline for self-reporting results, run your Submission on either the qualification set or the full benchmark set and report the results. You must report your scores by uploading all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder of your Submission's GitHub repository. ## Submission Conditions From 39b8e352c18d719ba3e8c32197dc4718c79087b0 Mon Sep 17 00:00:00 2001 From: Frank Date: Thu, 16 Nov 2023 21:24:47 +0100 Subject: [PATCH 16/27] Typo --- DOCUMENTATION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index de1a78e72..1377433e4 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -78,7 +78,7 @@ Any function defined in the reference implementations that isn't a [submission f In principle, submissions are allowed to use the available hardware systems in any data- or model-parallel manner they desire, within the constraints of the submission function APIs. However, in practice, model-parallelism may not be possible with the API. They are allowed to access any framework-specific device information necessary to exploit the hardware. -Submissions provide a [per-workload batch size](#batch-size-getter) to use. Specification of the batch size for each workload is necessary to avoid running out of memory for different workloads. Therefore, submitters can determine this batch size in advance and specify it as part of the submission. Submitters may also provide per-workload batch sizes for all [randomized workloads](#randomized-workloads). 
If no such batch size is provided for a randomized workload, by default, submissions will then use the batch size of the most similar [fixed workload](#fixed-workloads) (for example, if there is an ImageNet fixed workload and also a randomized workload with a similarly sized model on similarly sized images, the ImageNet batch size will be used for held-out workloads generated from this randomized workload).1`` +Submissions provide a [per-workload batch size](#batch-size-getter) to use. Specification of the batch size for each workload is necessary to avoid running out of memory for different workloads. Therefore, submitters can determine this batch size in advance and specify it as part of the submission. Submitters may also provide per-workload batch sizes for all [randomized workloads](#randomized-workloads). If no such batch size is provided for a randomized workload, by default, submissions will then use the batch size of the most similar [fixed workload](#fixed-workloads) (for example, if there is an ImageNet fixed workload and also a randomized workload with a similarly sized model on similarly sized images, the ImageNet batch size will be used for held-out workloads generated from this randomized workload). The **submission functions** are the *batch size getter*, *optimizer state initializer*, *variable update*, and *data selection functions*. The *fixed functions* are the *data augmentation/preprocessing*, *model initialization*, *forward pass*, and *loss function*. The trained model will be evaluated in a separate step that does not call any of the submitted code. 
From ebd2559edf7518c274a5d91bc18c9334a515d085 Mon Sep 17 00:00:00 2001 From: Frank Date: Thu, 16 Nov 2023 21:27:13 +0100 Subject: [PATCH 17/27] Update deadline dates --- COMPETITION_RULES.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/COMPETITION_RULES.md b/COMPETITION_RULES.md index ca470ea7e..94238cae8 100644 --- a/COMPETITION_RULES.md +++ b/COMPETITION_RULES.md @@ -39,11 +39,11 @@ The Competition is open to English-speaking individuals and teams (made of indiv ## Competition Period -The Competition begins at 12:01am (ET) on November 21, 2023 and ends at 11:59pm (ET) on May 21, 2024, all according to Sponsor's time clock, which decisions are final (the "Competition Period"). There are several deadlines contained within the Competition Period: +The Competition begins at 12:01am (ET) on November 28, 2023 and ends at 11:59pm (ET) on May 28, 2024, all according to Sponsor's time clock, which decisions are final (the "Competition Period"). There are several deadlines contained within the Competition Period: -- **Intention to Submit.** You must register your Intention to Submit no later than 11:59pm ET on January 21, 2024. -- **Submission Period.** You must complete your Submission and enter it after the Intention to Submit deadline, but no later than 11:59pm ET on March 21, 2024. -- **Deadline for self-reporting results.** 11:59pm ET on May 21, 2024. +- **Intention to Submit.** You must register your Intention to Submit no later than 11:59pm ET on January 28, 2024. +- **Submission Period.** You must complete your Submission and enter it after the Intention to Submit deadline, but no later than 11:59pm ET on March 28, 2024. +- **Deadline for self-reporting results.** 11:59pm ET on May 28, 2024. 
## Agreement to Official Rules @@ -95,7 +95,7 @@ All Submission are subject to human review and testing to determine whether, in ## Notification -On or about June 30, 2024, the Selected Team with the best scores as determined by Sponsor will be notified that they are potential winners of the Competition. The Selected Team will be notified by either phone or email at the sole discretion of Sponsor or Sponsor's representative. Selected Team will be required to respond (as directed) to a phone and/or e-mail notification within 72 hours of attempted notification. The failure to respond timely to the notification may result in forfeiture of the prize; and, in such case, Sponsor may choose the next highest scoring Submission from among the remaining eligible Submissions. Selected Team members will each be required to sign and return a Declaration (or affidavit, at Sponsor's option) of Eligibility and Liability/Publicity Release ("Declaration") and any other documents Sponsor or Sponsor's representative may require within 72 hours of receipt of the Declaration. Failure to timely return a signed Declaration (or failure of a Team member to return it), or any other required documents or the return of any prize notification as undeliverable will result in Prize forfeiture. National and state income taxes may apply and are the sole responsibility of the winner. All expenses not specifically stated as being included are excluded, and are the responsibility of the Selected Teams. No assignment, transfer or substitution of Prize is permitted, however, Sponsor reserves the right to substitute a prize for one of comparable or greater value should Prize become impracticable to award or unavailable for any reason. +On or about July 15, 2024, the Selected Team with the best scores as determined by Sponsor will be notified that they are potential winners of the Competition. 
The Selected Team will be notified by either phone or email at the sole discretion of Sponsor or Sponsor's representative. Selected Team will be required to respond (as directed) to a phone and/or e-mail notification within 72 hours of attempted notification. The failure to respond timely to the notification may result in forfeiture of the prize; and, in such case, Sponsor may choose the next highest scoring Submission from among the remaining eligible Submissions. Selected Team members will each be required to sign and return a Declaration (or affidavit, at Sponsor's option) of Eligibility and Liability/Publicity Release ("Declaration") and any other documents Sponsor or Sponsor's representative may require within 72 hours of receipt of the Declaration. Failure to timely return a signed Declaration (or failure of a Team member to return it), or any other required documents or the return of any prize notification as undeliverable will result in Prize forfeiture. National and state income taxes may apply and are the sole responsibility of the winner. All expenses not specifically stated as being included are excluded, and are the responsibility of the Selected Teams. No assignment, transfer or substitution of Prize is permitted, however, Sponsor reserves the right to substitute a prize for one of comparable or greater value should Prize become impracticable to award or unavailable for any reason. 
## Prizes From 1a4db99f0af389d63faffdc4cef735b6da08195c Mon Sep 17 00:00:00 2001 From: Frank Date: Thu, 16 Nov 2023 21:28:06 +0100 Subject: [PATCH 18/27] Update deadline dates --- CALL_FOR_SUBMISSIONS.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md index 76b2d21b6..c177baf15 100644 --- a/CALL_FOR_SUBMISSIONS.md +++ b/CALL_FOR_SUBMISSIONS.md @@ -13,11 +13,11 @@ Submissions can compete under two hyperparameter tuning rulesets (with separate ## Dates -- **Call for submissions: October 17th, 2023** -- Registration deadline to express non-binding intent to submit: December 15th, 2023 -- **Submission deadline: February 15th, 2024** -- **Deadline for self-reporting preliminary results: April 10th, 2024** -- [tentative] Announcement of all results: late May 2024 +- **Call for submissions: November 28th, 2023** +- Registration deadline to express non-binding intent to submit: January 28th, 2024 +- **Submission deadline: March 28th, 2024** +- **Deadline for self-reporting preliminary results: May 28th, 2024** +- [tentative] Announcement of all results: July 15th 2024 For a detailed and up-to-date timeline see the [Competition Rules](/COMPETITION_RULES.md). 
From 1a669fcec55bc340ba3b2db33017ba3454e5ecda Mon Sep 17 00:00:00 2001 From: Frank Date: Thu, 16 Nov 2023 21:30:50 +0100 Subject: [PATCH 19/27] Missing comma --- CALL_FOR_SUBMISSIONS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md index c177baf15..30207ac7f 100644 --- a/CALL_FOR_SUBMISSIONS.md +++ b/CALL_FOR_SUBMISSIONS.md @@ -17,7 +17,7 @@ Submissions can compete under two hyperparameter tuning rulesets (with separate - Registration deadline to express non-binding intent to submit: January 28th, 2024 - **Submission deadline: March 28th, 2024** - **Deadline for self-reporting preliminary results: May 28th, 2024** -- [tentative] Announcement of all results: July 15th 2024 +- [tentative] Announcement of all results: July 15th, 2024 For a detailed and up-to-date timeline see the [Competition Rules](/COMPETITION_RULES.md). From 13ff2fec3642316646138e01ba539f3eb927eaac Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 23 Nov 2023 14:31:02 +0100 Subject: [PATCH 20/27] Add reference to threshold baselines --- COMPETITION_RULES.md | 2 +- .../prize_qualification_baselines/README.md | 20 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/COMPETITION_RULES.md b/COMPETITION_RULES.md index 94238cae8..85f16c4cf 100644 --- a/COMPETITION_RULES.md +++ b/COMPETITION_RULES.md @@ -79,7 +79,7 @@ Submissions must use specific versions of PyTorch and JAX, provided by Sponsor. ## Scoring -All otherwise qualified Submissions shall be scored. Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload, using measuring techniques designed to give all Submissions equal parity. In the event that no Submission receives a score exceeding that of the [NAdamW baseline](https://github.com/mlcommons/algorithmic-efficiency/tree/dev/baselines/nadamw), no prizes will be awarded. 
The Teams with the highest scores will be determined to be winners ("Selected Teams"). In the event of a tie the prize money will be split equally between the winners. +All otherwise qualified Submissions shall be scored. Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload, using measuring techniques designed to give all Submissions equal parity. In the event that no Submission in a ruleset receives a score exceeding that of both [prize qualification baselines](./reference_algorithms/prize_qualification_baselines/README.md), no prizes will be awarded for this ruleset. The Teams with the highest scores will be determined to be winners ("Selected Teams"). In the event of a tie the prize money will be split equally between the winners. ## Submissions diff --git a/reference_algorithms/prize_qualification_baselines/README.md b/reference_algorithms/prize_qualification_baselines/README.md index 614f87b32..8276887da 100644 --- a/reference_algorithms/prize_qualification_baselines/README.md +++ b/reference_algorithms/prize_qualification_baselines/README.md @@ -1,13 +1,13 @@ # Prize Qualification Baselines -This directory contains the baseine(s) that submissions that must beat to qualify for prizes. -TODO: link back to section in rules. +This directory contains the baseline(s) that submissions must beat to qualify for prizes, see the [Scoring Section](/COMPETITION_RULES.md#scoring) of the competition rules. 
## Externally Tuned Ruleset ### JAX -The prize qualification baseline submissions for jax are: +The prize qualification baseline submissions for JAX are: + - `reference_algorithms/prize_qualification_baselines/external_tuning/jax_nadamw_target_setting.py` - `feference_algorithms/prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py` @@ -27,9 +27,9 @@ python3 submission_runner.py \ ### PyTorch The prize qualification baseline submissionss for PyTorch are: -- `reference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_target_setting.py` -- `feference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py` +- `reference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_target_setting.py` +- `reference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py` Example command: @@ -48,11 +48,13 @@ torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc ### JAX -The prize qualification baseline submissionss for jax are: +The prize qualification baseline submissions for JAX are: + - `reference_algorithms/prize_qualification_baselines/external_tuning/jax_nadamw_target_setting.py` - `feference_algorithms/prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py` Example command: + ```bash python3 submission_runner.py \ --framework=jax \ @@ -67,10 +69,12 @@ python3 submission_runner.py \ ### PyTorch The prize qualification baseline submissionss for PyTorch are: -- `reference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_target_setting.py` + +- `reference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_target_setting.py` - `feference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py` Example command: + ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=8 submission_runner.py \ 
--framework=pytorch \ @@ -80,4 +84,4 @@ torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc --workload=\ --submission_path=reference_algorithms/prize_qualification_baselines/external_tuning/pytorch_nadamw_target_setting.py \ --tuning_ruleset=self -``` \ No newline at end of file +``` From 9977bc2ad05fc32b48a7c96f25a935b2a4540e41 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 23 Nov 2023 14:32:58 +0100 Subject: [PATCH 21/27] Update "multiple submission" FAQ --- DOCUMENTATION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index 1377433e4..20ceffd19 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -537,7 +537,7 @@ new Compute Instance with the "Deep Learning on Linux" Image in Boot disk option #### Can I submit multiple times to the benchmark competition? -Our benchmark allows multiple submissions by the same team of submitters. However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. +Our benchmark allows multiple submissions by the same team of submitters as long as they are substantially different. We disallow submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. Submitters may submit algorithms marked as *baselines*. These might include existing algorithms with different search spaces or learning rate schedules. 
These baseline algorithms are not eligible for winning the competition or prize money but they are also not required to be "substantially different" from other submissions by the same submitters. From fb811f6cafc338947f9da167d1f419d6ab2d469f Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 23 Nov 2023 14:34:29 +0100 Subject: [PATCH 22/27] Update "allowed submission" part --- DOCUMENTATION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index 20ceffd19..adfb7b6ff 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -255,7 +255,7 @@ Submissions are eligible for an untimed eval every `eval_period` seconds, run as The intention of this benchmark is to identify training algorithm submissions that will be broadly applicable and effective in practical scenarios without customization to the specific [workload](#workloads) (model, dataset, and loss function). Generally useful training algorithms can train models faster and thus require less compute resources, decreasing the cost of machine learning. We want to discourage all submissions that sidestep the purpose of this benchmark. -Unfortunately, we can't easily write rules that make it completely clear if a submission is circumventing the spirit of the benchmark in a way that would encompass all possible cases. We want to state clearly that we welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions, however, in some cases, routines that would be allowed in principle might not be practically feasible in the provided framework. +Unfortunately, we can't easily write rules that make it completely clear if a submission is circumventing the spirit of the benchmark in a way that would encompass all possible cases. We welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions. 
However, in some cases, routines that would be allowed in principle might not be practically feasible to express in the provided framework. In order to help clarify which submissions are [allowed](#allowed-submissions) and [disallowed](#disallowed-submissions), we described a few examples below. Two essential questions can help provide a general guideline for whether a submission is allowed or not: From de2b433c167967f21096ddfce6c5fea7dbbe9f37 Mon Sep 17 00:00:00 2001 From: Frank Date: Tue, 28 Nov 2023 07:18:55 +0100 Subject: [PATCH 23/27] Address George's comments --- DOCUMENTATION.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index adfb7b6ff..dac623b03 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -38,7 +38,7 @@ - [How can I know if my code can be run on benchmarking hardware?](#how-can-i-know-if-my-code-can-be-run-on-benchmarking-hardware) - [Are we allowed to use our own hardware to self-report the results?](#are-we-allowed-to-use-our-own-hardware-to-self-report-the-results) - [What can I do if running the benchmark is too expensive for me?](#what-can-i-do-if-running-the-benchmark-is-too-expensive-for-me) - - [Can I submit existing (i.e. published) training algorithms as submissions?](#can-i-submit-existing-ie-published-training-algorithms-as-submissions) + - [Can I submit existing (i.e. 
published) training algorithms as submissions?](#can-i-submit-previously-published-training-algorithms-as-submissions) - [Disclaimers](#disclaimers) - [Shared Data Pipelines between JAX and PyTorch](#shared-data-pipelines-between-jax-and-pytorch) - [Pytorch Conformer CUDA OOM](#pytorch-conformer-cuda-oom) @@ -253,10 +253,9 @@ Submissions are eligible for an untimed eval every `eval_period` seconds, run as #### Valid submissions -The intention of this benchmark is to identify training algorithm submissions that will be broadly applicable and effective in practical scenarios without customization to the specific [workload](#workloads) (model, dataset, and loss function). Generally useful training algorithms can train models faster and thus require less compute resources, decreasing the cost of machine learning. We want to discourage all submissions that sidestep the purpose of this benchmark. - -Unfortunately, we can't easily write rules that make it completely clear if a submission is circumventing the spirit of the benchmark in a way that would encompass all possible cases. We welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions. However, in some cases, routines that would be allowed in principle might not be practically feasible to express in the provided framework. +The intention of this benchmark is to identify training algorithm submissions that will be broadly applicable and effective in practical scenarios without customization to the specific [workload](#workloads) (model, dataset, and loss function). Generally useful training algorithms can train models faster and thus require less compute resources, decreasing the cost of machine learning. We want to discourage all submissions that sidestep the purpose of this benchmark. We welcome creative ideas and novel research. Therefore, the API aims to allow a wide variety of submissions. 
However, in some cases, routines that would be allowed in principle might not be practically feasible to express in the provided framework. +Submissions that specialize to the specific workloads in the benchmark and have not been implemented in a way that plausibly generalizes to novel workloads are prohibited. In order to help clarify which submissions are [allowed](#allowed-submissions) and [disallowed](#disallowed-submissions), we described a few examples below. Two essential questions can help provide a general guideline for whether a submission is allowed or not: 1. What **information** is being used by the submission? @@ -519,7 +518,7 @@ To ensure that all submitters can develop their submissions based on the same co #### My machine only has one GPU. How can I use this repo? You can run this repo on a machine with an arbitrary number of GPUs. However, the default batch sizes in our reference algorithms `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms` are tuned for a machine with 8 16GB V100 GPUs. You may run into OOMs if you run these algorithms with fewer than 8 GPUs. If you run into these issues because you are using a machine with less total GPU memory, please reduce the batch sizes for the submission. Note that your final submission must 'fit' on the benchmarking hardware, so if you are using fewer -GPUs with higher per GPU memory, please monitor your memory usage to make make sure it will fit on 8xV100 GPUs with 16GB of VRAM per card. +GPUs with higher per GPU memory, please monitor your memory usage to make sure it will fit on 8xV100 GPUs with 16GB of VRAM per card. #### How do I run this on my SLURM cluster? @@ -563,16 +562,17 @@ You only have to use the benchmarking hardware for runs that are directly involv Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/COMPETITION_RULES.md#qualification-set) that excludes some of the most expensive workloads. 
Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. -#### Can I submit existing (i.e. published) training algorithms as submissions? +#### Can I submit previously published training algorithms as submissions? -Yes you may, as long as it isn't an exact copy of an existing submission. -For example, you may submit the Adam optimizer with your hyperparameters or hyperparameter search spaces, as this constitues a different training algorithm. +Yes, you may, as long as it isn't an exact copy of an existing submission. +For example, you may submit the Adam optimizer with your particularly effective hyperparameter search space and hyperparameter configuration, as different choices for hyperparameter values and/or search spaces constitute different training algorithms and are potential sources of innovation. +That said, while submitting Adam with some novel heuristic to set various hyperparameters, some especially effective hyperparameter search space, or your single best hyperparameter configuration is fine, avoid making multiple submissions that only differ by their hyperparameter configuration without a convincing justification that they are substantially different (see ["Can I submit multiple times to the benchmark competition?"](#can-i-submit-multiple-times-to-the-benchmark-competition), above). ## Disclaimers ### Shared Data Pipelines between JAX and PyTorch -The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT workloads are using the same TensorFlow input pipelines. 
Due to differences in how JAX and PyTorch distribute computations across devices, the PyTorch workloads have an additional overhead for these workloads. +The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT workloads use the same TensorFlow input pipelines. Due to differences in how JAX and PyTorch distribute computations across devices, the PyTorch workloads have an additional overhead for these workloads. Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details. While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example. 
From 06102a12859866f9b5d3a262d3b704c95facf3db Mon Sep 17 00:00:00 2001 From: Frank Date: Tue, 28 Nov 2023 07:19:26 +0100 Subject: [PATCH 24/27] Increment version number --- DOCUMENTATION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index dac623b03..ae380af34 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -1,6 +1,6 @@ # MLCommons™ AlgoPerf: Technical Documentation & FAQs -**Version:** 0.0.19 *(Last updated November 14, 2023)* +**Version:** 0.0.20 *(Last updated November 28, 2023)* > **TL;DR** New training algorithms and models can make neural net training faster. > We need a rigorous training time benchmark that measures time to result given a fixed hardware configuration and stimulates algorithmic progress. We propose a *Training Algorithm Track* and a *Model Track* in order to help disentangle optimizer improvements and model architecture improvements. This two-track structure lets us enforce a requirement that new optimizers work well on multiple models and that new models aren't highly specific to particular training hacks. The following is the technical documentation for the Training Algorithm Track. From 4472cd5f13ba41de5061bb99e00b4c6933d0d017 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 28 Nov 2023 18:13:44 +0000 Subject: [PATCH 25/27] add entry to changelog. remove conformer pytorch warning --- CHANGELOG.md | 2 +- DOCUMENTATION.md | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8c3db0e6..4ff1cc068 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ # Change Log -## [0.1.0] - 2023-11-21 +## algoperf-benchmark-0.1.0 (2023-11-28) First release of the AlgoPerf: Training algorithms benchmarking code. 
diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index ae380af34..de7a3b7f8 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -577,6 +577,3 @@ The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details. While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example. -### Pytorch Conformer CUDA OOM - -The Conformer PyTorch workload may run out of memory in the current state. Please set the `submission_runner.py` flag `reduce_pytorch_max_split_size` to `True` as a temporary workaround if you encounter this issue. This will set `max_split_size_mb:256`. Note that this will adversely impact the performance of the submission on this workload. See [tracking issue](https://github.com/mlcommons/algorithmic-efficiency/issues/497). 
From 157fead93a98100af9e68d073b290cd0da8d8c64 Mon Sep 17 00:00:00 2001 From: cssastry Date: Thu, 30 Nov 2023 14:51:43 -0400 Subject: [PATCH 26/27] style changes --- scoring/scoring_utils.py | 1 - scoring/test_scoring_utils.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 3768d0fbb..8252c75a9 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -9,7 +9,6 @@ import algorithmic_efficiency.workloads.workloads as workloads_registry - TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' TRIAL_DIR_REGEX = 'trial_(\d+)' diff --git a/scoring/test_scoring_utils.py b/scoring/test_scoring_utils.py index 34365dcde..e6c1d7c63 100644 --- a/scoring/test_scoring_utils.py +++ b/scoring/test_scoring_utils.py @@ -41,5 +41,6 @@ def test_scores(self): scale='linear', verbosity=0) + if __name__ == '__main__': absltest.main() From c07e0e7061bc19314aaedeace8b4acf0326375c4 Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Thu, 7 Dec 2023 08:31:50 +0000 Subject: [PATCH 27/27] fixes --- .../criteo1tb/criteo1tb_pytorch/models.py | 16 ++++++++-------- algorithmic_efficiency/workloads/workloads.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/models.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/models.py index 4110b30fc..7a40f0e81 100644 --- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/models.py +++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/models.py @@ -72,13 +72,13 @@ def __init__(self, # Ideally, we should use the pooled embedding implementation from # `TorchRec`. However, in order to have identical implementation # with that of Jax, we define a single embedding matrix. 
- num_chucks = 4 - assert vocab_size % num_chucks == 0 + num_chunks = 4 + assert vocab_size % num_chunks == 0 self.embedding_table_chucks = [] scale = 1.0 / torch.sqrt(self.vocab_size) - for i in range(num_chucks): + for i in range(num_chunks): chunk = nn.Parameter( - torch.Tensor(self.vocab_size // num_chucks, self.embed_dim)) + torch.Tensor(self.vocab_size // num_chunks, self.embed_dim)) chunk.data.uniform_(0, 1) chunk.data = scale * chunk.data self.register_parameter(f'embedding_chunk_{i}', chunk) @@ -194,8 +194,8 @@ def __init__(self, # Ideally, we should use the pooled embedding implementation from # `TorchRec`. However, in order to have identical implementation # with that of Jax, we define a single embedding matrix. - num_chucks = 4 - assert vocab_size % num_chucks == 0 + num_chunks = 4 + assert vocab_size % num_chunks == 0 self.embedding_table_chucks = [] if self.embedding_init_multiplier is None: @@ -203,9 +203,9 @@ def __init__(self, else: scale = self.embedding_init_multiplier - for i in range(num_chucks): + for i in range(num_chunks): chunk = nn.Parameter( - torch.Tensor(self.vocab_size // num_chucks, self.embed_dim)) + torch.Tensor(self.vocab_size // num_chunks, self.embed_dim)) chunk.data.uniform_(0, 1) chunk.data = scale * chunk.data self.register_parameter(f'embedding_chunk_{i}', chunk) diff --git a/algorithmic_efficiency/workloads/workloads.py b/algorithmic_efficiency/workloads/workloads.py index 8891a6e18..fdd350156 100644 --- a/algorithmic_efficiency/workloads/workloads.py +++ b/algorithmic_efficiency/workloads/workloads.py @@ -64,7 +64,7 @@ BASE_WORKLOADS = [ 'criteo1tb', 'fastmri', - ' imagenet_resnet', + 'imagenet_resnet', 'imagenet_vit', 'librispeech_conformer', 'librispeech_deepspeech',