diff --git a/scoring/compute_speedups.py b/scoring/compute_speedups.py
new file mode 100644
index 000000000..5fb5f259d
--- /dev/null
+++ b/scoring/compute_speedups.py
@@ -0,0 +1,112 @@
+"""Compute speedups, i.e. geometric means of per-workload runtime ratios."""
+
+import pickle
+
+from absl import app
+from absl import flags
+import numpy as np
+import pandas as pd
+from performance_profile import BASE_WORKLOADS
+from performance_profile import get_workloads_time_to_target
+from scipy import stats
+
+flags.DEFINE_string('results_txt', None, 'Path to full scoring results file.')
+flags.DEFINE_string(
+    'base',
+    'prize_qualification_baseline',
+    'Base submission to compare to. Defaults to the `prize_qualification_baseline`.'
+)
+flags.DEFINE_string('comparison', None, 'Submission to compute the speedup of.')
+flags.DEFINE_boolean('self_tuning_ruleset',
+                     False,
+                     'Whether the self-tuning ruleset is being scored.')
+flags.DEFINE_boolean('save_results',
+                     False,
+                     'Whether to save the results to disk.')
+FLAGS = flags.FLAGS
+
+MAX_BUDGETS = {
+    'criteo1tb': 7703,
+    'fastmri': 8859,
+    'imagenet_resnet': 63_008,
+    'imagenet_vit': 77_520,
+    'librispeech_conformer': 61_068,
+    'librispeech_deepspeech': 55_506,
+    'ogbg': 18_477,
+    'wmt': 48_151,
+}
+
+
+def replace_inf(row):
+  """Replace infs with maximum runtime budget (+1 second).
+
+  Args:
+      row (pd.Series): The original row.
+
+  Returns:
+      pd.Series: The row with infs replaced.
+  """
+  workload_name = row.name
+  # Factor of 3 for self-tuning ruleset
+  factor = 3 if FLAGS.self_tuning_ruleset else 1
+  max_runtime_workload = factor * MAX_BUDGETS[workload_name]
+  row.replace(np.inf, max_runtime_workload + 1, inplace=True)
+  return row
+
+
+def compute_speedup():
+  """Compute speedup between two algorithms."""
+  # Load results from disk
+  with open(FLAGS.results_txt, 'rb') as f:
+    results = pickle.load(f)
+
+  # Compute median over runtimes for both training algorithms
+  base_results = get_workloads_time_to_target(
+      results[FLAGS.base],
+      FLAGS.base,
+      time_col="score",
+      self_tuning_ruleset=FLAGS.self_tuning_ruleset,
+  )
+  comparison_results = get_workloads_time_to_target(
+      results[FLAGS.comparison],
+      FLAGS.comparison,
+      time_col="score",
+      self_tuning_ruleset=FLAGS.self_tuning_ruleset,
+  )
+
+  # Merge results
+  merged_results = pd.concat([base_results, comparison_results]).transpose()
+
+  # Ignore workload variants (only consider base workloads) for speedup
+  merged_results = merged_results.loc[merged_results.index.isin(BASE_WORKLOADS)]
+
+  # Replace infs with maximum runtime budget (+1 second)
+  merged_results = merged_results.apply(replace_inf, axis=1)
+
+  # Compute speedup
+  merged_results['speedup'] = merged_results[
+      f'{FLAGS.comparison}'] / merged_results[f'{FLAGS.base}']
+  speedups = merged_results['speedup'].to_numpy()
+  mean_speedup = stats.gmean(speedups)  # Geometric mean over workload speedups
+
+  print(merged_results, end='\n\n')
+  print(
+      f"Average speedup of {FLAGS.comparison} compared to {FLAGS.base}: {mean_speedup} or roughly {(1-mean_speedup):.1%}"
+  )
+
+  if FLAGS.save_results:
+    # Optionally save results to disk
+    print("Saving results to disk...")
+    filename = f'{FLAGS.comparison}_vs_{FLAGS.base}_speedup_{(1-mean_speedup):.1%}.csv'
+    merged_results.to_csv(filename)
+
+
+def main(_):
+  """Main function to compute speedup between two algorithms."""
+  compute_speedup()
+
+
+if __name__ == '__main__':
+  flags.mark_flag_as_required('results_txt')
+  flags.mark_flag_as_required('comparison')
+  app.run(main)
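For context, the new script boils down to two steps: cap non-finite runtimes at the workload budget plus one second, then take the geometric mean of the per-workload runtime ratios. A minimal, self-contained sketch of that calculation with made-up runtimes follows; the DataFrame, workload subset, and numbers are illustrative only and not part of this patch (real runtimes come from get_workloads_time_to_target):

# Illustrative sketch only: toy per-workload runtimes (seconds) for two
# hypothetical submissions.
import numpy as np
import pandas as pd
from scipy import stats

budgets = {'fastmri': 8859, 'ogbg': 18_477, 'wmt': 48_151}
runtimes = pd.DataFrame(
    {
        'baseline': [4000.0, 9000.0, np.inf],  # inf: target never reached
        'candidate': [2500.0, 7000.0, 30_000.0],
    },
    index=['fastmri', 'ogbg', 'wmt'])

# Cap non-finite runtimes at the workload's budget + 1 second.
for workload, budget in budgets.items():
  runtimes.loc[workload] = runtimes.loc[workload].replace(np.inf, budget + 1)

# Geometric mean of runtime ratios; a value below 1 means the candidate is
# faster on average across workloads.
print(stats.gmean(runtimes['candidate'] / runtimes['baseline']))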
diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py
index 372684fe2..b77d69d05 100644
--- a/scoring/performance_profile.py
+++ b/scoring/performance_profile.py
@@ -307,6 +307,14 @@ def compute_performance_profiles(submissions,
                                      strict))
   df = pd.concat(dfs)
 
+  # For each held-out workload, set its score to inf if the base workload is inf.
+  for workload in df.keys():
+    if workload not in BASE_WORKLOADS:
+      # If the base workload has no finite score, set the variant score to inf.
+      base_workload = get_base_workload_name(workload)
+      df[workload] = df.apply(
+          variant_criteria_filter(workload, base_workload), axis=1)
+
   # Set score to inf if not within 4x of fastest submission
   best_scores = df.min(axis=0)
   df[df.apply(lambda x: x > 4 * best_scores, axis=1)] = np.inf
diff --git a/scoring/score_submissions.py b/scoring/score_submissions.py
index 22b978fd7..02ad82fc0 100644
--- a/scoring/score_submissions.py
+++ b/scoring/score_submissions.py
@@ -198,7 +198,7 @@ def main(_):
         results,
         time_col='score',
         min_tau=1.0,
-        max_tau=None,
+        max_tau=4.0,
         reference_submission_tag=None,
         num_points=100,
         scale='linear',
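The new held-out-workload filter in performance_profile.py enforces that a variant only keeps a finite score when its base workload does. The repo's variant_criteria_filter and get_base_workload_name helpers implement this; the standalone sketch below, with hypothetical scores and a hypothetical variant name, just mimics the rule rather than calling those helpers:

# Illustrative sketch only: force a variant's score to inf whenever the
# corresponding base workload's score is inf (hypothetical data).
import numpy as np
import pandas as pd

scores = pd.DataFrame(
    {
        'fastmri': [8000.0, np.inf],
        'fastmri_variant': [7500.0, 9000.0],  # hypothetical held-out variant
    },
    index=['submission_a', 'submission_b'])

variant_to_base = {'fastmri_variant': 'fastmri'}
for variant, base in variant_to_base.items():
  scores.loc[~np.isfinite(scores[base]), variant] = np.inf

print(scores)  # submission_b's fastmri_variant score is now inf

The max_tau=4.0 change in score_submissions.py appears to align the performance-profile x-axis with the existing "within 4x of the fastest submission" cutoff shown above, though that rationale is an inference from the diff rather than stated in it.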