diff --git a/scoring/scoring.py b/scoring/performance_profile.py
similarity index 91%
rename from scoring/scoring.py
rename to scoring/performance_profile.py
index dba254233..e62e8e18e 100644
--- a/scoring/scoring.py
+++ b/scoring/performance_profile.py
@@ -25,17 +25,18 @@ The keys in this dictionary should match the workload identifiers used in
 the dictionary of submissions.
 """
-
 import itertools
 import operator
 import os
 import re
 
+from absl import logging
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
 import algorithmic_efficiency.workloads.workloads as workloads_registry
+from scoring import scoring_utils
 
 WORKLOADS = workloads_registry.WORKLOADS
 WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
@@ -153,7 +154,8 @@ def get_index_that_reaches_target(workload_df,
 def get_times_for_submission(submission,
                              submission_tag,
                              time_col='global_step',
-                             verbosity=1):
+                             verbosity=1,
+                             self_tuning_ruleset=False):
   """Get times to target for each workload in a submission.
 
   Args:
@@ -168,25 +170,16 @@ def get_times_for_submission(submission,
   """
   workloads = []
   submission_name = submission_tag.split('.')[1]
-
+  num_workloads = len(submission.groupby('workload'))
+  if num_workloads != NUM_WORKLOADS:
+    logging.warning(f'Expecting {NUM_WORKLOADS} workloads '
+                    f'but found {num_workloads} workloads.')
   for workload, group in submission.groupby('workload'):
-    workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
-    framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
-    workload_metadata = WORKLOADS[workload_name]
-
-    # Extend path according to framework.
-    workload_metadata['workload_path'] = os.path.join(
-        BASE_WORKLOADS_DIR,
-        workload_metadata['workload_path'] + f'{framework}',
-        'workload.py')
-    workload_init_kwargs = {}
-    workload_obj = workloads_registry.import_workload(
-        workload_path=workload_metadata['workload_path'],
-        workload_class_name=workload_metadata['workload_class_name'],
-        workload_init_kwargs=workload_init_kwargs)
-    metric_name = workload_obj.target_metric_name
-    validation_metric = f'validation/{metric_name}'
-    validation_target = workload_obj.validation_target_value
+    num_trials = len(group)
+    if num_trials != NUM_TRIALS and not self_tuning_ruleset:
+      logging.warning(f'Expecting {NUM_TRIALS} trials for workload '
+                      f'{workload} but found {num_trials} trials.')
+    validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload)
 
     trial_idx, time_idx = get_index_that_reaches_target(
         group, validation_metric, validation_target)
@@ -250,21 +243,22 @@ def compute_performance_profiles(results,
   dfs = []
 
   for submission_tag, result in results.items():
-    print(f'\nComputing performance profile with respect to `{time_col}` for '
-          f'{submission_tag}')
+    logging.info(
+        f'\nComputing performance profile with respect to `{time_col}` for '
+        f'{submission_tag}')
     dfs.append(
         get_times_for_submission(result, submission_tag, time_col, verbosity))
   df = pd.concat(dfs)
 
   if verbosity > 0:
-    print(f'\n`{time_col}` to reach target:')
+    logging.info(f'\n`{time_col}` to reach target:')
     with pd.option_context('display.max_rows',
                            None,
                            'display.max_columns',
                            None,
                            'display.width',
                            1000):
-      print(df)
+      logging.info(df)
 
   # Divide by the fastest.
  if reference_submission_tag is None:
@@ -273,14 +267,14 @@ def compute_performance_profiles(results,
     df.update(df.div(df.loc[reference_submission_tag, :], axis=1))
 
   if verbosity > 0:
-    print(f'\n`{time_col}` to reach target normalized to best:')
+    logging.info(f'\n`{time_col}` to reach target normalized to best:')
     with pd.option_context('display.max_rows',
                            None,
                            'display.max_columns',
                            None,
                            'display.width',
                            1000):
-      print(df)
+      logging.info(df)
 
   # If no max_tau is supplied, choose the value of tau that would plot all non
   # inf or nan data.
diff --git a/scoring/score_submission.py b/scoring/score_submission.py
index e8a6ac010..0dd84ff55 100644
--- a/scoring/score_submission.py
+++ b/scoring/score_submission.py
@@ -1,11 +1,15 @@
+import operator
 import os
 
 from absl import app
 from absl import flags
 from absl import logging
+import numpy as np
+import pandas as pd
 import scoring_utils
+from tabulate import tabulate
 
-from scoring import scoring
+from scoring import performance_profile
 
 flags.DEFINE_string(
     'experiment_path',
@@ -15,29 +19,70 @@
 flags.DEFINE_string('output_dir',
                     'scoring_results',
                     'Path to save performance profile table and plot.')
+flags.DEFINE_boolean('compute_performance_profiles',
+                     False,
+                     'Whether or not to compute the performance profiles.')
 FLAGS = flags.FLAGS
 
 
+def get_summary_df(workload, workload_df):
+  validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload)
+  is_minimized = performance_profile.check_if_minimized(validation_metric)
+  target_op = operator.le if is_minimized else operator.ge
+  best_op = min if is_minimized else max
+  idx_op = np.argmin if is_minimized else np.argmax
+
+  summary_df = pd.DataFrame()
+  summary_df['workload'] = workload_df['workload']
+  summary_df['trial'] = workload_df['trial']
+  summary_df['target metric name'] = validation_metric
+  summary_df['target metric value'] = validation_target
+
+  summary_df['target reached'] = workload_df[validation_metric].apply(
+      lambda x: target_op(x, validation_target)).apply(np.any)
+  summary_df['best target'] = workload_df[validation_metric].apply(
+      lambda x: best_op(x))
+  workload_df['index best eval'] = workload_df[validation_metric].apply(
+      lambda x: idx_op(x))
+  summary_df['submission time'] = workload_df.apply(
+      lambda x: x['accumulated_submission_time'][x['index best eval']], axis=1)
+  summary_df['score'] = summary_df.apply(
+      lambda x: x['submission time'] if x['target reached'] else np.inf, axis=1)
+
+  return summary_df
+
+
 def main(_):
   df = scoring_utils.get_experiment_df(FLAGS.experiment_path)
   results = {
       FLAGS.submission_tag: df,
   }
-  performance_profile_df = scoring.compute_performance_profiles(
-      results,
-      time_col='score',
-      min_tau=1.0,
-      max_tau=None,
-      reference_submission_tag=None,
-      num_points=100,
-      scale='linear',
-      verbosity=0)
-  if not os.path.exists(FLAGS.output_dir):
-    os.mkdir(FLAGS.output_dir)
-  scoring.plot_performance_profiles(
-      performance_profile_df, 'score', save_dir=FLAGS.output_dir)
-
-  logging.info(performance_profile_df)
+
+  dfs = []
+  for workload, group in df.groupby('workload'):
+    summary_df = get_summary_df(workload, group)
+    dfs.append(summary_df)
+
+  df = pd.concat(dfs)
+  logging.info(tabulate(df, headers='keys', tablefmt='psql'))
+
+  if FLAGS.compute_performance_profiles:
+    performance_profile_df = performance_profile.compute_performance_profiles(
+        results,
+        time_col='score',
+        min_tau=1.0,
+        max_tau=None,
+        reference_submission_tag=None,
+        num_points=100,
+        scale='linear',
+        verbosity=0)
+    if not os.path.exists(FLAGS.output_dir):
+      os.mkdir(FLAGS.output_dir)
+    performance_profile.plot_performance_profiles(
+        performance_profile_df, 'score', save_dir=FLAGS.output_dir)
+    perf_df = tabulate(
+        performance_profile_df.T, headers='keys', tablefmt='psql')
+    logging.info(f'Performance profile:\n {perf_df}')
 
 
 if __name__ == '__main__':
diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py
index 1a15db2f5..d10617896 100644
--- a/scoring/scoring_utils.py
+++ b/scoring/scoring_utils.py
@@ -1,19 +1,22 @@
+import copy
 import json
 import os
 import re
-import warnings
 
 from absl import logging
 import pandas as pd
 
-from scoring.scoring import NUM_TRIALS
-from scoring.scoring import NUM_WORKLOADS
+import algorithmic_efficiency.workloads.workloads as workloads_registry
 
 TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---'
 METRICS_LINE_REGEX = '(.*) Metrics: ({.*})'
 TRIAL_DIR_REGEX = 'trial_(\d+)'
 MEASUREMENTS_FILENAME = 'eval_measurements.csv'
 
+WORKLOADS = workloads_registry.WORKLOADS
+WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
+BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/'
+
 
 #### File IO helper functions ###
 def get_logfile_paths(logdir):
@@ -137,7 +140,7 @@ def get_trials_df(logfile):
 def get_experiment_df(experiment_dir):
   """Gets a df of per trial results from an experiment dir.
   The output df can be provided as input to
-  scoring.compute_performance_profiles.
+  performance_profile.compute_performance_profiles.
   Args:
     experiment_dir: path to experiment directory containing results for
     workloads.
@@ -160,9 +163,6 @@ def get_experiment_df(experiment_dir):
   df = pd.DataFrame()
   workload_dirs = os.listdir(experiment_dir)
   num_workloads = len(workload_dirs)
-  if num_workloads != NUM_WORKLOADS:
-    warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are '
-                  f'{num_workloads}.')
   for workload in workload_dirs:
     data = {
         'workload': workload,
@@ -190,9 +190,28 @@ def get_experiment_df(experiment_dir):
         data[column] = values
       trial_df = pd.DataFrame([data])
       workload_df = pd.concat([workload_df, trial_df], ignore_index=True)
-    num_trials = len(workload_df)
-    if num_trials != NUM_TRIALS:
-      warnings.warn(f'There should be {NUM_TRIALS} trials for workload '
-                    f'{workload} but there are only {num_trials}.')
     df = pd.concat([df, workload_df], ignore_index=True)
   return df
+
+
+## Get workload properties
+def get_workload_validation_target(workload):
+  """Returns workload target metric name and value."""
+  workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
+  framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
+  workload_metadata = copy.copy(WORKLOADS[workload_name])
+
+  # Extend path according to framework.
+  workload_metadata['workload_path'] = os.path.join(
+      BASE_WORKLOADS_DIR,
+      workload_metadata['workload_path'] + f'{framework}',
+      'workload.py')
+  workload_init_kwargs = {}
+  workload_obj = workloads_registry.import_workload(
+      workload_path=workload_metadata['workload_path'],
+      workload_class_name=workload_metadata['workload_class_name'],
+      workload_init_kwargs=workload_init_kwargs)
+  metric_name = workload_obj.target_metric_name
+  validation_metric = f'validation/{metric_name}'
+  validation_target = workload_obj.validation_target_value
+  return validation_metric, validation_target
diff --git a/setup.cfg b/setup.cfg
index a7ce5ebb2..9aa4ffb5f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,6 +45,7 @@ install_requires =
   psutil==5.9.5
   clu==0.0.7
   matplotlib>=3.7.2
+  tabulate==0.9.0
 python_requires = >=3.8
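
For orientation, a minimal usage sketch of the refactored pieces (not part of the diff). It mirrors what score_submission.py does when the new flag is set; the experiment path, submission tag, output directory, and workload name are placeholders, and the imports assume the same layout score_submission.py relies on (repository root on the path, with scoring/ importable as the script directory).

# Hypothetical driver; all paths and names below are placeholders.
import os

import scoring_utils
from scoring import performance_profile

# Per-trial results for every workload found under the experiment directory.
df = scoring_utils.get_experiment_df('experiments/my_submission')

# New helper: resolve a workload's validation metric name and target value.
# The workload name is a registry key plus a framework suffix.
metric, target = scoring_utils.get_workload_validation_target('mnist_jax')

# Performance profiles, as computed when --compute_performance_profiles is set.
# Submission tags are expected to be dotted ('<study>.<submission>'), since
# get_times_for_submission splits the tag on '.'.
profile_df = performance_profile.compute_performance_profiles(
    {'my.submission': df},
    time_col='score',
    min_tau=1.0,
    max_tau=None,
    reference_submission_tag=None,
    num_points=100,
    scale='linear',
    verbosity=0)
os.makedirs('scoring_results', exist_ok=True)
performance_profile.plot_performance_profiles(
    profile_df, 'score', save_dir='scoring_results')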