Merge pull request #565 from mlcommons/raw_scores
Add score table to scoring script with raw scores
priyakasimbeg authored Nov 10, 2023
2 parents b8afd87 + dca6a26 commit fd090dc
Showing 4 changed files with 112 additions and 53 deletions.
46 changes: 20 additions & 26 deletions scoring/scoring.py → scoring/performance_profile.py
@@ -25,17 +25,18 @@
The keys in this dictionary should match the workload identifiers used in
the dictionary of submissions.
"""

import itertools
import operator
import os
import re

from absl import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import algorithmic_efficiency.workloads.workloads as workloads_registry
from scoring import scoring_utils

WORKLOADS = workloads_registry.WORKLOADS
WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
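The module docstring above refers to a dictionary of submissions. Judging from its use in score_submission.py below, that input is assumed to look like the following (the tag and dataframe name are illustrative, not the repository's exact values):

# Hypothetical shape of the `results` argument; keys are submission tags of
# the form '<study>.<submission_name>' (get_times_for_submission splits on
# '.'), values are per-trial dataframes from scoring_utils.get_experiment_df.
results = {
    'study_0.my_submission': per_trial_df,
}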
@@ -153,7 +154,8 @@ def get_times_for_submission(
def get_times_for_submission(submission,
submission_tag,
time_col='global_step',
verbosity=1):
verbosity=1,
self_tuning_ruleset=False):
"""Get times to target for each workload in a submission.
Args:
@@ -168,25 +170,16 @@
"""
workloads = []
submission_name = submission_tag.split('.')[1]

num_workloads = len(submission.groupby('workload'))
if num_workloads != NUM_WORKLOADS:
logging.warning(f'Expecting {NUM_WORKLOADS} workloads '
f'but found {num_workloads} workloads.')
for workload, group in submission.groupby('workload'):
workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
workload_metadata = WORKLOADS[workload_name]

# Extend path according to framework.
workload_metadata['workload_path'] = os.path.join(
BASE_WORKLOADS_DIR,
workload_metadata['workload_path'] + f'{framework}',
'workload.py')
workload_init_kwargs = {}
workload_obj = workloads_registry.import_workload(
workload_path=workload_metadata['workload_path'],
workload_class_name=workload_metadata['workload_class_name'],
workload_init_kwargs=workload_init_kwargs)
metric_name = workload_obj.target_metric_name
validation_metric = f'validation/{metric_name}'
validation_target = workload_obj.validation_target_value
num_trials = len(group)
if num_trials != NUM_TRIALS and not self_tuning_ruleset:
logging.warning(f'Expecting {NUM_TRIALS} trials for workload '
f'{workload} but found {num_trials} trials.')
validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload)

trial_idx, time_idx = get_index_that_reaches_target(
group, validation_metric, validation_target)
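The call above relies on get_index_that_reaches_target, whose body sits in the collapsed lines. For context, a minimal sketch of its assumed contract (the name suffix, the minimization assumption, and the (-1, -1) sentinel are assumptions, not the repository's exact code):

import numpy as np

def get_index_that_reaches_target_sketch(workload_df, metric_col, target):
  # workload_df holds one row per trial; metric_col holds an array of eval
  # values per trial. Return the first trial and eval index meeting the
  # target, or (-1, -1) if no trial ever reaches it.
  for trial_idx, evals in enumerate(workload_df[metric_col]):
    reached = np.asarray(evals) <= target  # assumes a minimized metric
    if reached.any():
      return trial_idx, int(np.argmax(reached))
  return -1, -1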
@@ -250,21 +243,22 @@ def compute_performance_profiles(results,
dfs = []

for submission_tag, result in results.items():
print(f'\nComputing performance profile with respect to `{time_col}` for '
f'{submission_tag}')
logging.info(
f'\nComputing performance profile with respect to `{time_col}` for '
f'{submission_tag}')
dfs.append(
get_times_for_submission(result, submission_tag, time_col, verbosity))
df = pd.concat(dfs)

if verbosity > 0:
print(f'\n`{time_col}` to reach target:')
logging.info(f'\n`{time_col}` to reach target:')
with pd.option_context('display.max_rows',
None,
'display.max_columns',
None,
'display.width',
1000):
print(df)
logging.info(df)

# Divide by the fastest.
if reference_submission_tag is None:
@@ -273,14 +267,14 @@
df.update(df.div(df.loc[reference_submission_tag, :], axis=1))

if verbosity > 0:
print(f'\n`{time_col}` to reach target normalized to best:')
logging.info(f'\n`{time_col}` to reach target normalized to best:')
with pd.option_context('display.max_rows',
None,
'display.max_columns',
None,
'display.width',
1000):
print(df)
logging.info(df)

# If no max_tau is supplied, choose the value of tau that would plot all
# non-inf, non-NaN data.
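For readers new to performance profiles: once each submission's per-workload times are divided by the fastest submission's time (the "Divide by the fastest" step above), the profile value at a ratio tau is the fraction of workloads solved within a factor tau of the best. A minimal sketch of that final step, under the assumption that normalized_df is the dataframe built above (illustrative, not the file's exact code):

import pandas as pd

def performance_profile_sketch(normalized_df, taus):
  # normalized_df: one row per submission, one column per workload; entries
  # are time-to-target divided by the fastest time (>= 1.0, or inf/NaN when
  # the target was never reached, which never counts as solved).
  profile = {}
  for submission, ratios in normalized_df.iterrows():
    profile[submission] = [(ratios <= tau).mean() for tau in taus]
  return pd.DataFrame(profile, index=taus).T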
77 changes: 61 additions & 16 deletions scoring/score_submission.py
@@ -1,11 +1,15 @@
import operator
import os

from absl import app
from absl import flags
from absl import logging
import numpy as np
import pandas as pd
import scoring_utils
from tabulate import tabulate

from scoring import scoring
from scoring import performance_profile

flags.DEFINE_string(
'experiment_path',
@@ -15,29 +19,70 @@
flags.DEFINE_string('output_dir',
'scoring_results',
'Path to save performance profile table and plot.')
flags.DEFINE_boolean('compute_performance_profiles',
False,
'Whether or not to compute the performance profiles.')
FLAGS = flags.FLAGS


def get_summary_df(workload, workload_df):
validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload)
is_minimized = performance_profile.check_if_minimized(validation_metric)
target_op = operator.le if is_minimized else operator.ge
best_op = min if is_minimized else max
idx_op = np.argmin if is_minimized else np.argmax

summary_df = pd.DataFrame()
summary_df['workload'] = workload_df['workload']
summary_df['trial'] = workload_df['trial']
summary_df['target metric name'] = validation_metric
summary_df['target metric value'] = validation_target

summary_df['target reached'] = workload_df[validation_metric].apply(
lambda x: target_op(x, validation_target)).apply(np.any)
summary_df['best target'] = workload_df[validation_metric].apply(
lambda x: best_op(x))
workload_df['index best eval'] = workload_df[validation_metric].apply(
lambda x: idx_op(x))
summary_df['submission time'] = workload_df.apply(
lambda x: x['accumulated_submission_time'][x['index best eval']], axis=1)
summary_df['score'] = summary_df.apply(
lambda x: x['submission time'] if x['target reached'] else np.inf, axis=1)

return summary_df


def main(_):
df = scoring_utils.get_experiment_df(FLAGS.experiment_path)
results = {
FLAGS.submission_tag: df,
}
performance_profile_df = scoring.compute_performance_profiles(
results,
time_col='score',
min_tau=1.0,
max_tau=None,
reference_submission_tag=None,
num_points=100,
scale='linear',
verbosity=0)
if not os.path.exists(FLAGS.output_dir):
os.mkdir(FLAGS.output_dir)
scoring.plot_performance_profiles(
performance_profile_df, 'score', save_dir=FLAGS.output_dir)

logging.info(performance_profile_df)

dfs = []
for workload, group in df.groupby('workload'):
summary_df = get_summary_df(workload, group)
dfs.append(summary_df)

df = pd.concat(dfs)
logging.info(tabulate(df, headers='keys', tablefmt='psql'))

if FLAGS.compute_performance_profiles:
performance_profile_df = performance_profile.compute_performance_profiles(
results,
time_col='score',
min_tau=1.0,
max_tau=None,
reference_submission_tag=None,
num_points=100,
scale='linear',
verbosity=0)
if not os.path.exists(FLAGS.output_dir):
os.mkdir(FLAGS.output_dir)
performance_profile.plot_performance_profiles(
performance_profile_df, 'score', save_dir=FLAGS.output_dir)
perf_df = tabulate(
performance_profile_df.T, headers='keys', tablefmt='psql')
logging.info(f'Performance profile:\n {perf_df}')


if __name__ == '__main__':
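A hypothetical invocation of the updated script (the experiment path is illustrative, and the submission_tag flag is defined in the collapsed lines above). The per-workload summary table is always printed; with --compute_performance_profiles the profile table and plot are also written to --output_dir. A trial's 'score' column is its time to target, or inf if the target was never reached:

python3 scoring/score_submission.py \
  --experiment_path=/path/to/experiment_dir \
  --output_dir=scoring_results \
  --compute_performance_profiles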
41 changes: 30 additions & 11 deletions scoring/scoring_utils.py
@@ -1,19 +1,22 @@
import copy
import json
import os
import re
import warnings

from absl import logging
import pandas as pd

from scoring.scoring import NUM_TRIALS
from scoring.scoring import NUM_WORKLOADS
import algorithmic_efficiency.workloads.workloads as workloads_registry

TRIAL_LINE_REGEX = r'(.*) --- Tuning run (\d+)/(\d+) ---'
METRICS_LINE_REGEX = r'(.*) Metrics: ({.*})'
TRIAL_DIR_REGEX = r'trial_(\d+)'
MEASUREMENTS_FILENAME = 'eval_measurements.csv'

WORKLOADS = workloads_registry.WORKLOADS
WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/'


### File IO helper functions ###
def get_logfile_paths(logdir):
@@ -137,7 +140,7 @@ def get_trials_df(logfile):
def get_experiment_df(experiment_dir):
"""Gets a df of per trial results from an experiment dir.
The output df can be provided as input to
scoring.compute_performance_profiles.
performance_profile.compute_performance_profiles.
Args:
experiment_dir: path to experiment directory containing
results for workloads.
@@ -160,9 +163,6 @@
df = pd.DataFrame()
workload_dirs = os.listdir(experiment_dir)
num_workloads = len(workload_dirs)
if num_workloads != NUM_WORKLOADS:
warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are '
f'{num_workloads}.')
for workload in workload_dirs:
data = {
'workload': workload,
@@ -190,9 +190,28 @@
data[column] = values
trial_df = pd.DataFrame([data])
workload_df = pd.concat([workload_df, trial_df], ignore_index=True)
num_trials = len(workload_df)
if num_trials != NUM_TRIALS:
warnings.warn(f'There should be {NUM_TRIALS} trials for workload '
f'{workload} but there are {num_trials}.')
df = pd.concat([df, workload_df], ignore_index=True)
return df


## Get workload properties
def get_workload_validation_target(workload):
"""Returns workload target metric name and value."""
workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
workload_metadata = copy.copy(WORKLOADS[workload_name])

# Extend path according to framework.
workload_metadata['workload_path'] = os.path.join(
BASE_WORKLOADS_DIR,
workload_metadata['workload_path'] + f'{framework}',
'workload.py')
workload_init_kwargs = {}
workload_obj = workloads_registry.import_workload(
workload_path=workload_metadata['workload_path'],
workload_class_name=workload_metadata['workload_class_name'],
workload_init_kwargs=workload_init_kwargs)
metric_name = workload_obj.target_metric_name
validation_metric = f'validation/{metric_name}'
validation_target = workload_obj.validation_target_value
return validation_metric, validation_target
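Hypothetical usage of the new helper, assuming a workload column value such as 'mnist_jax' (the metric name and target come from the imported workload class, so the comments below are assumptions rather than guaranteed values):

validation_metric, validation_target = get_workload_validation_target('mnist_jax')
# validation_metric would be 'validation/accuracy' if the workload class's
# target_metric_name is 'accuracy'; validation_target is the class's
# validation_target_value.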
1 change: 1 addition & 0 deletions setup.cfg
@@ -45,6 +45,7 @@ install_requires =
psutil==5.9.5
clu==0.0.7
matplotlib>=3.7.2
tabulate==0.9.0
python_requires = >=3.8


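The new tabulate dependency renders the score tables above. A minimal standalone example of the 'psql' format used in score_submission.py (the rows are made-up illustration data):

from tabulate import tabulate

rows = [['mnist_jax', 1, True], ['cifar_jax', 2, False]]  # illustrative only
print(tabulate(rows, headers=['workload', 'trial', 'target reached'],
               tablefmt='psql'))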
