diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 37db73dd4..90a4e8867 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -1,3 +1,4 @@ +import glob import json import os import re @@ -9,6 +10,7 @@ METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' TRIAL_DIR_REGEX = 'trial_(\d+)' MEASUREMENTS_FILENAME = 'eval_measurements.csv' +TIMESTAMP = r"-\d{4}(-\d{2}){5}" #### File IO helper functions ### @@ -137,7 +139,9 @@ def get_experiment_df(experiment_dir): scoring.compute_performance_profiles. Args: experiment_dir: path to experiment directory containing - results for workloads. + results for workloads. Measurements from experiments + sharing the same prefix but different timestamps are + collected together. The directory structure is assumed to be: + experiment_dir + @@ -148,38 +152,42 @@ def get_experiment_df(experiment_dir): df: DataFrame where indices are trials, columns are metric names and values are lists. e.g - +----+-----------+---------+--------------------+--------------------+ - | | workload | trial | validation/accuracy| score | - |----+-----------+---------+--------------------+--------------------| - | 0 | mnist_jax | trial_1 | [0.0911, 0.0949] | [10.6396, 10.6464] | - +----+-----------+---------+--------------------+--------------------+ + +----+-----------+-----------------------------+--------------------+--------------------+ + | | workload | trial | validation/accuracy| score | + |----+-----------+-----------------------------+--------------------+--------------------| + | 0 | mnist_jax | (trial_1, <experiment_dir>) | [0.0911, 0.0949] | [10.6396, 10.6464] | + +----+-----------+-----------------------------+--------------------+--------------------+ """ df = pd.DataFrame() - workload_dirs = os.listdir(experiment_dir) - for workload in workload_dirs: - data = { - 'workload': workload, - } - trial_dirs = [ - t for t in os.listdir(os.path.join(experiment_dir, workload)) - if re.match(TRIAL_DIR_REGEX, t) - ] - for trial in trial_dirs: - 
eval_measurements_filepath = os.path.join( - experiment_dir, - workload, - trial, - MEASUREMENTS_FILENAME, - ) - try: - trial_df = pd.read_csv(eval_measurements_filepath) - except FileNotFoundError as e: - logging.info(f'Could not read {eval_measurements_filepath}') - continue - data['trial'] = trial - for column in trial_df.columns: - values = trial_df[column].to_numpy() - data[column] = values - trial_df = pd.DataFrame([data]) - df = pd.concat([df, trial_df], ignore_index=True) + paths = filter( + lambda x: re.match(experiment_dir + TIMESTAMP, x) or x == experiment_dir, + glob.glob(f"{experiment_dir}*")) + for experiment_dir in list(paths): + workload_dirs = os.listdir(experiment_dir) + for workload in workload_dirs: + data = { + 'workload': workload, + } + trial_dirs = [ + t for t in os.listdir(os.path.join(experiment_dir, workload)) + if re.match(TRIAL_DIR_REGEX, t) + ] + for trial in trial_dirs: + eval_measurements_filepath = os.path.join( + experiment_dir, + workload, + trial, + MEASUREMENTS_FILENAME, + ) + try: + trial_df = pd.read_csv(eval_measurements_filepath) + except FileNotFoundError as e: + logging.info(f'Could not read {eval_measurements_filepath}') + continue + data['trial'] = (trial, experiment_dir) + for column in trial_df.columns: + values = trial_df[column].to_numpy() + data[column] = values + trial_df = pd.DataFrame([data]) + df = pd.concat([df, trial_df], ignore_index=True) return df