From dad42a2aa71c303665454a2d747e53e5e692f9c7 Mon Sep 17 00:00:00 2001
From: Haider Al-Tahan
Date: Mon, 2 Sep 2024 12:18:52 -0400
Subject: [PATCH] Update 0.3.0:

- Updated README.md
- Fixed #4
- Fixed saving aggregate
---
 README.md             | 28 ++++++++++++++-------
 setup.py              |  2 +-
 unibench/__init__.py  |  2 +-
 unibench/evaluator.py | 58 +++++++++++++++++++++++++++++++++++++------
 unibench/output.py    | 44 +++++++++++++++++++++-----------
 5 files changed, 100 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index fdbaf37..a221977 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ This repository is designed to simplify the evaluation process of vision-languag
 Install the package:
 
 ```
-pip install git+https://github.com/facebookresearch/unibench.git
+pip install unibench -U
 ```
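The install change above can be sanity-checked against the version bump that ships in this patch; a minimal sketch, assuming the upgrade completed cleanly:

```python
# Quick check that the upgraded package is the one on the import path.
# __version__ is defined in unibench/__init__.py and is bumped to "0.3.0" below.
import unibench

print(unibench.__version__)  # expected: 0.3.0
```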
@@ -73,17 +73,27 @@ evaluator.evaluate()
 
 ```console
 Args:
-    num_workers (int): Number of CPU cores to use to load data. Default to 80.
-    models (list or str): The models to evaluate. Defaults to "all".
-    benchmarks (list or str): The benchmarks to evaluate. Defaults to "all".
-    model_id (int): The index of the specific model to evaluate. Defaults to None.
-    dataset_id (int): The index of the specific dataset to evaluate. Defaults to None.
-    model_types (str): The types of models to evaluate. Defaults to "all".
-    dataset_types (str): The types of benchmarks to evaluate. Defaults to "all".
     save_freq (int): The frequency at which to save results. Defaults to 1000.
     face_blur (bool): Whether to use face blurring during evaluation. Defaults to False.
     device (str): The device to use for evaluation. Defaults to "cuda" if available otherwise "cpu".
-    batch_per_gpu (int): Evaluation batch size per gpu. Defaults to 32.
+    batch_per_gpu (int): Evaluation batch size per GPU. Defaults to 32.
+```
+
+
+The `Evaluator` class takes the following arguments:
+
+```console
+Args:
+    seed (int): Random seed for reproducibility.
+    num_workers (int): Number of workers for data loading.
+    models (Union[List[str], str]): List of models to evaluate or "all" to evaluate all available models.
+    benchmarks (Union[List[str], str]): List of benchmarks to evaluate or "all" to evaluate all available benchmarks.
+    model_id (Union[int, None]): Specific model ID to evaluate.
+    benchmark_id (Union[int, None]): Specific benchmark ID to evaluate.
+    output_dir (str): Directory to save evaluation results.
+    benchmarks_dir (str): Directory containing benchmark data.
+    download_aggregate_precomputed (bool): Whether to download aggregate precomputed results.
+    download_all_precomputed (bool): Whether to download all precomputed results.
 ```
 
 ### Example
diff --git a/setup.py b/setup.py
index 4f33a73..66b5c64 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ setuptools.setup(
     name="unibench",
-    version="0.2.0",
+    version="0.3.0",
     author="Haider Al-Tahan",
     author_email="haideraltahan@meta.com",
     description="This repository is designed to simplify the evaluation process of vision-language models. It provides a comprehensive set of tools and scripts for evaluating VLM models and benchmarks.",
diff --git a/unibench/__init__.py b/unibench/__init__.py
index 7dd972f..583bdfd 100644
--- a/unibench/__init__.py
+++ b/unibench/__init__.py
@@ -4,7 +4,7 @@
 This source code is licensed under the license found in the
 LICENSE file in the root directory of this source tree.
 """
-__version__ = "0.2.0"
+__version__ = "0.3.0"
 __author__ = "Haider Al-Tahan"
 
 from .evaluator import Evaluator
\ No newline at end of file
diff --git a/unibench/evaluator.py b/unibench/evaluator.py
index 1c5cfd1..3bf8483 100644
--- a/unibench/evaluator.py
+++ b/unibench/evaluator.py
@@ -10,7 +10,6 @@
 
 import fire
 import torch
-from tqdm.auto import tqdm
 import pandas as pd
 from rich.progress import Progress
 from torch.utils.data import Dataset
@@ -39,6 +38,55 @@
 
 
 class Evaluator(object):
+    """
+    The Evaluator class is responsible for evaluating machine learning models on various benchmarks.
+    It provides methods to update the list of models and benchmarks, download benchmarks, add new models and benchmarks,
+    generate aggregate results, and evaluate models.
+
+    Attributes:
+        seed (int): Random seed for reproducibility.
+        num_workers (int): Number of workers for data loading.
+        models (Union[List[str], str]): List of models to evaluate or "all" to evaluate all available models.
+        benchmarks (Union[List[str], str]): List of benchmarks to evaluate or "all" to evaluate all available benchmarks.
+        model_id (Union[int, None]): Specific model ID to evaluate.
+        benchmark_id (Union[int, None]): Specific benchmark ID to evaluate.
+        output_dir (str): Directory to save evaluation results.
+        benchmarks_dir (str): Directory containing benchmark data.
+        download_aggregate_precomputed (bool): Whether to download aggregate precomputed results. Smaller and faster to load; sufficient for aggregate-level analysis.
+        download_all_precomputed (bool): Whether to download all precomputed results. Larger and slower to load, but enables comprehensive analysis.
+
+    Methods:
+        update_benchmark_list(benchmarks, benchmark_id=None):
+            Updates the list of benchmarks to evaluate.
+
+        update_model_list(models, model_id=None):
+            Updates the list of models to evaluate.
+
+        download_benchmarks():
+            Downloads the specified benchmarks.
+
+        list_models():
+            Lists all available models.
+
+        add_benchmark(benchmark, handler, meta_data={}):
+            Adds a new benchmark to the list of benchmarks.
+
+        generate_aggregate_results():
+            Generates aggregate results from the evaluation.
+
+        list_benchmarks():
+            Lists all available benchmarks.
+
+        add_model(model, meta_data={}):
+            Adds a new model to the list of models.
+
+        show_results():
+            Displays the evaluation results.
+
+        evaluate(save_freq=1000, face_blur=False, device="cuda" if torch.cuda.is_available() else "cpu", batch_per_gpu=32):
+            Evaluates the models on the benchmarks and saves the results.
+    """
+
     def __init__(
         self,
         seed: int = 1337,
@@ -189,17 +237,10 @@ def evaluate(
         Evaluate models on benchmarks and return and saving the results.
 
         Args:
-            models (list or str): The models to evaluate. Defaults to "all".
-            benchmarks (list or str): The benchmarks to evaluate. Defaults to "all".
-            model_id (int): The index of the specific model to evaluate. Defaults to None.
-            benchmark_id (int): The index of the specific benchmark to evaluate. Defaults to None.
-            model_types (str): The types of models to evaluate. Defaults to "all".
-            benchmark_types (str): The types of benchmarks to evaluate. Defaults to "all".
             save_freq (int): The frequency at which to save results. Defaults to 1000.
             face_blur (bool): Whether to use face blurring during evaluation. Defaults to False.
             device (str): The device to use for evaluation. Defaults to "cuda" if available otherwise "cpu".
             batch_per_gpu (int): The batch size per GPU. Defaults to 32.
-            use_data_parallel (bool): Whether to use data parallelism. Defaults to torch.cuda.device_count() > 1.
 
         Returns:
             query results: The results of the query for the specified benchmarks and models.
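Taken together, the constructor arguments documented in the class docstring and the trimmed `evaluate()` signature suggest a usage pattern along these lines; this is an illustrative sketch using the documented defaults, not a prescribed configuration:

```python
# Illustrative sketch based on the docstring above: selection of models and
# benchmarks happens in the Evaluator constructor, while evaluate() only
# controls how the run is executed. Values shown are the documented defaults.
import torch

from unibench import Evaluator

evaluator = Evaluator(
    seed=1337,                            # random seed for reproducibility
    models="all",                         # or an explicit list of model names
    benchmarks="all",                     # or an explicit list of benchmark names
    download_aggregate_precomputed=True,  # small download, enough for aggregate analysis
)

evaluator.evaluate(
    save_freq=1000,
    face_blur=False,
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_per_gpu=32,
)

evaluator.show_results()
```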
@@ -301,6 +342,7 @@ def evaluate(
                     progress.update(pg_benchmark, advance=1)
                 progress.update(pg_benchmark, visible=False)
                 self.outputhandler.save_csv(model_name, benchmark_name)
+                self.outputhandler.save_aggregate_results(model_name, benchmark_name)
                 progress.update(pg_benchmarks, advance=1)
             progress.update(pg_models, advance=1)
 
diff --git a/unibench/output.py b/unibench/output.py
index 9bdb0e3..4597743 100644
--- a/unibench/output.py
+++ b/unibench/output.py
@@ -44,6 +44,7 @@ def reset_local_csv(self):
         self._local_csv = pd.DataFrame()
 
     def check_if_computed(self, model_name, benchmark_name, **kwargs):
+        self.load_aggregate_results()
         res = self.query(
             df=self._aggregate,
             **{"model_name": model_name, "benchmark_name": benchmark_name}
@@ -78,7 +79,6 @@ def load_all_csv(self, model_name, benchmark_name):
             model_folder = self.output_dir.joinpath(model)
             for benchmark in benchmark_name:
                 file = model_folder.joinpath(benchmark + ".f")
-                print("Loading file: ", file)
                 if file.exists():
                     try:
                         dfs.append(pd.read_feather(file))
@@ -128,19 +128,15 @@ def query(self, df=None, **kwargs):
         if len(kwargs) == 0:
             return df
 
-        def create_compare(k, v):
-            return k + "=='" + v + "'" if isinstance(v, str) else k + "==" + str(v)
+        mask = pd.Series([True] * len(df))
 
-        expr = ""
-        for i, (k, v) in enumerate(kwargs.items()):
+        for k, v in kwargs.items():
             if isinstance(v, list):
-                expr += "(" + " or ".join(create_compare(k, v_) for v_ in v) + ")"
+                mask &= df[k].isin(v)
             else:
-                expr += create_compare(k, v)
-            if i < len(kwargs.items()) - 1:
-                expr += " and "
+                mask &= (df[k] == v)
 
-        return df.query(expr)
+        return df[mask]
 
     def delete_rows(self, model_name, benchmark_name, **kwargs):
         # file_name = str(OUTPUT_DIR.joinpath(model_name + ".f"))
@@ -172,19 +168,37 @@ def _get_benchmark_mappings(self, axis):
 
     @lockutils.synchronized(name="aggregate", external=True, fair=True)
     def load_aggregate_results(self):
-        self._aggregate = pd.read_feather(self.output_dir.joinpath("aggregate.f"))
+        file = self.output_dir.joinpath("aggregate.f")
+        if file.exists():
+            self._aggregate = pd.read_feather(file)
 
     @lockutils.synchronized(name="aggregate", external=True, fair=True)
-    def generate_aggregate_results(self):
+    def save_aggregate_results(self, model_name, benchmark_name):
+        file_dir = self.output_dir.joinpath("aggregate.f")
+        if file_dir.exists():
+            self._aggregate = pd.read_feather(file_dir)
+
+        df = self.query(
+            self._model_csv,
+            **{"model_name": [model_name], "benchmark_name": [benchmark_name]}
+        )
+
         df = (
-            self._model_csv.groupby(["model_name", "benchmark_name"])["correctness"]
+            df.groupby(["model_name", "benchmark_name"])["correctness"]
             .mean()
             .reset_index()
         )
-        df.to_feather(self.output_dir.joinpath("aggregate.f"))
+        df = (
+            pd.concat([self._aggregate, df])
+            .drop_duplicates(subset=["model_name", "benchmark_name"], keep="last")
+            .reset_index(drop=True)
+        )
+
+        df.to_feather(file_dir)
 
     def print_dataframe(self, **kwargs):
+        self.load_aggregate_results()
         df = self.query(df=self._aggregate, **kwargs)
         benchmark_mappings = self._get_benchmark_mappings("benchmark_type")
         df["benchmark_type"] = df["benchmark_name"].map(benchmark_mappings)
@@ -230,4 +244,4 @@ def save_csv(self, model_name, benchmark_name):
 
         # Save the model csv
         self._model_csv.to_feather(file_name)
-        self.reset_local_csv()
+        self.reset_local_csv()
\ No newline at end of file
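For reference, the rewritten `query()` above replaces string-built `DataFrame.query()` expressions with a boolean mask: scalar keyword arguments become equality tests and list arguments become `isin()` membership tests. Below is a standalone sketch of the same idea on hypothetical data (the model and benchmark names are made up), using an index-aligned mask so it also behaves for frames without the default `RangeIndex`:

```python
# Standalone illustration of the mask-based filtering introduced in output.py.
import pandas as pd

# Hypothetical results frame; only the column names mirror the real output.
df = pd.DataFrame(
    {
        "model_name": ["model_a", "model_a", "model_b"],
        "benchmark_name": ["benchmark_x", "benchmark_y", "benchmark_x"],
        "correctness": [0.63, 0.51, 0.58],
    }
)

def query(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    if not kwargs:
        return df
    # Building the mask on df.index keeps every comparison aligned with df,
    # even when the frame does not carry a default RangeIndex.
    mask = pd.Series(True, index=df.index)
    for key, value in kwargs.items():
        if isinstance(value, list):
            mask &= df[key].isin(value)   # list -> membership test
        else:
            mask &= df[key] == value      # scalar -> equality test
    return df[mask]

# Keep rows for model_a on benchmark_x only.
print(query(df, model_name=["model_a"], benchmark_name="benchmark_x"))
```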