Skip to content

Commit

Permalink
Update 0.3.0:
Browse files Browse the repository at this point in the history
- Updated README.md
- Fixed #4
- Fixed saving aggregate
  • Loading branch information
haideraltahan committed Sep 2, 2024
1 parent 5416aca commit dad42a2
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 34 deletions.
28 changes: 19 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ This repository is designed to simplify the evaluation process of vision-languag

Install the package:
```
pip install git+https://github.com/facebookresearch/unibench.git
pip install unibench -U
```

<details >
Expand Down Expand Up @@ -73,17 +73,27 @@ evaluator.evaluate()

```console
Args:
num_workers (int): Number of CPU cores to use to load data. Default to 80.
models (list or str): The models to evaluate. Defaults to "all".
benchmarks (list or str): The benchmarks to evaluate. Defaults to "all".
model_id (int): The index of the specific model to evaluate. Defaults to None.
dataset_id (int): The index of the specific dataset to evaluate. Defaults to None.
model_types (str): The types of models to evaluate. Defaults to "all".
dataset_types (str): The types of benchmarks to evaluate. Defaults to "all".
save_freq (int): The frequency at which to save results. Defaults to 1000.
face_blur (bool): Whether to use face blurring during evaluation. Defaults to False.
device (str): The device to use for evaluation. Defaults to "cuda" if available otherwise "cpu".
batch_per_gpu (int): Evaluation batch size per gpu. Defaults to 32.
batch_per_gpu (int): Evaluation batch size per GPU. Defaults to 32.
```


The `Evaluator` class takes the following arguments:

```console
Args:
seed (int): Random seed for reproducibility.
num_workers (int): Number of workers for data loading.
models (Union[List[str], str]): List of models to evaluate or "all" to evaluate all available models.
benchmarks (Union[List[str], str]): List of benchmarks to evaluate or "all" to evaluate all available benchmarks.
model_id (Union[int, None]): Specific model ID to evaluate.
benchmark_id (Union[int, None]): Specific benchmark ID to evaluate.
output_dir (str): Directory to save evaluation results.
benchmarks_dir (str): Directory containing benchmark data.
download_aggregate_precomputed (bool): Whether to download aggregate precomputed results.
download_all_precomputed (bool): Whether to download all precomputed results.
```

### Example
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setuptools.setup(
name="unibench",
version="0.2.0",
version="0.3.0",
author="Haider Al-Tahan",
author_email="[email protected]",
description="This repository is designed to simplify the evaluation process of vision-language models. It provides a comprehensive set of tools and scripts for evaluating VLM models and benchmarks.",
Expand Down
2 changes: 1 addition & 1 deletion unibench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
__version__ = "0.2.0"
__version__ = "0.3.0"
__author__ = "Haider Al-Tahan"

from .evaluator import Evaluator
58 changes: 50 additions & 8 deletions unibench/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import fire
import torch
from tqdm.auto import tqdm
import pandas as pd
from rich.progress import Progress
from torch.utils.data import Dataset
Expand Down Expand Up @@ -39,6 +38,55 @@


class Evaluator(object):
"""
The Evaluator class is responsible for evaluating machine learning models on various benchmarks.
It provides methods to update the list of models and benchmarks, download benchmarks, add new models and benchmarks,
generate aggregate results, and evaluate models.
Attributes:
seed (int): Random seed for reproducibility.
num_workers (int): Number of workers for data loading.
models (Union[List[str], str]): List of models to evaluate or "all" to evaluate all available models.
benchmarks (Union[List[str], str]): List of benchmarks to evaluate or "all" to evaluate all available benchmarks.
model_id (Union[int, None]): Specific model ID to evaluate.
benchmark_id (Union[int, None]): Specific benchmark ID to evaluate.
output_dir (str): Directory to save evaluation results.
benchmarks_dir (str): Directory containing benchmark data.
download_aggregate_precomputed (bool): Whether to download aggregate precomputed results. Used for minor analysis and fast loading.
download_all_precomputed (bool): Whether to download all precomputed results. Used for slow loading and comprehensive analysis.
Methods:
update_benchmark_list(benchmarks, benchmark_id=None):
Updates the list of benchmarks to evaluate.
update_model_list(models, model_id=None):
Updates the list of models to evaluate.
download_benchmarks():
Downloads the specified benchmarks.
list_models():
Lists all available models.
add_benchmark(benchmark, handler, meta_data={}):
Adds a new benchmark to the list of benchmarks.
generate_aggregate_results():
Generates aggregate results from the evaluation.
list_benchmarks():
Lists all available benchmarks.
add_model(model, meta_data={}):
Adds a new model to the list of models.
show_results():
Displays the evaluation results.
evaluate(save_freq=1000, face_blur=False, device="cuda" if torch.cuda.is_available() else "cpu", batch_per_gpu=32):
Evaluates the models on the benchmarks and saves the results.
"""

def __init__(
self,
seed: int = 1337,
Expand Down Expand Up @@ -189,17 +237,10 @@ def evaluate(
Evaluate models on benchmarks, then save and return the results.
Args:
models (list or str): The models to evaluate. Defaults to "all".
benchmarks (list or str): The benchmarks to evaluate. Defaults to "all".
model_id (int): The index of the specific model to evaluate. Defaults to None.
benchmark_id (int): The index of the specific benchmark to evaluate. Defaults to None.
model_types (str): The types of models to evaluate. Defaults to "all".
benchmark_types (str): The types of benchmarks to evaluate. Defaults to "all".
save_freq (int): The frequency at which to save results. Defaults to 1000.
face_blur (bool): Whether to use face blurring during evaluation. Defaults to False.
device (str): The device to use for evaluation. Defaults to "cuda" if available otherwise "cpu".
batch_per_gpu (int): The batch size per GPU. Defaults to 32.
use_data_parallel (bool): Whether to use data parallelism. Defaults to torch.cuda.device_count() > 1.
Returns:
query results: The results of the query for the specified benchmarks and models.
Expand Down Expand Up @@ -301,6 +342,7 @@ def evaluate(
progress.update(pg_benchmark, advance=1)
progress.update(pg_benchmark, visible=False)
self.outputhandler.save_csv(model_name, benchmark_name)
self.outputhandler.save_aggregate_results(model_name, benchmark_name)
progress.update(pg_benchmarks, advance=1)
progress.update(pg_models, advance=1)

Expand Down
44 changes: 29 additions & 15 deletions unibench/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def reset_local_csv(self):
self._local_csv = pd.DataFrame()

def check_if_computed(self, model_name, benchmark_name, **kwargs):
self.load_aggregate_results()
res = self.query(
df=self._aggregate,
**{"model_name": model_name, "benchmark_name": benchmark_name}
Expand Down Expand Up @@ -78,7 +79,6 @@ def load_all_csv(self, model_name, benchmark_name):
model_folder = self.output_dir.joinpath(model)
for benchmark in benchmark_name:
file = model_folder.joinpath(benchmark + ".f")
print("Loading file: ", file)
if file.exists():
try:
dfs.append(pd.read_feather(file))
Expand Down Expand Up @@ -128,19 +128,15 @@ def query(self, df=None, **kwargs):
if len(kwargs) == 0:
return df

def create_compare(k, v):
return k + "=='" + v + "'" if isinstance(v, str) else k + "==" + str(v)
mask = pd.Series([True] * len(df))

expr = ""
for i, (k, v) in enumerate(kwargs.items()):
for k, v in kwargs.items():
if isinstance(v, list):
expr += "(" + " or ".join(create_compare(k, v_) for v_ in v) + ")"
mask &= df[k].isin(v)
else:
expr += create_compare(k, v)
if i < len(kwargs.items()) - 1:
expr += " and "
mask &= (df[k] == v)

return df.query(expr)
return df[mask]

def delete_rows(self, model_name, benchmark_name, **kwargs):
# file_name = str(OUTPUT_DIR.joinpath(model_name + ".f"))
Expand Down Expand Up @@ -172,19 +168,37 @@ def _get_benchmark_mappings(self, axis):

@lockutils.synchronized(name="aggregate", external=True, fair=True)
def load_aggregate_results(self):
self._aggregate = pd.read_feather(self.output_dir.joinpath("aggregate.f"))
file = self.output_dir.joinpath("aggregate.f")
if file.exists():
self._aggregate = pd.read_feather(file)

@lockutils.synchronized(name="aggregate", external=True, fair=True)
def generate_aggregate_results(self):
def save_aggregate_results(self, model_name, benchmark_name):
file_dir = self.output_dir.joinpath("aggregate.f")
if file_dir.exists():
self._aggregate = pd.read_feather(file_dir)

df = self.query(
self._model_csv,
**{"model_name": [model_name], "benchmark_name": [benchmark_name]}
)

df = (
self._model_csv.groupby(["model_name", "benchmark_name"])["correctness"]
df.groupby(["model_name", "benchmark_name"])["correctness"]
.mean()
.reset_index()
)

df.to_feather(self.output_dir.joinpath("aggregate.f"))
df = (
pd.concat([self._aggregate, df])
.drop_duplicates(subset=["model_name", "benchmark_name"], keep="last")
.reset_index(drop=True)
)

df.to_feather(file_dir)

def print_dataframe(self, **kwargs):
self.load_aggregate_results()
df = self.query(df=self._aggregate, **kwargs)
benchmark_mappings = self._get_benchmark_mappings("benchmark_type")
df["benchmark_type"] = df["benchmark_name"].map(benchmark_mappings)
Expand Down Expand Up @@ -230,4 +244,4 @@ def save_csv(self, model_name, benchmark_name):

# Save the model csv
self._model_csv.to_feather(file_name)
self.reset_local_csv()
self.reset_local_csv()

0 comments on commit dad42a2

Please sign in to comment.