From dad42a2aa71c303665454a2d747e53e5e692f9c7 Mon Sep 17 00:00:00 2001
From: Haider Al-Tahan
Date: Mon, 2 Sep 2024 12:18:52 -0400
Subject: [PATCH] Update 0.3.0:

- Updated README.md
- Fixed #4
- Fixed saving aggregate
---
 README.md             | 28 ++++++++++++++-------
 setup.py              |  2 +-
 unibench/__init__.py  |  2 +-
 unibench/evaluator.py | 58 +++++++++++++++++++++++++++++++++++++------
 unibench/output.py    | 44 +++++++++++++++++++++-----------
 5 files changed, 100 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index fdbaf37..a221977 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ This repository is designed to simplify the evaluation process of vision-languag
 Install the package:
 
 ```
-pip install git+https://github.com/facebookresearch/unibench.git
+pip install unibench -U
 ```
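The install change above can be sanity-checked against the version bump that ships in this patch; a minimal sketch, assuming the upgrade completed cleanly:

```python
# Quick check that the upgraded package is the one on the import path.
# __version__ is defined in unibench/__init__.py and is bumped to "0.3.0" below.
import unibench

print(unibench.__version__)  # expected: 0.3.0
```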
@@ -73,17 +73,27 @@ evaluator.evaluate()
 
 ```console
 Args:
-    num_workers (int): Number of CPU cores to use to load data. Default to 80.
-    models (list or str): The models to evaluate. Defaults to "all".
-    benchmarks (list or str): The benchmarks to evaluate. Defaults to "all".
-    model_id (int): The index of the specific model to evaluate. Defaults to None.
-    dataset_id (int): The index of the specific dataset to evaluate. Defaults to None.
-    model_types (str): The types of models to evaluate. Defaults to "all".
-    dataset_types (str): The types of benchmarks to evaluate. Defaults to "all".
     save_freq (int): The frequency at which to save results. Defaults to 1000.
     face_blur (bool): Whether to use face blurring during evaluation. Defaults to False.
     device (str): The device to use for evaluation. Defaults to "cuda" if available otherwise "cpu".
-    batch_per_gpu (int): Evaluation batch size per gpu. Defaults to 32.
+    batch_per_gpu (int): Evaluation batch size per GPU. Defaults to 32.
+```
+
+
+The `Evaluator` class takes the following arguments:
+
+```console
+Args:
+    seed (int): Random seed for reproducibility.
+    num_workers (int): Number of workers for data loading.
+    models (Union[List[str], str]): List of models to evaluate or "all" to evaluate all available models.
+    benchmarks (Union[List[str], str]): List of benchmarks to evaluate or "all" to evaluate all available benchmarks.
+    model_id (Union[int, None]): Specific model ID to evaluate.
+    benchmark_id (Union[int, None]): Specific benchmark ID to evaluate.
+    output_dir (str): Directory to save evaluation results.
+    benchmarks_dir (str): Directory containing benchmark data.
+    download_aggregate_precomputed (bool): Whether to download aggregate precomputed results.
+    download_all_precomputed (bool): Whether to download all precomputed results.
 ```
 
 ### Example
diff --git a/setup.py b/setup.py
index 4f33a73..66b5c64 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ setuptools.setup(
     name="unibench",
-    version="0.2.0",
+    version="0.3.0",
     author="Haider Al-Tahan",
     author_email="haideraltahan@meta.com",
     description="This repository is designed to simplify the evaluation process of vision-language models. It provides a comprehensive set of tools and scripts for evaluating VLM models and benchmarks.",
diff --git a/unibench/__init__.py b/unibench/__init__.py
index 7dd972f..583bdfd 100644
--- a/unibench/__init__.py
+++ b/unibench/__init__.py
@@ -4,7 +4,7 @@
 This source code is licensed under the license found in the
 LICENSE file in the root directory of this source tree.
 """
-__version__ = "0.2.0"
+__version__ = "0.3.0"
 __author__ = "Haider Al-Tahan"
 
 from .evaluator import Evaluator
\ No newline at end of file
diff --git a/unibench/evaluator.py b/unibench/evaluator.py
index 1c5cfd1..3bf8483 100644
--- a/unibench/evaluator.py
+++ b/unibench/evaluator.py
@@ -10,7 +10,6 @@
 
 import fire
 import torch
-from tqdm.auto import tqdm
 import pandas as pd
 from rich.progress import Progress
 from torch.utils.data import Dataset
@@ -39,6 +38,55 @@
 
 
 class Evaluator(object):
+    """
+    The Evaluator class is responsible for evaluating machine learning models on various benchmarks.
+    It provides methods to update the list of models and benchmarks, download benchmarks, add new models and benchmarks,
+    generate aggregate results, and evaluate models.
+
+    Attributes:
+        seed (int): Random seed for reproducibility.
+        num_workers (int): Number of workers for data loading.
+        models (Union[List[str], str]): List of models to evaluate or "all" to evaluate all available models.
+        benchmarks (Union[List[str], str]): List of benchmarks to evaluate or "all" to evaluate all available benchmarks.
+        model_id (Union[int, None]): Specific model ID to evaluate.
+        benchmark_id (Union[int, None]): Specific benchmark ID to evaluate.
+        output_dir (str): Directory to save evaluation results.
+        benchmarks_dir (str): Directory containing benchmark data.
+        download_aggregate_precomputed (bool): Whether to download aggregate precomputed results. Smaller and faster to load; sufficient for aggregate-level analysis.
+        download_all_precomputed (bool): Whether to download all precomputed results. Larger and slower to load, but enables comprehensive analysis.
+
+    Methods:
+        update_benchmark_list(benchmarks, benchmark_id=None):
+            Updates the list of benchmarks to evaluate.
+
+        update_model_list(models, model_id=None):
+            Updates the list of models to evaluate.
+
+        download_benchmarks():
+            Downloads the specified benchmarks.
+
+        list_models():
+            Lists all available models.
+
+        add_benchmark(benchmark, handler, meta_data={}):
+            Adds a new benchmark to the list of benchmarks.
+
+        generate_aggregate_results():
+            Generates aggregate results from the evaluation.
+
+        list_benchmarks():
+            Lists all available benchmarks.
+
+        add_model(model, meta_data={}):
+            Adds a new model to the list of models.
+
+        show_results():
+            Displays the evaluation results.
+
+        evaluate(save_freq=1000, face_blur=False, device="cuda" if torch.cuda.is_available() else "cpu", batch_per_gpu=32):
+            Evaluates the models on the benchmarks and saves the results.
+    """
+
     def __init__(
         self,
         seed: int = 1337,
@@ -189,17 +237,10 @@ def evaluate(
         Evaluate models on benchmarks and return and saving the results.
 
         Args:
-            models (list or str): The models to evaluate. Defaults to "all".
-            benchmarks (list or str): The benchmarks to evaluate. Defaults to "all".
-            model_id (int): The index of the specific model to evaluate. Defaults to None.
-            benchmark_id (int): The index of the specific benchmark to evaluate. Defaults to None.
-            model_types (str): The types of models to evaluate. Defaults to "all".
-            benchmark_types (str): The types of benchmarks to evaluate. Defaults to "all".
             save_freq (int): The frequency at which to save results. Defaults to 1000.
             face_blur (bool): Whether to use face blurring during evaluation. Defaults to False.
             device (str): The device to use for evaluation. Defaults to "cuda" if available otherwise "cpu".
             batch_per_gpu (int): The batch size per GPU. Defaults to 32.
-            use_data_parallel (bool): Whether to use data parallelism. Defaults to torch.cuda.device_count() > 1.
 
         Returns:
             query results: The results of the query for the specified benchmarks and models.
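Taken together, the constructor arguments documented in the class docstring and the trimmed `evaluate()` signature suggest a usage pattern along these lines; this is an illustrative sketch using the documented defaults, not a prescribed configuration:

```python
# Illustrative sketch based on the docstring above: selection of models and
# benchmarks happens in the Evaluator constructor, while evaluate() only
# controls how the run is executed. Values shown are the documented defaults.
import torch

from unibench import Evaluator

evaluator = Evaluator(
    seed=1337,                            # random seed for reproducibility
    models="all",                         # or an explicit list of model names
    benchmarks="all",                     # or an explicit list of benchmark names
    download_aggregate_precomputed=True,  # small download, enough for aggregate analysis
)

evaluator.evaluate(
    save_freq=1000,
    face_blur=False,
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_per_gpu=32,
)

evaluator.show_results()
```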
@@ -301,6 +342,7 @@ def evaluate(
                     progress.update(pg_benchmark, advance=1)
                 progress.update(pg_benchmark, visible=False)
                 self.outputhandler.save_csv(model_name, benchmark_name)
+                self.outputhandler.save_aggregate_results(model_name, benchmark_name)
                 progress.update(pg_benchmarks, advance=1)
             progress.update(pg_models, advance=1)
 
diff --git a/unibench/output.py b/unibench/output.py
index 9bdb0e3..4597743 100644
--- a/unibench/output.py
+++ b/unibench/output.py
@@ -44,6 +44,7 @@ def reset_local_csv(self):
         self._local_csv = pd.DataFrame()
 
     def check_if_computed(self, model_name, benchmark_name, **kwargs):
+        self.load_aggregate_results()
         res = self.query(
             df=self._aggregate,
             **{"model_name": model_name, "benchmark_name": benchmark_name}
@@ -78,7 +79,6 @@ def load_all_csv(self, model_name, benchmark_name):
             model_folder = self.output_dir.joinpath(model)
             for benchmark in benchmark_name:
                 file = model_folder.joinpath(benchmark + ".f")
-                print("Loading file: ", file)
                 if file.exists():
                     try:
                         dfs.append(pd.read_feather(file))
@@ -128,19 +128,15 @@ def query(self, df=None, **kwargs):
         if len(kwargs) == 0:
             return df
 
-        def create_compare(k, v):
-            return k + "=='" + v + "'" if isinstance(v, str) else k + "==" + str(v)
+        mask = pd.Series([True] * len(df))
 
-        expr = ""
-        for i, (k, v) in enumerate(kwargs.items()):
+        for k, v in kwargs.items():
             if isinstance(v, list):
-                expr += "(" + " or ".join(create_compare(k, v_) for v_ in v) + ")"
+                mask &= df[k].isin(v)
             else:
-                expr += create_compare(k, v)
-            if i < len(kwargs.items()) - 1:
-                expr += " and "
+                mask &= (df[k] == v)
 
-        return df.query(expr)
+        return df[mask]
 
     def delete_rows(self, model_name, benchmark_name, **kwargs):
         # file_name = str(OUTPUT_DIR.joinpath(model_name + ".f"))
@@ -172,19 +168,37 @@ def _get_benchmark_mappings(self, axis):
 
     @lockutils.synchronized(name="aggregate", external=True, fair=True)
     def load_aggregate_results(self):
-        self._aggregate = pd.read_feather(self.output_dir.joinpath("aggregate.f"))
+        file = self.output_dir.joinpath("aggregate.f")
+        if file.exists():
+            self._aggregate = pd.read_feather(file)
 
     @lockutils.synchronized(name="aggregate", external=True, fair=True)
-    def generate_aggregate_results(self):
+    def save_aggregate_results(self, model_name, benchmark_name):
+        file_dir = self.output_dir.joinpath("aggregate.f")
+        if file_dir.exists():
+            self._aggregate = pd.read_feather(file_dir)
+
+        df = self.query(
+            self._model_csv,
+            **{"model_name": [model_name], "benchmark_name": [benchmark_name]}
+        )
+
         df = (
-            self._model_csv.groupby(["model_name", "benchmark_name"])["correctness"]
+            df.groupby(["model_name", "benchmark_name"])["correctness"]
             .mean()
             .reset_index()
         )
-        df.to_feather(self.output_dir.joinpath("aggregate.f"))
+        df = (
+            pd.concat([self._aggregate, df])
+            .drop_duplicates(subset=["model_name", "benchmark_name"], keep="last")
+            .reset_index(drop=True)
+        )
+
+        df.to_feather(file_dir)
 
     def print_dataframe(self, **kwargs):
+        self.load_aggregate_results()
         df = self.query(df=self._aggregate, **kwargs)
         benchmark_mappings = self._get_benchmark_mappings("benchmark_type")
         df["benchmark_type"] = df["benchmark_name"].map(benchmark_mappings)
@@ -230,4 +244,4 @@ def save_csv(self, model_name, benchmark_name):
 
         # Save the model csv
         self._model_csv.to_feather(file_name)
-        self.reset_local_csv()
+        self.reset_local_csv()
\ No newline at end of file
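For reference, the rewritten `query()` above replaces string-built `DataFrame.query()` expressions with a boolean mask: scalar keyword arguments become equality tests and list arguments become `isin()` membership tests. Below is a standalone sketch of the same idea on hypothetical data (the model and benchmark names are made up), using an index-aligned mask so it also behaves for frames without the default `RangeIndex`:

```python
# Standalone illustration of the mask-based filtering introduced in output.py.
import pandas as pd

# Hypothetical results frame; only the column names mirror the real output.
df = pd.DataFrame(
    {
        "model_name": ["model_a", "model_a", "model_b"],
        "benchmark_name": ["benchmark_x", "benchmark_y", "benchmark_x"],
        "correctness": [0.63, 0.51, 0.58],
    }
)

def query(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    if not kwargs:
        return df
    # Building the mask on df.index keeps every comparison aligned with df,
    # even when the frame does not carry a default RangeIndex.
    mask = pd.Series(True, index=df.index)
    for key, value in kwargs.items():
        if isinstance(value, list):
            mask &= df[key].isin(value)   # list -> membership test
        else:
            mask &= df[key] == value      # scalar -> equality test
    return df[mask]

# Keep rows for model_a on benchmark_x only.
print(query(df, model_name=["model_a"], benchmark_name="benchmark_x"))
```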