
Commit

add example scripts to load the datasets
simon-ging committed Jul 8, 2024
1 parent 5e3ecb7 commit d551b50
Showing 7 changed files with 567 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -20,6 +20,8 @@ accepted to ICLR 2024 as Spotlight paper.

## News

July 2024: Add example `load_*.py` scripts to make it easier to use the proposed datasets.

May 2024: Upload slides and poster to prepare for the [ICLR poster session](https://iclr.cc/virtual/2024/poster/19102). Release paper v2 on arxiv with minor fixes.

February 2024: Code release.
@@ -93,6 +95,8 @@ You have the following options to change those paths (1 or 2 are the recommended way

### Setup datasets

After setting up a dataset, use the respective `load_*.py` script in the root directory to test it and inspect a few examples.
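For example, once ImageNet is set up, running `python load_imagenet_vqa_dataset.py` prints one example datapoint and one collated batch.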

#### Imagenet

Download validation images and devkit from the
66 changes: 66 additions & 0 deletions load_activitynet_vqa_dataset.py
@@ -0,0 +1,66 @@
"""
Example script showing how to load this dataset without depending on the entire framework.
"""

from pprint import pprint

from packg.paths import get_data_dir
from torch.utils.data import DataLoader

from ovqa.datasets.activitynet_vqa_dataset import ActivityNetVQADataset
from ovqa.processors import BlipImageEvalProcessor


def text_processor_noop(x):
return x


def main():
data_dir = get_data_dir()
activitynet_dir = data_dir / "activitynet"
vis_root = activitynet_dir / "frames_uncropped"
ann_paths = [
activitynet_dir / "activity_net.v1-3.min.json",
]
vis_processor = None # None will give a pillow image back

# select which question the model will be asked
question_type = "what-is-this" # "what-is-happening-image", "what-act-is-this"

# see ovqa/configs/datasets/activitynet.yaml
config = {
"question_type": question_type,
"class_name_key": "activity",
}
dataset = ActivityNetVQADataset(
vis_processor=vis_processor,
text_processor=text_processor_noop,
vis_root=vis_root,
ann_paths=ann_paths,
config=config,
)
datapoint = dataset[0]
pprint(datapoint)
print()

# in order to use a dataloader, we need to transform the images to tensors, so we can stack them
dataset.vis_processor = BlipImageEvalProcessor(
image_size=224, mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)
)
dataloader = DataLoader(
dataset,
shuffle=False,
num_workers=0,
batch_size=2,
collate_fn=dataset.collater,
)
for i, batch in enumerate(dataloader):
image_tensor = batch.pop("image")
print("image:", image_tensor.shape, image_tensor.dtype, image_tensor.device)
pprint(batch)
print()
break


if __name__ == "__main__":
main()
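
A note on the `collater` used above: since each datapoint is a dictionary, batching needs a collate function that stacks tensor fields and keeps the remaining fields (e.g. strings) as lists. Below is a minimal sketch with that assumed behavior; the dataset's own `collater` may differ in details.

import torch

def simple_collater(samples):
    """Sketch of a collate function: stack tensor fields, keep everything else as lists."""
    batch = {}
    for key in samples[0]:
        values = [sample[key] for sample in samples]
        if isinstance(values[0], torch.Tensor):
            batch[key] = torch.stack(values, dim=0)
        else:
            batch[key] = values
    return batch

# usage sketch: DataLoader(dataset, batch_size=2, collate_fn=simple_collater)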
142 changes: 142 additions & 0 deletions load_activitynet_vqa_dataset_followup.py
@@ -0,0 +1,142 @@
"""
Example script showing how to load this dataset without depending on the entire framework.
For followup questions we need existing model output to ask a followup question about.
The model output for this dataset can be downloaded as described in the readme.
Note that correctly answered questions are not asked again, so the dataset becomes smaller.
"""
from collections import Counter
from copy import deepcopy
from pprint import pprint

from packg.debugging import connect_to_pycharm_debug_server
from packg.iotools import load_yaml
from packg.paths import get_data_dir
from torch.utils.data import DataLoader

from ovqa.datasets.activitynet_vqa_dataset import ActivityNetVQADataset
from ovqa.datasets.imagenet_hierarchy import load_hierarchy
from ovqa.followup import Followup
from ovqa.processors import BlipImageEvalProcessor
from ovqa.result_loader import read_single_result


def text_processor_noop(x):
return x


def main():
    # optional: attach to a remote PyCharm debug server (only works if such a server is running)
    # connect_to_pycharm_debug_server("edna", 33553)
# ----- load dataset as before
data_dir = get_data_dir()
activitynet_dir = data_dir / "activitynet"
vis_root = activitynet_dir / "frames_uncropped"
ann_paths = [
activitynet_dir / "activity_net.v1-3.min.json",
]
vis_processor = None # None will give a pillow image back

# select which question the model will be asked
question_type = "what-is-this" # "what-is-happening-image", "what-act-is-this"

# see ovqa/configs/datasets/activitynet.yaml
config = {
"question_type": question_type,
"class_name_key": "activity",
}
dataset = ActivityNetVQADataset(
vis_processor=vis_processor,
text_processor=text_processor_noop,
vis_root=vis_root,
ann_paths=ann_paths,
config=config,
)
print(f"Original dataset length: {len(dataset)}")

# ----- load existing model output and apply followup
followup_cfg = load_yaml("ovqa/configs/followup/followup_activitynet.yaml")["run"]["followup_cfg"]
pprint(followup_cfg)
default_followup_object = followup_cfg["default_followup_object"]

classsynonyms = dataset.classsynonyms
synonym_dict = {name: i for i, names in enumerate(classsynonyms) for name in names}
hier = load_hierarchy("activitynet")
targets = {v["key"]: v["class_idx"] for v in dataset.annotation}
follower = Followup(followup_cfg, hier, dataset.classnames, synonym_dict, targets)

# load previous model output
followup_prev_dir = "output/activitynet~val/blip1~ftvqa~default~none~what-is-this"
result_obj = read_single_result(followup_prev_dir)
assert result_obj is not None, f"Failed to read output from: {followup_prev_dir}"
preds = result_obj.load_output()
if next(iter(targets.keys())) not in preds:
        # remap prediction keys from the string dataset index (e.g. "0") to the annotation key
new_preds = {}
for i, v in enumerate(dataset.annotation):
key = v["key"]
pred = preds[str(i)]
new_preds[key] = pred
preds = new_preds

# run followup pipeline
to_followup = follower.evaluate_pipeline(preds)
# to_followup now looks like
    # {'val_00000003': {'status': 'followup', 'object': 'dog'}, ...}
# where status is "correct", "failed" or "followup" and in case of followup "object" is set.
counter_followup = Counter(v["status"] for v in to_followup.values())
print(str(dict(counter_followup)))

# update dataset and config based on the followup questions to ask
new_anns = []
for ann in dataset.annotation:
ann_followup = to_followup[ann["key"]]
        if ann_followup["status"] == "correct":
continue
# define the followup question
if ann_followup["status"] == "followup":
ask_object = ann_followup[default_followup_object]
elif ann_followup["status"] == "failed":
ask_object = default_followup_object
else:
raise ValueError(f"Unknown status: {ann_followup['status']}")
new_ann = deepcopy(ann)
# note this is used in ClassifierVQADataset.get_item
new_ann["question_followup"] = ask_object
new_anns.append(new_ann)
dataset.annotation = new_anns
print(f"Updated dataset, new length: {len(dataset.annotation)}")

# ----- look at the final dataset
# note that to get the final followup question, the text_input from the dataset must be
# formatted with the correct prompt. the correct prompt depends on the model (see model configs)
followup_prompt = "What type of {} is this?"

datapoint = dataset[0]
pprint(datapoint)
followup_question = followup_prompt.format(datapoint["text_input"])
print(f"Actual text_input: {followup_question}")
print()

# in order to use a dataloader, we need to transform the images to tensors, so we can stack them
dataset.vis_processor = BlipImageEvalProcessor(
image_size=224, mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)
)
dataloader = DataLoader(
dataset,
shuffle=False,
num_workers=0,
batch_size=2,
collate_fn=dataset.collater,
)
for i, batch in enumerate(dataloader):
image_tensor = batch.pop("image")
print("image:", image_tensor.shape, image_tensor.dtype, image_tensor.device)
pprint(batch)
followup_questions = [followup_prompt.format(t) for t in batch["text_input"]]
print(f"Followup questions: {followup_questions}")
print()
break


if __name__ == "__main__":
main()
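
The followup pipeline above only needs `preds` as a mapping from sample key to the model's predicted answer string. A sketch of building such a dict from your own model instead of loading saved results; `run_my_model` is a hypothetical callable and not part of ovqa, and the dict values are assumed to be plain answer strings.

def build_preds(dataset, run_my_model):
    # run_my_model is a hypothetical callable: (PIL image, question string) -> answer string
    preds = {}
    for idx, ann in enumerate(dataset.annotation):
        datapoint = dataset[idx]  # assumes dataset index order matches dataset.annotation
        preds[ann["key"]] = run_my_model(datapoint["image"], datapoint["text_input"])
    return preds

# afterwards: to_followup = follower.evaluate_pipeline(preds)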
67 changes: 67 additions & 0 deletions load_coco_objects_vqa_dataset.py
@@ -0,0 +1,67 @@
"""
Example script showing how to load this dataset without depending on the entire framework.
"""

from pprint import pprint

from packg.paths import get_data_dir
from torch.utils.data import DataLoader

from ovqa.datasets.coco_objects_vqa_dataset import COCOObjectsVQADataset
from ovqa.processors import BlipImageEvalProcessor


def text_processor_noop(x):
return x


def main():
data_dir = get_data_dir()
coco_dir = data_dir / "coco"
vis_root = coco_dir / "images" / "val2017"
ann_paths = [coco_dir / "annotations/instances_val2017.json"]
vis_processor = None # None will give a pillow image back

# select which question the model will be asked
question_type = "what-seen-image" # "what-is-in-image", "whats-this"

# see ovqa/configs/datasets/coco.yaml
config = {
"question_type": question_type,
"class_name_key": "object",
"square_box": False,
"min_side": 40.0,
"margin_side": 2.0,
}
dataset = COCOObjectsVQADataset(
vis_processor=vis_processor,
text_processor=text_processor_noop,
vis_root=vis_root,
ann_paths=ann_paths,
config=config,
)
datapoint = dataset[0]
pprint(datapoint)
print()

# in order to use a dataloader, we need to transform the images to tensors, so we can stack them
dataset.vis_processor = BlipImageEvalProcessor(
image_size=224, mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)
)
dataloader = DataLoader(
dataset,
shuffle=False,
num_workers=0,
batch_size=2,
collate_fn=dataset.collater,
)
for i, batch in enumerate(dataloader):
image_tensor = batch.pop("image")
print("image:", image_tensor.shape, image_tensor.dtype, image_tensor.device)
pprint(batch)
print()
break


if __name__ == "__main__":
main()
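
For intuition about the box-related config values (`square_box`, `min_side`, `margin_side`), here is a rough PIL sketch of how such parameters are commonly applied when cropping an annotated object; the actual `COCOObjectsVQADataset` implementation may handle them differently.

from PIL import Image

def crop_object(image: Image.Image, box, min_side=40.0, margin_side=2.0, square_box=False):
    """Rough sketch: box is (x, y, w, h) in pixels, COCO-style."""
    x, y, w, h = box
    if min(w, h) < min_side:
        return None  # skip objects that are too small
    x0, y0 = x - margin_side, y - margin_side
    x1, y1 = x + w + margin_side, y + h + margin_side
    if square_box:
        # expand the shorter side so the crop is square around the box center
        side = max(x1 - x0, y1 - y0)
        cx, cy = (x0 + x1) / 2, (y0 + y1) / 2
        x0, x1 = cx - side / 2, cx + side / 2
        y0, y1 = cy - side / 2, cy + side / 2
    x0, y0 = max(0.0, x0), max(0.0, y0)
    x1, y1 = min(float(image.width), x1), min(float(image.height), y1)
    return image.crop((x0, y0, x1, y1))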
71 changes: 71 additions & 0 deletions load_imagenet_vqa_dataset.py
@@ -0,0 +1,71 @@
"""
Example script showing how to load this dataset without depending on the entire framework.
"""

from pprint import pprint

from packg.paths import get_data_dir
from torch.utils.data import DataLoader

from ovqa.datasets.classifier_vqa_dataset import ClassifierVQADataset
from ovqa.processors import BlipImageEvalProcessor


def text_processor_noop(x):
return x


def main():
data_dir = get_data_dir()
imagenet_dir = data_dir / "imagenet1k"
vis_root = imagenet_dir
ann_paths = [
"ovqa/annotations/imagenet1k/generated/val.json",
"ovqa/annotations/imagenet1k/generated/classes_data.json",
]
vis_processor = None # None will give a pillow image back

# select which question the model will be asked
question_type = "what-seen-image" # "what-is-in-image", "whats-this"

# whether to use cropped images for imagenet or not
cropped_images_dir = "square" # "" or "square"

# see ovqa/configs/datasets/imagenet1k.yaml
config = {
"question_type": question_type,
"class_name_key": "clip_bench_label",
"cropped_images_dir": cropped_images_dir,
}
dataset = ClassifierVQADataset(
vis_processor=vis_processor,
text_processor=text_processor_noop,
vis_root=vis_root,
ann_paths=ann_paths,
config=config,
)
datapoint = dataset[0]
pprint(datapoint)
print()

# in order to use a dataloader, we need to transform the images to tensors, so we can stack them
dataset.vis_processor = BlipImageEvalProcessor(
image_size=224, mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)
)
dataloader = DataLoader(
dataset,
shuffle=False,
num_workers=0,
batch_size=2,
collate_fn=dataset.collater,
)
for i, batch in enumerate(dataloader):
image_tensor = batch.pop("image")
print("image:", image_tensor.shape, image_tensor.dtype, image_tensor.device)
pprint(batch)
print()
break


if __name__ == "__main__":
main()
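
If the ovqa processors are unavailable, a roughly equivalent eval preprocessing can be built with torchvision, assuming the BLIP eval processor boils down to a fixed-size resize, tensor conversion, and normalization with the given mean/std (interpolation and resize details may differ).

from torchvision import transforms
from torchvision.transforms import InterpolationMode

def make_eval_transform(image_size=224, mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)):
    # approximation of BlipImageEvalProcessor: resize to a square, convert to tensor, normalize
    return transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

# usage sketch: dataset.vis_processor = make_eval_transform()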
