Add code for finetuning on the ScienceQA dataset #323

Open
wants to merge 1 commit into base: main
Binary file added dataset/.DS_Store
Binary file not shown.
40,311 changes: 40,311 additions & 0 deletions dataset/ScienceQA/pid_splits.json

Large diffs are not rendered by default.

418,394 changes: 418,394 additions & 0 deletions dataset/ScienceQA/problems.json

Large diffs are not rendered by default.

23,226 changes: 23,226 additions & 0 deletions dataset/ScienceQA/test_QCM-A.json

Large diffs are not rendered by default.

69,852 changes: 69,852 additions & 0 deletions dataset/ScienceQA/train_QCM-A.json

Large diffs are not rendered by default.

23,306 changes: 23,306 additions & 0 deletions dataset/ScienceQA/val_QCM-A.json

Large diffs are not rendered by default.

Binary file added eval_configs/.DS_Store
Binary file not shown.
71 changes: 71 additions & 0 deletions inference.py
@@ -0,0 +1,71 @@
import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr
import json

from minigpt4.common.config import Config
from minigpt4.common.dist_utils import get_rank
from minigpt4.common.registry import registry
from minigpt4.conversation.response import Chat

# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *


def parse_args():
    parser = argparse.ArgumentParser(description="Demo")
    parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
    parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecated), "
        "change to --cfg-options instead.",
    )
    args = parser.parse_args()
    return args


def setup_seeds(config):
    seed = config.run_cfg.seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cudnn.benchmark = False
    cudnn.deterministic = True


args = parse_args()
cfg = Config(args)

model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))

vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))

# print(model_config.output_path)
with open(model_config.output_path, 'r') as json_file:
    for line in json_file:
        item = json.loads(line)
        # print(item["image"])
        # print(item["text"])
        image_emb = chat.upload_img(item["image"])
        # image_emb shape: [1, 32, 4096]
        embedding = chat.get_context_emb(item["text"], image_emb)
        llm_message = chat.answer(embs=embedding, max_new_tokens=300, max_length=2000)[0]
        print(llm_message)
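For context, the loop above assumes that `model_config.output_path` (a config entry this PR relies on) points to a JSON-lines file in which each line carries an `image` path and a `text` prompt. A minimal sketch of producing such a file; the file name and the sample values are illustrative, not part of this PR:

import json

# Hypothetical example: each line is one sample for inference.py to answer.
samples = [
    {"image": "dataset/ScienceQA/test/123/image.png",
     "text": "Question: Which of these states is farthest north? Choices: (A) ... (B) ..."},
]
with open("scienceqa_test.jsonl", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")

With such a file in place and `output_path` set accordingly, the script would presumably be launched as `python inference.py --cfg-path <eval config> --gpu-id 0`, using the flags defined in `parse_args`.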
Binary file added minigpt4/.DS_Store
Binary file not shown.
Binary file added minigpt4/configs/.DS_Store
Binary file not shown.
Binary file added minigpt4/configs/datasets/.DS_Store
Binary file not shown.
Binary file added minigpt4/configs/datasets/ScienceQA/.DS_Store
Binary file not shown.
5 changes: 5 additions & 0 deletions minigpt4/configs/datasets/ScienceQA/align.yaml
@@ -0,0 +1,5 @@
datasets:
  ScienceQA:
    data_type: images
    build_info:
      storage: /path/to/MiniGPT-4/dataset/ScienceQA/
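For orientation, a rough sketch of what the `storage` directory is assumed to contain, given the annotation files added under `dataset/ScienceQA/` in this PR and the paths joined by the builder further down (`train_QCM-A.json` plus a `train/` image folder, which is not shipped here); the exact layout is an assumption:

import os

storage = "/path/to/MiniGPT-4/dataset/ScienceQA/"  # value from align.yaml; adjust locally

# Files added by this PR, plus the image folder the builder expects (assumed).
expected = ["pid_splits.json", "problems.json",
            "train_QCM-A.json", "val_QCM-A.json", "test_QCM-A.json", "train"]
for name in expected:
    path = os.path.join(storage, name)
    print(path, "OK" if os.path.exists(path) else "MISSING")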
107 changes: 107 additions & 0 deletions minigpt4/conversation/response.py
@@ -0,0 +1,107 @@
import argparse
import time
from PIL import Image

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList

import dataclasses
from enum import auto, Enum
from typing import List, Tuple, Any

from minigpt4.common.registry import registry


class StoppingCriteriaSub(StoppingCriteria):

    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = stops

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True

        return False


class Chat:
    def __init__(self, model, vis_processor, device='cuda:0'):
        self.device = device
        self.model = model
        self.vis_processor = vis_processor
        stop_words_ids = [torch.tensor([835]).to(self.device),
                          torch.tensor([2277, 29937]).to(self.device)]  # '###' can be encoded in two different ways.
        self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

    def answer(self, embs, max_new_tokens=300, num_beams=1, min_length=1, top_p=0.9,
               repetition_penalty=1.0, length_penalty=1, temperature=1.0, max_length=2000):

        # embs = self.get_context_emb(img_list)

        current_max_len = embs.shape[1] + max_new_tokens
        if current_max_len - max_length > 0:
            print('Warning: The number of tokens in the current conversation exceeds the max length. '
                  'The model will not see the contexts outside the range.')
        begin_idx = max(0, current_max_len - max_length)
        embs = embs[:, begin_idx:]

        outputs = self.model.llama_model.generate(
            inputs_embeds=embs,
            max_new_tokens=max_new_tokens,
            stopping_criteria=self.stopping_criteria,
            num_beams=num_beams,
            do_sample=True,
            min_length=min_length,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            temperature=temperature,
        )
        output_token = outputs[0]
        if output_token[0] == 0:  # the model might output an unknown token <unk> at the beginning; remove it
            output_token = output_token[1:]
        if output_token[0] == 1:  # some users find a start token <s> at the beginning; remove it
            output_token = output_token[1:]
        output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)
        output_text = output_text.split('###')[0]  # remove the stop sign '###'
        output_text = output_text.split('Assistant:')[-1].strip()
        return output_text, output_token.cpu().numpy()

    def upload_img(self, image):
        if isinstance(image, str):  # image is a file path
            raw_image = Image.open(image).convert('RGB')
            image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
        elif isinstance(image, Image.Image):
            raw_image = image
            image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
        elif isinstance(image, torch.Tensor):
            if len(image.shape) == 3:
                image = image.unsqueeze(0)
            image = image.to(self.device)
        image_emb, _ = self.model.encode_img(image)
        return image_emb

    def get_context_emb(self, text_list, img_list):
        system = "Give the following image: <Img>ImageContent</Img>. You will be able to see the image once I provide it to you. Please answer my questions." + "###"
        prompt = "Human" + ": " + "<Img><ImageHere></Img> " + text_list + "###"
        prompt = system + prompt

        prompt_segs = prompt.split('<ImageHere>')
        assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
        seg_tokens = [
            self.model.llama_tokenizer(
                seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
            # only add bos to the first seg
            for i, seg in enumerate(prompt_segs)
        ]
        seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
        # seg_embs[0]: [1, 42, 4096], seg_embs[1]: [1, 13, 4096]
        mixed_embs = torch.cat([seg_embs[0], img_list, seg_embs[1]], dim=1)
        return mixed_embs
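Taken together, the intended flow appears to be: encode the image, splice its embedding into the tokenized prompt at the `<ImageHere>` slot, then generate. A compressed usage sketch mirroring inference.py; `model`, `vis_processor`, and the paths are placeholders:

# Minimal usage sketch (placeholders, not part of this PR).
chat = Chat(model, vis_processor, device="cuda:0")

image_emb = chat.upload_img("dataset/ScienceQA/train/1/image.png")    # [1, 32, 4096] per the comment in inference.py
embs = chat.get_context_emb("Question: ... Choices: ...", image_emb)  # prompt wrapped around one <ImageHere> slot
reply, _ = chat.answer(embs=embs, max_new_tokens=300, max_length=2000)
print(reply)

Note that `get_context_emb` hard-codes a single `<ImageHere>` placeholder and concatenates exactly `seg_embs[0]`, the image embedding, and `seg_embs[1]`, so this `Chat` variant handles one image per prompt.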
Binary file added minigpt4/datasets/.DS_Store
Binary file not shown.
32 changes: 32 additions & 0 deletions minigpt4/datasets/builders/image_text_pair_builder.py
@@ -103,3 +103,35 @@ def build_datasets(self):
        )

        return datasets


@registry.register_builder("ScienceQA")
class ScienceQABuilder(BaseDatasetBuilder):
    train_dataset_cls = ScienceQADataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/ScienceQA/align.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and images/videos should already be downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()

        build_info = self.config.build_info
        storage_path = build_info.storage

        datasets = dict()

        if not os.path.exists(storage_path):
            warnings.warn("storage path {} does not exist.".format(storage_path))

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_paths=[os.path.join(storage_path, 'train_QCM-A.json')],
            vis_root=os.path.join(storage_path, 'train'),
        )

        return datasets
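Because the class is registered as "ScienceQA", it can presumably be resolved through the registry like the existing builders. A hedged sketch, assuming the LAVIS-style `registry.get_builder_class` accessor and a `dataset_cfg` node loaded from a config that points at `align.yaml`; `dataset_cfg` is a placeholder for whatever the training config provides:

from minigpt4.common.registry import registry

# Hedged usage sketch -- `dataset_cfg` stands in for the ScienceQA node of a loaded
# training config (it must carry build_info.storage, as in align.yaml above).
builder_cls = registry.get_builder_class("ScienceQA")
builder = builder_cls(dataset_cfg)
datasets = builder.build_datasets()   # {"train": ScienceQADataset(...)}
print(len(datasets["train"]))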
64 changes: 47 additions & 17 deletions minigpt4/datasets/datasets/caption_datasets.py
@@ -36,28 +36,58 @@ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
         self.img_ids = {}
         n = 0
         for ann in self.annotation:
-            img_id = ann["image_id"]
-            if img_id not in self.img_ids.keys():
-                self.img_ids[img_id] = n
-                n += 1
+            if "image_id" in ann:
+                img_id = ann["image_id"]
+                if "/" in img_id:
+                    image_id = img_id.split("/")[0]
+                    if image_id not in self.img_ids.keys():
+                        self.img_ids[image_id] = n
+                        n += 1
+                else:
+                    if img_id not in self.img_ids.keys():
+                        self.img_ids[img_id] = n
+                        n += 1
 
     def __getitem__(self, index):
 
         # TODO this assumes image input, not general enough
         ann = self.annotation[index]
 
-        img_file = '{:0>12}.jpg'.format(ann["image_id"])
-        image_path = os.path.join(self.vis_root, img_file)
-        image = Image.open(image_path).convert("RGB")
-
-        image = self.vis_processor(image)
-        caption = self.text_processor(ann["caption"])
-
-        return {
-            "image": image,
-            "text_input": caption,
-            "image_id": self.img_ids[ann["image_id"]],
-        }
+        if "image_id" in ann:
+            if "id" in ann:
+                img_file = ann["image_id"]
+                input_prompt = self.text_processor(ann["input"])
+                image_path = os.path.join(self.vis_root, img_file)
+                # print(image_path)
+                image = Image.open(image_path).convert("RGB")
+                image = self.vis_processor(image)
+                # print(image.shape)
+                caption = self.text_processor(ann["caption"])
+                return {
+                    "image": image,
+                    "input_prompt": input_prompt,
+                    "text_input": caption,
+                    "image_id": self.img_ids[ann["image_id"].split("/")[0]],
+                }
+            else:
+                img_file = '{:0>12}.jpg'.format(ann["image_id"])
+                image_path = os.path.join(self.vis_root, img_file)
+                image = Image.open(image_path).convert("RGB")
+                image = self.vis_processor(image)
+                caption = self.text_processor(ann["caption"])
+                return {
+                    "image": image,
+                    "text_input": caption,
+                    "image_id": self.img_ids[ann["image_id"]],
+                }
+        else:
+            input_prompt = self.text_processor(ann["input"])
+            caption = self.text_processor(ann["caption"])
+            return {
+                "image": torch.zeros(3, 224, 224),  # note: requires `import torch` at the top of this file
+                "input_prompt": input_prompt,
+                "text_input": caption,
+                "image_id": -100,
+            }
 
 
 class CaptionEvalDataset(BaseDataset, __DisplMixin):
42 changes: 42 additions & 0 deletions minigpt4/datasets/datasets/science_qa_dataset.py
@@ -0,0 +1,42 @@
import os
from PIL import Image
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
import torch


class ScienceQADataset(CaptionDataset):
    def __getitem__(self, index):
        # TODO this assumes image input, not general enough
        ann = self.annotation[index]
        if "image_id" in ann:
            # if "id" in ann:
            #     img_file = ann["image_id"]
            #     input_prompt = ann["input"]
            # else:
            #     img_file = '{}.jpg'.format(ann["image_id"])
            img_file = ann["image_id"]
            input_prompt = ann["input"]

            image_path = os.path.join(self.vis_root, img_file)
            image = Image.open(image_path).convert("RGB")

            image = self.vis_processor(image)
            # print(image.shape)
            caption = ann["caption"]

            return {
                "image": image,
                "input_prompt": input_prompt,
                "text_input": caption,
                "image_id": self.img_ids[ann["image_id"].split("/")[0]]
            }
        else:
            input_prompt = ann["input"]
            caption = ann["caption"]
            return {
                "image": torch.zeros(3, 224, 224),
                "input_prompt": input_prompt,
                "text_input": caption,
                "image_id": -100,
            }
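To make the expected annotation shape concrete, here is a hedged sketch of the two kinds of entries `ScienceQADataset.__getitem__` can consume, inferred from the fields it reads (`image_id`, `input`, `caption`); the literal values are illustrative and not taken from `train_QCM-A.json`:

# Illustrative entries only -- field names follow the code above, values are made up.
with_image = {
    "image_id": "1/image.png",   # path relative to <storage>/train; "1" becomes the img_ids key
    "input": "Question: Which of these states is farthest north? Choices: (A) ... (B) ...",
    "caption": "The answer is (A).",
}
text_only = {
    # no "image_id": __getitem__ returns a zero image tensor and image_id = -100
    "input": "Question: ... Context: ... Choices: ...",
    "caption": "The answer is (B).",
}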