From 5b62a52902c8db95c0a10d660fc7d8968cbede4c Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 2 Jul 2024 12:30:48 +0000 Subject: [PATCH 01/60] add WIP hf vlm class --- lm_eval/models/__init__.py | 1 + lm_eval/models/hf_vlms.py | 300 ++++++++++++++++++++++++++++++++++ lm_eval/models/huggingface.py | 36 +++- 3 files changed, 334 insertions(+), 3 deletions(-) create mode 100644 lm_eval/models/hf_vlms.py diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py index 698c912f27..0ddf87982b 100644 --- a/lm_eval/models/__init__.py +++ b/lm_eval/models/__init__.py @@ -2,6 +2,7 @@ anthropic_llms, dummy, gguf, + hf_vlms, huggingface, mamba_lm, nemo_lm, diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py new file mode 100644 index 0000000000..416567d4a8 --- /dev/null +++ b/lm_eval/models/hf_vlms.py @@ -0,0 +1,300 @@ +import copy +from typing import List, Optional, Tuple, Union + +import transformers +from tqdm import tqdm + +from lm_eval.api.instance import Instance +from lm_eval.api.registry import register_model +from lm_eval.models.huggingface import HFLM +from lm_eval.models.utils import Collator, stop_sequences_criteria + + +@register_model("hf-multimodal") +class HFMultimodalLM(HFLM): + """ + An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics. + """ + + AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq # TODO: what's the right way to handle this. maybe phase out the direct class-equality checks in HFLM? + + def _create_tokenizer( + self, + pretrained: Union[str, transformers.PreTrainedModel], + tokenizer: Optional[ + Union[ + str, + transformers.ProcessorMixin, + ] + ], + revision: Optional[str] = "main", + trust_remote_code: Optional[bool] = False, + **kwargs, + ) -> None: + """ + Helper method during initialization. + + For the multimodal variant, we initialize not just + `self.tokenizer` but also `self.processor`. + """ + + if tokenizer: + if isinstance(tokenizer, str): + return transformers.AutoProcessor.from_pretrained( + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + # use_fast=use_fast_tokenizer, + ) + else: + assert isinstance( + tokenizer, transformers.PreTrainedTokenizer + ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + return tokenizer + + # Get tokenizer based on 'pretrained' + if isinstance(pretrained, str): + model_name = pretrained + else: + # get the HF hub name via accessor on model + model_name = self.model.name_or_path + + self.processor = transformers.AutoProcessor.from_pretrained( + model_name, + revision=revision, + trust_remote_code=trust_remote_code, + # use_fast=use_fast_tokenizer, + ) + + self.tokenizer = self.processor.tokenizer + + # def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + # """ + # Method to apply a chat template to a list of chat history between user and model. 
+ # """ + # return self.tokenizer.apply_chat_template( + # chat_history, tokenize=False, add_generation_prompt=True + # ) + + # def tok_encode( + # self, string: str, left_truncate_len=None, add_special_tokens=None + # ) -> List[int]: + # """ """ + # # default for None - empty dict, use predefined tokenizer param + # # used for all models except for CausalLM or predefined value + # special_tokens_kwargs = {} + + # # by default for CausalLM - false or self.add_bos_token is set + # if add_special_tokens is None: + # if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + # special_tokens_kwargs = { + # "add_special_tokens": False or self.add_bos_token + # } + # # otherwise the method explicitly defines the value + # else: + # special_tokens_kwargs = {"add_special_tokens": add_special_tokens} + + # encoding = self.tokenizer.encode(string, **special_tokens_kwargs) + + # # left-truncate the encoded context to be at most `left_truncate_len` tokens long + # if left_truncate_len: + # encoding = encoding[-left_truncate_len:] + + # return encoding + + # def tok_batch_encode( + # self, + # strings: List[str], + # padding_side: str = "left", + # left_truncate_len: int = None, + # truncation: bool = False, + # ) -> Tuple[torch.Tensor, torch.Tensor]: + # # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. + # old_padding_side = self.tokenizer.padding_side + # self.tokenizer.padding_side = padding_side + + # add_special_tokens = {} + # if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + # add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + + # encoding = self.tokenizer( + # strings, + # truncation=truncation, + # padding="longest", + # return_tensors="pt", + # **add_special_tokens, + # ) + # if left_truncate_len: + # encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] + # encoding["attention_mask"] = encoding["attention_mask"][ + # :, -left_truncate_len: + # ] + # self.tokenizer.padding_side = old_padding_side + + # return encoding["input_ids"], encoding["attention_mask"] + + # def tok_decode(self, tokens, skip_special_tokens=True): + # return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) + + def _model_generate(self, inputs, stop, **gen_kwargs): + # TODO: handle max_length + # gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + stopping_criteria = stop_sequences_criteria( + self.tokenizer, + stop, + inputs["input_ids"].shape[1], + inputs["input_ids"].shape[0], + ) + return self.model.generate( + **inputs, + # max_length=max_length, + stopping_criteria=stopping_criteria, + do_sample=True if gen_kwargs["temperature"] > 0 else False, + temperature=gen_kwargs["temperature"], + top_p=gen_kwargs["top_p"], + num_beams=gen_kwargs["num_beams"], + max_new_tokens=gen_kwargs["max_new_tokens"], + use_cache=True, + pad_token_id=self.tokenizer.eos_token_id, + ) + + def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: + raise NotImplementedError( + "model type `hf-multimodal` does not support loglikelihood_rolling. 
Use 'hf' model type for text-only loglikelihood_rolling tasks" + ) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + raise NotImplementedError( + "model type `hf-multimodal` does not support loglikelihood or multiple choice. Use 'hf' model type for text-only loglikelihood tasks" + ) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running generate_until requests with text+image input", + ) + # TODO: port auto-batch sizing into this. + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = Collator( + [reg.args for reg in requests], + _collate, + group_by="gen_kwargs", + group_fn=lambda x: x[1], + ) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + + ### Up to here: was identical to non-multimodal HFLM generate_until ### + + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc = zip( + *chunk + ) # TODO: what should be passed in here as part of a chunk? + + visuals = [ + vis(d) for vis, d in zip(doc_to_visual, doc) + ] # TODO: I think *fully* flattening is just wrong for bs>1 ? + + ### this part onward: same as HFLM ### + + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. + until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" + ) + # add EOS token to stop sequences + eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False) + if not until: + until = [eos] + else: + until.append(eos) + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + ### end stuff that's entirely copied verbatim from HFLM ### + + max_ctx_len = self.max_length - max_gen_toks # noqa: F841 # TODO: this assumes we are using a causal LM. is that always valid? 
shouldn't be + + self.tokenizer.padding_side = "left" + inputs = self.processor( # TODO: write this as tok_batch_encode (and allow that to either take a visuals value or None) + images=visuals, text=contexts, return_tensors="pt", padding=True + ).to( + self.device, self.model.dtype + ) # TODO: factor out into a tok_batch_encode bit ; truncate from left using max_ctx_len + + print(inputs) + + context_enc = inputs["input_ids"] + + if "max_length" not in kwargs: + kwargs["max_length"] = context_enc.shape[1] + max_gen_toks + + cont = self._model_generate(inputs, stop=until, **gen_kwargs) + + ### essentially same as HFLM beyond this line! + + cont_toks_list = cont.tolist() + for cont_toks, context in zip(cont_toks_list, contexts): + # discard context + left-padding toks if using causal decoder-only LM + # if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: # TODO: ensure this holds for VLMs + cont_toks = cont_toks[context_enc.shape[1] :] + + s = self.tok_decode(cont_toks) + + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore '' separator, + # for seq2seq case where self.tok_decode(self.eot_token_id) = '' + s = s.split(term)[0] + + res.append(s) + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 331684f5a6..52e972214e 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -30,7 +30,6 @@ from lm_eval.models.utils import ( Collator, clear_torch_cache, - configure_pad_token, get_dtype, pad_and_concat, stop_sequences_criteria, @@ -254,10 +253,32 @@ def __init__( self.logits_cache = logits_cache self.vocab_size = self.tokenizer.vocab_size # select (or create) a pad token to use - self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) + if self.tokenizer.pad_token: + pass + elif self.tokenizer.unk_token: + self.tokenizer.pad_token_id = self.tokenizer.unk_token_id + elif self.tokenizer.eos_token: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + else: + if getattr(self.config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + self.tokenizer.pad_token = "<|endoftext|>" + elif ( + self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert self.tokenizer.pad_token_id == 0 + else: + self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + # TODO: override this for Gemma self.add_bos_token = add_bos_token - if getattr(self.config, "model_type", None) in ["gemma", "gemma2"]: + if getattr(self.config, "model_type", None) == "gemma": self.add_bos_token = True eval_logger.info( f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." @@ -418,7 +439,16 @@ def _get_backend( Helper method during initialization. 
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used. + sets `self.AUTO_MODEL_CLASS` appropriately if not already set. """ + # escape hatch: if we're using a subclass that shouldn't follow + # the default _get_backend logic, + # then skip over the method. + # TODO: this seems very much undesirable in some cases--our code in HFLM + # references AutoModelForCausalLM at times to check for equality + if self.AUTO_MODEL_CLASS is not None: + return + assert backend in ["default", "causal", "seq2seq"] if backend != "default": From 34a079e3f402373994239592e057fe99441a7b6e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 2 Jul 2024 20:18:05 +0700 Subject: [PATCH 02/60] add doc_to_image --- lm_eval/api/task.py | 109 ++++++++++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index ccfda50906..711576bd8a 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -75,6 +75,7 @@ class TaskConfig(dict): process_docs: Optional[Callable] = None doc_to_text: Optional[Union[Callable, str]] = None doc_to_target: Optional[Union[Callable, str]] = None + doc_to_image: Union[Callable, str] = None doc_to_choice: Optional[Union[Callable, str, dict, list]] = None process_results: Optional[Union[Callable, str]] = None use_prompt: Optional[str] = None @@ -365,19 +366,22 @@ def doc_to_text(self, doc): def doc_to_target(self, doc): pass + @abc.abstractmethod + def doc_to_image(self, doc): + pass + def build_all_requests( self, *, - limit: Union[int, None] = None, - rank: int = 0, - world_size: int = 1, - cache_requests: bool = False, - rewrite_requests_cache: bool = False, - system_instruction: Optional[str] = None, - apply_chat_template: bool = False, - fewshot_as_multiturn: bool = False, - chat_template: Optional[Callable] = None, - tokenizer_name: str = "", + limit=None, + rank=None, + world_size=None, + cache_requests=False, + rewrite_requests_cache=False, + system_instruction=None, + apply_chat_template=False, + fewshot_as_multiturn=False, + lm=None, ) -> None: """Build a set of Instances for a task, and store them in task.instances""" @@ -392,7 +396,7 @@ def build_all_requests( if system_instruction is not None else "" ) - cache_key += f"-tokenizer{tokenizer_name}" + cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else "" cached_instances = load_from_cache(file_name=cache_key) @@ -437,7 +441,7 @@ def build_all_requests( system_instruction, apply_chat_template, fewshot_as_multiturn, - chat_template, + lm, ) # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute @@ -1015,7 +1019,7 @@ def fewshot_context( system_instruction: Optional[str] = None, apply_chat_template: bool = False, fewshot_as_multiturn: bool = False, - chat_template: Optional[Callable] = None, + lm=None, ) -> str: """Returns a fewshot context string that is made up of a prepended description (if provided), the `num_fewshot` number of examples, and an appended prompt example. @@ -1030,8 +1034,8 @@ def fewshot_context( Whether to apply the chat template to the fewshot context. :param fewshot_as_multiturn: bool Whether to provide the fewshot examples as a multiturn conversation or a single user turn. - :param chat_template: Callable - Chat template to be applied to the fewshot context. + :param lm: + Language model with definition of the tokenizer/function to use for applying the chat template. :returns: str The fewshot context. 
""" @@ -1078,7 +1082,7 @@ def fewshot_context( example = self.doc_to_text(doc) if apply_chat_template: if self.multiple_input: - return chat_template(labeled_examples) + return lm.apply_chat_template(labeled_examples) if isinstance(example, str): self.append_target_question( labeled_examples, example, fewshot_as_multiturn @@ -1090,7 +1094,7 @@ def fewshot_context( for ex in example: chat = deepcopy(labeled_examples) self.append_target_question(chat, ex, fewshot_as_multiturn) - labeled_examples_list.append(chat_template(chat)) + labeled_examples_list.append(lm.apply_chat_template(chat)) return labeled_examples_list # if example is an integer, append the choice or convert to string elif isinstance(example, int): @@ -1104,7 +1108,7 @@ def fewshot_context( labeled_examples, str(example), fewshot_as_multiturn ) # return lm.apply_chat_template(labeled_examples) - return chat_template(labeled_examples) + return lm.apply_chat_template(labeled_examples) else: if self.multiple_input: return labeled_examples @@ -1261,9 +1265,27 @@ def doc_to_choice(self, doc: Any) -> List[str]: else: raise TypeError + def doc_to_image(self, doc: Any) -> Union[int, str, list]: + if self.config.doc_to_image is None: + eval_logger.error("doc_to_image was called but not set in config") + else: + doc_to_image = self.config.doc_to_image + + if isinstance(self.config.doc_to_image, str): + if doc_to_image in self.features: + return doc[doc_to_image] + else: + return ast.literal_eval(utils.apply_template(doc_to_image, doc)) + elif callable(doc_to_image): + return doc_to_image(doc) + else: + return None + def construct_requests( self, doc: dict, ctx: str, **kwargs ) -> Union[List[Instance], Instance]: + aux_arguments = None + if self.OUTPUT_TYPE == "loglikelihood": arguments = (ctx, self.doc_to_target(doc)) elif self.OUTPUT_TYPE == "loglikelihood_rolling": @@ -1281,16 +1303,6 @@ def construct_requests( # Otherwise they are placed in the continuation arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] - request_list = [ - Instance( - request_type="loglikelihood", - doc=doc, - arguments=arg, - idx=i, - **kwargs, - ) - for i, arg in enumerate(arguments) - ] # TODO: we should raise a warning telling users this will at most ~2x runtime. if "acc_mutual_info" in self._metric_fn_list.keys(): # if we are calculating multiple choice accuracy @@ -1299,25 +1311,52 @@ def construct_requests( # here mutual info refers to calculating # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) # in other words normalizing by subtracting the unconditional logprob of each choice. 
+ aux_arguments = [("", f"{choice}") for choice in choices] + + elif self.OUTPUT_TYPE == "generate_until": + arguments = (ctx, deepcopy(self.config.generation_kwargs)) + + multimodal_arg = {} + if self.doc_to_image: + multimodal_arg = { + **multimodal_arg, + **{"visual": self.doc_to_image(doc)}, + } + + if bool(multimodal_arg): + if isinstance(arguments, list): + arguments = [arg + (multimodal_arg,) for arg in arguments] + else: + arguments = arguments + (multimodal_arg,) + + if isinstance(arguments, type): + if aux_arguments is not None: + all_arg_list = [arguments, aux_arguments] + else: + all_arg_list = [arguments] + request_list = [] + for arg_list in all_arg_list: request_list.extend( [ Instance( request_type="loglikelihood", doc=doc, - arguments=("", "{}".format(choice)), + arguments=arg, idx=i, **kwargs, ) - for i, choice in enumerate(choices) + for i, arg in enumerate(arg_list) ] ) - return request_list - elif self.OUTPUT_TYPE == "generate_until": - arguments = (ctx, deepcopy(self.config.generation_kwargs)) + return request_list return Instance( - request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs + request_type=self.OUTPUT_TYPE, + doc=doc, + arguments=arguments, + idx=0, + **kwargs, ) def process_results(self, doc, results): @@ -1526,7 +1565,7 @@ def __repr__(self): f"group_name={getattr(self.config, 'group', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," - f"num_samples={len(self.eval_docs)})" + f"num_samples={len(self.eval_docs)})", ) From 8bce8cf684e6b0d27fa54d64b9d068d57b4799e3 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Tue, 2 Jul 2024 20:18:13 +0700 Subject: [PATCH 03/60] add mmmu tasks --- lm_eval/tasks/mmmu/_template_yaml | 20 + lm_eval/tasks/mmmu/art_and_design.yaml | 19 + lm_eval/tasks/mmmu/business.yaml | 23 + lm_eval/tasks/mmmu/health_and_medicine.yaml | 23 + .../mmmu/humanities_and_social_sciences.yaml | 19 + lm_eval/tasks/mmmu/mmmu.yaml | 8 + lm_eval/tasks/mmmu/science.yaml | 23 + lm_eval/tasks/mmmu/tech_and_engineering.yaml | 31 ++ lm_eval/tasks/mmmu/utils.py | 394 ++++++++++++++++++ 9 files changed, 560 insertions(+) create mode 100644 lm_eval/tasks/mmmu/_template_yaml create mode 100644 lm_eval/tasks/mmmu/art_and_design.yaml create mode 100644 lm_eval/tasks/mmmu/business.yaml create mode 100644 lm_eval/tasks/mmmu/health_and_medicine.yaml create mode 100644 lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu.yaml create mode 100644 lm_eval/tasks/mmmu/science.yaml create mode 100644 lm_eval/tasks/mmmu/tech_and_engineering.yaml create mode 100644 lm_eval/tasks/mmmu/utils.py diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml new file mode 100644 index 0000000000..0230299af0 --- /dev/null +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -0,0 +1,20 @@ +dataset_path: MMMU/MMMU +validation_split: validation +output_type: generate_until +doc_to_image: !function utils.doc_to_image +doc_to_text: !function utils.doc_to_text +doc_to_target: "answer" +process_results: !function utils.process_results +generation_kwargs: + until: + - "<|endoftext|>" + temperature: 1.0 + do_sample: false + max_gen_toks: 512 + top_p: 0.94 + repetition_penalty: 1.0 + image_aspect_ratio: original +metric_list: + - metric: mmmu_acc + aggregation: mean + higher_is_better: true diff --git a/lm_eval/tasks/mmmu/art_and_design.yaml b/lm_eval/tasks/mmmu/art_and_design.yaml new file mode 100644 index 0000000000..700f1df904 --- /dev/null +++ 
b/lm_eval/tasks/mmmu/art_and_design.yaml @@ -0,0 +1,19 @@ +group: mmmu_art_and_design +group_alias: Art and Design +task: + - task: mmmu_art + include: _template_yaml + task_alias: Art + dataset_name: Art + - task: mmmu_art_theory + include: _template_yaml + task_alias: Art Theory + dataset_name: Art_Theory + - task: mmmu_design + include: _template_yaml + task_alias: Design + dataset_name: Design + - task: mmmu_music + include: _template_yaml + task_alias: Music + dataset_name: Music diff --git a/lm_eval/tasks/mmmu/business.yaml b/lm_eval/tasks/mmmu/business.yaml new file mode 100644 index 0000000000..bc5f26e00c --- /dev/null +++ b/lm_eval/tasks/mmmu/business.yaml @@ -0,0 +1,23 @@ +group: mmmu_business +group_alias: Business +task: + # - task: mmmu_accounting + # include: _template_yaml + # task_alias: Accounting + # dataset_name: Accounting + - task: mmmu_economics + include: _template_yaml + task_alias: Economics + dataset_name: Economics + # - task: mmmu_finance + # include: _template_yaml + # task_alias: Finance + # dataset_name: Finance + # - task: mmmu_manage + # include: _template_yaml + # task_alias: Manage + # dataset_name: Manage + # - task: mmmu_marketing + # include: _template_yaml + # task_alias: Marketing + # dataset_name: Marketing diff --git a/lm_eval/tasks/mmmu/health_and_medicine.yaml b/lm_eval/tasks/mmmu/health_and_medicine.yaml new file mode 100644 index 0000000000..9616c99de2 --- /dev/null +++ b/lm_eval/tasks/mmmu/health_and_medicine.yaml @@ -0,0 +1,23 @@ +group: mmmu_health_and_medicine +group_alias: Health and Medicine +task: + - task: mmmu_basic_medical_science + include: _template_yaml + task_alias: Basic Medical Science + dataset_name: Basic_Medical_Science + - task: mmmu_clinical_medicine + include: _template_yaml + task_alias: Clinical Medicine + dataset_name: Clinical_Medicine + - task: mmmu_diagnostics_and_laboratory_medicine + include: _template_yaml + task_alias: Diagnostics and Laboratory Medicine + dataset_name: Diagnostics_and_Laboratory_Medicine + - task: mmmu_pharmacy + include: _template_yaml + task_alias: Pharmacy + dataset_name: Pharmacy + - task: mmmu_public_health + include: _template_yaml + task_alias: Public Health + dataset_name: Public_Health diff --git a/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml b/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml new file mode 100644 index 0000000000..583997c686 --- /dev/null +++ b/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml @@ -0,0 +1,19 @@ +group: mmmu_humanities_and_social_science +group_alias: Humanities and Social Science +task: + - task: mmmu_history + include: _template_yaml + task_alias: History + dataset_name: History + - task: mmmu_literature + include: _template_yaml + task_alias: Literature + dataset_name: Literature + - task: mmmu_sociology + include: _template_yaml + task_alias: Sociology + dataset_name: Sociology + - task: mmmu_psychology + include: _template_yaml + task_alias: Psychology + dataset_name: Psychology diff --git a/lm_eval/tasks/mmmu/mmmu.yaml b/lm_eval/tasks/mmmu/mmmu.yaml new file mode 100644 index 0000000000..8b490634eb --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu.yaml @@ -0,0 +1,8 @@ +group: mmmu +task: + - mmmu_art_and_design + - mmmu_business + - mmmu_health_and_medicine + - mmmu_humanities_and_social_science + - mmmu_science + - mmmu_tech_and_engineering diff --git a/lm_eval/tasks/mmmu/science.yaml b/lm_eval/tasks/mmmu/science.yaml new file mode 100644 index 0000000000..09bfcf76c0 --- /dev/null +++ b/lm_eval/tasks/mmmu/science.yaml @@ -0,0 +1,23 
@@ +group: mmmu_science +group_alias: Science +task: + - task: mmmu_biology + include: _template_yaml + task_alias: Biology + dataset_name: Biology + - task: mmmu_chemistry + include: _template_yaml + task_alias: Chemistry + dataset_name: Chemistry + - task: mmmu_geography + include: _template_yaml + task_alias: Geography + dataset_name: Geography + - task: mmmu_math + include: _template_yaml + task_alias: Math + dataset_name: Math + - task: mmmu_physics + include: _template_yaml + task_alias: Physics + dataset_name: Physics diff --git a/lm_eval/tasks/mmmu/tech_and_engineering.yaml b/lm_eval/tasks/mmmu/tech_and_engineering.yaml new file mode 100644 index 0000000000..d5bf652d00 --- /dev/null +++ b/lm_eval/tasks/mmmu/tech_and_engineering.yaml @@ -0,0 +1,31 @@ +group: mmmu_tech_and_engineering +group_alias: Tech and Engineering +task: + - task: mmmu_agriculture + include: _template_yaml + task_alias: Agriculture + dataset_name: Agriculture + - task: mmmu_architecture_and_engineering + include: _template_yaml + task_alias: Architecture and Engineering + dataset_name: Architecture_and_Engineering + - task: mmmu_computer_science + include: _template_yaml + task_alias: Computer Science + dataset_name: Computer_Science + - task: mmmu_electronics + include: _template_yaml + task_alias: Electronics + dataset_name: Electronics + - task: mmmu_energy_and_power + include: _template_yaml + task_alias: Energy and Power + dataset_name: Energy_and_Power + - task: mmmu_materials + include: _template_yaml + task_alias: Materials + dataset_name: Materials + - task: mmmu_mechanical_engineering + include: _template_yaml + task_alias: Mechanical Engineering + dataset_name: Mechanical_Engineering diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py new file mode 100644 index 0000000000..5b55086b11 --- /dev/null +++ b/lm_eval/tasks/mmmu/utils.py @@ -0,0 +1,394 @@ +import ast +import random +import re + +import numpy as np + + +MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly." +OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase." + + +def replace_images_tokens(input_string): + for i in range(1, 8): + question_text = f"" + query_text = "" + if question_text in input_string: + input_string = input_string.replace(question_text, query_text) + return input_string + + +def parse_options(options): + option_letters = [chr(ord("A") + i) for i in range(len(options))] + choices_str = "\n".join( + [ + f"{option_letter}. 
{option}" + for option_letter, option in zip(option_letters, options) + ] + ) + return choices_str + + +def construct_prompt(doc): + question = doc["question"] + if doc["question_type"] == "multiple-choice": + # Weirdly, data["options"] is a string in MMMU Huggingface dataset + parsed_options = parse_options(ast.literal_eval(doc["options"])) + # parsed_options already prepends a newline so no need to add space here + question = f"{question}\n{parsed_options}\n{MULTI_CHOICE_PROMPT}" + else: + question = f"{question}\n{OPEN_ENDED_PROMPT}" + return question + + +def doc_to_text(doc): + question = construct_prompt(doc) + return replace_images_tokens(question) + + +def doc_to_image(doc): + prompt = construct_prompt(doc) + image_tokens = re.findall(r"", prompt) + # Remove <> and swap space as _ + image_tokens = [ + image_token.strip("<>").replace(" ", "_") for image_token in image_tokens + ] + visual = [doc[image_token].convert("RGB") for image_token in image_tokens] + return visual + + +def process_results(doc, results): + pred = results[0] + if doc["question_type"] == "multiple-choice": + index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["options"])) + parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans) + else: + parsed_pred = parse_open_response(pred) + + sample_dict = { + "id": doc["id"], + "subdomain": extract_subset_name(doc["id"]), + "question_type": doc["question_type"], + "answer": doc["answer"], + "parsed_pred": parsed_pred, + } + _, result_dict = evaluate_mmmu([sample_dict]) + return result_dict + + +def extract_subset_name(input_string): + # Define a regex pattern to match "validation_" at the beginning and "_" at the end + split = input_string.split("_")[0] + pattern = re.compile(rf"^{split}_(.+?)_\d+$") + match = pattern.search(input_string) + if match: + return match.group(1) + else: + raise ValueError(f'No match found in "{input_string}"') + + +################## +# Helper functions written by official MMMU repo. +################## + + +def calculate_ins_level_acc(results): + """Calculate the instruction level accuracy for given Subject results + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L246 + """ + acc = 0 + ins_num = 0 + for cat_results in results.values(): + acc += cat_results["mmmu_acc"] * cat_results["num_example"] + ins_num += cat_results["num_example"] + if ins_num == 0: + return 0 + return acc / ins_num + + +def eval_multi_choice(gold_i, pred_i): + """ + Evaluate a multiple choice instance. 
+ https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L175 + """ + correct = False + # only they are exactly the same, we consider it as correct + if isinstance(gold_i, list): + for answer in gold_i: + if answer == pred_i: + correct = True + break + else: # gold_i is a string + if gold_i == pred_i: + correct = True + return correct + + +def eval_open(gold_i, pred_i): + """ + Evaluate an open question instance + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191 + """ + correct = False + if isinstance(gold_i, list): + # use float to avoid trivial matches + norm_answers = [] + for answer in gold_i: + norm_answers.extend(normalize_str(answer)) + else: + norm_answers = normalize_str(gold_i) + for pred in pred_i: # pred is already normalized in parse response phase + if isinstance(pred, str): # if it's a string, then find if ans in the pred_i + for norm_ans in norm_answers: + # only see if the string answer in the string pred + if isinstance(norm_ans, str) and norm_ans in pred: + if not correct: + correct = True + break + else: # it's a float number + if pred in norm_answers: + if not correct: + correct = True + break + return correct + + +def evaluate_mmmu(samples): + """ + Batch evaluation for multiple choice and open questions. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L219 + """ + pred_correct = 0 + judge_dict = dict() + for sample in samples: + gold_i = sample["answer"] + pred_i = sample["parsed_pred"] + if sample["question_type"] == "multiple-choice": + correct = eval_multi_choice(gold_i, pred_i) + else: # open question + correct = eval_open(gold_i, pred_i) + + if correct: + judge_dict[sample["id"]] = "Correct" + pred_correct += 1 + else: + judge_dict[sample["id"]] = "Wrong" + + if len(samples) == 0: + return {"mmmu_acc": 0} + return judge_dict, {"mmmu_acc": pred_correct / len(samples)} + + +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10 + """ + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " # add space to avoid partial match + + index_ans = True + ans_with_brack = False + candidates = [] + for choice in all_choices: # e.g., (A) (B) (C) (D) + if f"({choice})" in response: + candidates.append(choice) + ans_with_brack = True + + if len(candidates) == 0: + for choice in all_choices: # e.g., A B C D + if f"{choice} " in response: + candidates.append(choice) + + if len(candidates) == 0: + for choice in all_choices: # e.g., A. B. C. D. + if f"{choice}." in response: + candidates.append(choice) + + # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example + if len(candidates) == 0 and len(response.split()) > 5: + for index, ans in index2ans.items(): + if ans.lower() in response.lower(): + candidates.append(index) + index_ans = False # it's content ans. + + if len(candidates) == 0: # still not get answer, randomly choose one. 
+ pred_index = random.choice(all_choices) + elif len(candidates) > 1: + start_indexes = [] + if index_ans: + if ans_with_brack: + for can in candidates: + index = response.rfind(f"({can})") + start_indexes.append(index) # -1 will be ignored anyway + # start_indexes = [generated_response.index(f'({can})') for can in candidates] + else: + for can in candidates: + index = response.rfind(f" {can} ") + start_indexes.append(index) + else: + for can in candidates: + index = response.lower().rfind(index2ans[can].lower()) + start_indexes.append(index) + # get the last one + pred_index = candidates[np.argmax(start_indexes)] + else: # if only one candidate, use it. + pred_index = candidates[0] + + return pred_index + + +def extract_numbers(string): + """ + Exact all forms of numbers from a string with regex. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L100 + """ + # Pattern for numbers with commas + pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" + # Pattern for scientific notation + pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" + # Pattern for simple numbers without commas + pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" + + # Extract numbers with commas + numbers_with_commas = re.findall(pattern_commas, string) + # Extract numbers in scientific notation + numbers_scientific = re.findall(pattern_scientific, string) + # Extract simple numbers without commas + numbers_simple = re.findall(pattern_simple, string) + + # Combine all extracted numbersz + all_numbers = numbers_with_commas + numbers_scientific + numbers_simple + return all_numbers + + +def check_is_number(string): + """ + Check if the given string a number. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L65 + """ + try: + float(string.replace(",", "")) + return True + except ValueError: + # check if there's comma inside + return False + + +def normalize_str(string): + """ + Normalize the str to lower case and make them float numbers if possible. + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L76 + """ + # check if characters in the string + + # if number, numerize it. + string = string.strip() + + is_number = check_is_number(string) + + if is_number: + string = string.replace(",", "") + string = float(string) + # leave 2 decimal + string = round(string, 2) + return [string] + else: # it's likely to be a string + # lower it + string = string.lower() + if len(string) == 1: + return [" " + string, string + " "] # avoid trivial matches + return [string] + + +def parse_open_response(response): + """ + Parse the prediction from the generated response. + Return a list of predicted strings or numbers. 
+ https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L122 + """ + + # content = content.strip("\n").strip(".").strip(" ") + def get_key_subresponses(response): + key_responses = [] + response = response.strip().strip(".").lower() + sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response) + indicators_of_keys = [ + "could be ", + "so ", + "is ", + "thus ", + "therefore ", + "final ", + "answer ", + "result ", + ] + key_responses = [] + for index, resp in enumerate(sub_responses): + # if last one, accept it's an equation (the entire response can be just one sentence with equation) + if index == len(sub_responses) - 1: + indicators_of_keys.extend(["="]) + shortest_key_response = None # the shortest response that may contain the answer (tail part of the response) + for indicator in indicators_of_keys: + if indicator in resp: + if not shortest_key_response: + shortest_key_response = resp.split(indicator)[-1].strip() + else: + if len(resp.split(indicator)[-1].strip()) < len( + shortest_key_response + ): + shortest_key_response = resp.split(indicator)[-1].strip() + # key_responses.append(resp.split(indicator)[1].strip()) + + if shortest_key_response: + # and it's not trivial + if shortest_key_response.strip() not in [ + ":", + ",", + ".", + "!", + "?", + ";", + ":", + "'", + ]: + key_responses.append(shortest_key_response) + if len(key_responses) == 0: # did not found any + return [response] + return key_responses + + # pdb.set_trace() + key_responses = get_key_subresponses(response) + + pred_list = key_responses.copy() # keep the original string response + for resp in key_responses: + pred_list.extend(extract_numbers(resp)) + + tmp_pred_list = [] + for i in range(len(pred_list)): + tmp_pred_list.extend(normalize_str(pred_list[i])) + pred_list = tmp_pred_list + + # remove duplicates + pred_list = list(set(pred_list)) + + return pred_list + + +def get_multi_choice_info(options): + """ + Given the list of options for multiple choice question + Return the index2ans and all_choices + https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/data_utils.py#L54 + """ + + start_chr = "A" + all_choices = [] + index2ans = {} + for i, option in enumerate(options): + index2ans[chr(ord(start_chr) + i)] = option + all_choices.append(chr(ord(start_chr) + i)) + + return index2ans, all_choices From 1c94a54d9c532bdf8255629493f9953b32977cdc Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 9 Jul 2024 17:10:55 +0000 Subject: [PATCH 04/60] fix merge conflicts --- lm_eval/api/task.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index f054542e0f..1f18060efa 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -378,22 +378,23 @@ def doc_to_text(self, doc): def doc_to_target(self, doc): pass - @abc.abstractmethod + # not an abstractmethod because not every language-only task has to implement this def doc_to_image(self, doc): - pass + raise NotImplementedError def build_all_requests( self, *, - limit=None, - rank=None, - world_size=None, - cache_requests=False, - rewrite_requests_cache=False, - system_instruction=None, - apply_chat_template=False, - fewshot_as_multiturn=False, - lm=None, + limit: Union[int, None] = None, + rank: int = 0, + world_size: int = 1, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + 
fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + tokenizer_name: str = "", ) -> None: """Build a set of Instances for a task, and store them in task.instances""" @@ -408,7 +409,7 @@ def build_all_requests( if system_instruction is not None else "" ) - cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else "" + cache_key += f"-tokenizer{tokenizer_name}" cached_instances = load_from_cache(file_name=cache_key) @@ -453,7 +454,7 @@ def build_all_requests( system_instruction, apply_chat_template, fewshot_as_multiturn, - lm, + chat_template, ) # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute @@ -1031,7 +1032,7 @@ def fewshot_context( system_instruction: Optional[str] = None, apply_chat_template: bool = False, fewshot_as_multiturn: bool = False, - lm=None, + chat_template: Optional[Callable] = None, ) -> str: """Returns a fewshot context string that is made up of a prepended description (if provided), the `num_fewshot` number of examples, and an appended prompt example. @@ -1046,8 +1047,8 @@ def fewshot_context( Whether to apply the chat template to the fewshot context. :param fewshot_as_multiturn: bool Whether to provide the fewshot examples as a multiturn conversation or a single user turn. - :param lm: - Language model with definition of the tokenizer/function to use for applying the chat template. + :param chat_template: + callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string. :returns: str The fewshot context. """ @@ -1094,7 +1095,7 @@ def fewshot_context( example = self.doc_to_text(doc) if apply_chat_template: if self.multiple_input: - return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) if isinstance(example, str): self.append_target_question( labeled_examples, example, fewshot_as_multiturn @@ -1106,7 +1107,7 @@ def fewshot_context( for ex in example: chat = deepcopy(labeled_examples) self.append_target_question(chat, ex, fewshot_as_multiturn) - labeled_examples_list.append(lm.apply_chat_template(chat)) + labeled_examples_list.append(chat_template(chat)) return labeled_examples_list # if example is an integer, append the choice or convert to string elif isinstance(example, int): @@ -1120,7 +1121,7 @@ def fewshot_context( labeled_examples, str(example), fewshot_as_multiturn ) # return lm.apply_chat_template(labeled_examples) - return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) else: if self.multiple_input: return labeled_examples From 9692aa0543796af0753bfd8bd5df00993ff677d9 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 9 Jul 2024 18:14:39 +0000 Subject: [PATCH 05/60] add lintang's changes to hf_vlms.py --- lm_eval/models/hf_vlms.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 416567d4a8..43a726480e 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -213,13 +213,13 @@ def _collate(x): ### Up to here: was identical to non-multimodal HFLM generate_until ### for chunk in chunks: - contexts, all_gen_kwargs, doc_to_visual, doc = zip( + contexts, all_gen_kwargs, aux_arguments = zip( *chunk - ) # TODO: what should be passed in here as part of a chunk? + ) # TODO: can we cut down further on number of distinct things we pass around? 
visuals = [ - vis(d) for vis, d in zip(doc_to_visual, doc) - ] # TODO: I think *fully* flattening is just wrong for bs>1 ? + arg["visual"] for arg in aux_arguments + ] # TODO: I think *fully* flattening is just wrong for bs>1 ?? ### this part onward: same as HFLM ### From 90ba03ac203e8dd2372cc85522de3f6611dcb96a Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 10 Jul 2024 03:29:04 +0000 Subject: [PATCH 06/60] fix doc_to_image --- lm_eval/api/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 1f18060efa..3423ccbd04 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1280,7 +1280,7 @@ def doc_to_choice(self, doc: Any) -> List[str]: def doc_to_image(self, doc: Any) -> Union[int, str, list]: if self.config.doc_to_image is None: - eval_logger.error("doc_to_image was called but not set in config") + return None else: doc_to_image = self.config.doc_to_image From aa6c50e8e092357d958cba030e3053c943048af8 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 10 Jul 2024 13:42:37 +0000 Subject: [PATCH 07/60] added yaml_path for config-loading --- lm_eval/tasks/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index b567a3d4b9..d307ba4659 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -165,12 +165,13 @@ def _load_individual_task_or_group( name_or_config: Optional[Union[str, dict]] = None, parent_name: Optional[str] = None, update_config: Optional[dict] = None, + yaml_path: Optional[str] = None, ) -> Mapping: - def _load_task(config, task): + def _load_task(config, task, yaml_path=None): if "include" in config: config = { **utils.load_yaml_config( - yaml_path=None, + yaml_path=yaml_path, yaml_config={"include": config.pop("include")}, mode="full", ), @@ -186,7 +187,6 @@ def _load_task(config, task): task_object.config.task = config["task"] else: task_object = ConfigurableTask(config=config) - return {task: task_object} def _get_group_and_subtask_from_config(config): @@ -228,6 +228,7 @@ def _process_group_config(config, update_config=None): group_name, subtask_list = _get_group_and_subtask_from_config( group_config ) + yaml_path = self._get_yaml_path(group_name.group) else: if self._name_is_tag(name_or_config): fn = partial( @@ -246,7 +247,8 @@ def _process_group_config(config, update_config=None): if isinstance(name_or_config, dict): if self._config_is_task(name_or_config): - name = name_or_config.pop("task") + # name = name_or_config.pop("task") + name = name_or_config["task"] if update_config is not None: name_or_config = {**name_or_config, **update_config} # If the name is registered as a group @@ -290,7 +292,7 @@ def _process_group_config(config, update_config=None): } else: task_config = name_or_config - return _load_task(task_config, task=name) + return _load_task(task_config, task=name, yaml_path=yaml_path) else: group_config, update_config = _process_group_config(name_or_config) group_name, subtask_list = _get_group_and_subtask_from_config( @@ -301,6 +303,7 @@ def _process_group_config(config, update_config=None): self._load_individual_task_or_group, parent_name=group_name, update_config=update_config, + yaml_path=yaml_path, ) return { group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list)))) @@ -558,5 +561,4 @@ def get_task_dict( # and we'd be unsure which to use and report.) # we explicitly check and error in this case. 
_check_duplicates(get_subtask_list(final_task_dict)) - return final_task_dict From 7c76574ba3e8bf15a49819d25faa92badbb0c7f0 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 10 Jul 2024 13:45:42 +0000 Subject: [PATCH 08/60] revert --- lm_eval/tasks/mmmu/business.yaml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/lm_eval/tasks/mmmu/business.yaml b/lm_eval/tasks/mmmu/business.yaml index bc5f26e00c..6075fcb7a8 100644 --- a/lm_eval/tasks/mmmu/business.yaml +++ b/lm_eval/tasks/mmmu/business.yaml @@ -1,23 +1,23 @@ group: mmmu_business group_alias: Business task: - # - task: mmmu_accounting - # include: _template_yaml - # task_alias: Accounting - # dataset_name: Accounting + - task: mmmu_accounting + include: _template_yaml + task_alias: Accounting + dataset_name: Accounting - task: mmmu_economics include: _template_yaml task_alias: Economics dataset_name: Economics - # - task: mmmu_finance - # include: _template_yaml - # task_alias: Finance - # dataset_name: Finance - # - task: mmmu_manage - # include: _template_yaml - # task_alias: Manage - # dataset_name: Manage - # - task: mmmu_marketing - # include: _template_yaml - # task_alias: Marketing - # dataset_name: Marketing + - task: mmmu_finance + include: _template_yaml + task_alias: Finance + dataset_name: Finance + - task: mmmu_manage + include: _template_yaml + task_alias: Manage + dataset_name: Manage + - task: mmmu_marketing + include: _template_yaml + task_alias: Marketing + dataset_name: Marketing From 1b9deaabf139030aacadc4d41e77e53c92ad3973 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 10 Jul 2024 13:46:05 +0000 Subject: [PATCH 09/60] add line to process str type v --- lm_eval/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 62d500ccb1..d5527f8eee 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -361,12 +361,14 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False) hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "") + v = "%.4f" % v if isinstance(v, float) else v + if m + "_stderr" + "," + f in dic: se = dic[m + "_stderr" + "," + f] se = " N/A" if se == "N/A" else "%.4f" % se - values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se]) + values.append([k, version, f, n, m, hib, v, "±", se]) else: - values.append([k, version, f, n, m, hib, "%.4f" % v, "", ""]) + values.append([k, version, f, n, m, hib, v, "", ""]) k = "" version = "" md_writer.value_matrix = values From 8db0a470cb54ba11f870eee886ea294b9c5f3f32 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 11 Jul 2024 13:52:42 +0000 Subject: [PATCH 10/60] update --- lm_eval/models/hf_vlms.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 43a726480e..d7278f559e 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -1,6 +1,7 @@ import copy from typing import List, Optional, Tuple, Union +import torch import transformers from tqdm import tqdm @@ -212,7 +213,15 @@ def _collate(x): ### Up to here: was identical to non-multimodal HFLM generate_until ### - for chunk in chunks: + for idx, _chunk in enumerate(chunks): + if idx == 0: + zero_chunk = _chunk + chunk = _chunk + elif idx == 69: + chunk = zero_chunk + else: + chunk = _chunk + chunk = _chunk contexts, all_gen_kwargs, aux_arguments = zip( *chunk ) # TODO: can we cut down further on number of distinct things we pass 
around? @@ -264,7 +273,7 @@ def _collate(x): self.device, self.model.dtype ) # TODO: factor out into a tok_batch_encode bit ; truncate from left using max_ctx_len - print(inputs) + # print(inputs) context_enc = inputs["input_ids"] @@ -272,7 +281,8 @@ def _collate(x): kwargs["max_length"] = context_enc.shape[1] + max_gen_toks cont = self._model_generate(inputs, stop=until, **gen_kwargs) - + # del inputs + # del _chunk ### essentially same as HFLM beyond this line! cont_toks_list = cont.tolist() From 9b9ca7bfcfc209ae9c2e42db6b11bd0ce782f3dd Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 11 Jul 2024 13:57:28 +0000 Subject: [PATCH 11/60] modeling cleanup --- lm_eval/models/hf_vlms.py | 114 ++++++++--------- lm_eval/tasks/__init__.py | 254 ++++++++++++-------------------------- 2 files changed, 139 insertions(+), 229 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 43a726480e..54a6d109b0 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -1,6 +1,7 @@ import copy -from typing import List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union +import torch import transformers from tqdm import tqdm @@ -102,51 +103,58 @@ def _create_tokenizer( # return encoding - # def tok_batch_encode( - # self, - # strings: List[str], - # padding_side: str = "left", - # left_truncate_len: int = None, - # truncation: bool = False, - # ) -> Tuple[torch.Tensor, torch.Tensor]: - # # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. - # old_padding_side = self.tokenizer.padding_side - # self.tokenizer.padding_side = padding_side - - # add_special_tokens = {} - # if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - # add_special_tokens = {"add_special_tokens": False or self.add_bos_token} - - # encoding = self.tokenizer( - # strings, - # truncation=truncation, - # padding="longest", - # return_tensors="pt", - # **add_special_tokens, - # ) - # if left_truncate_len: - # encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] - # encoding["attention_mask"] = encoding["attention_mask"][ - # :, -left_truncate_len: - # ] - # self.tokenizer.padding_side = old_padding_side + def tok_batch_encode( + self, + strings: List[str], # note that input signature of this fn is different + visuals, # TODO: typehint on this + padding_side: str = "left", + left_truncate_len: int = None, + truncation: bool = False, + ) -> Dict[ + str, torch.Tensor + ]: # TODO: note that this return signature differs from HFLM tok_batch_encode. + # TODO: we should allow + + # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. + old_padding_side = self.tokenizer.padding_side + self.tokenizer.padding_side = padding_side + + add_special_tokens = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + + encoding = self.processor( + strings, + truncation=truncation, + padding="longest", + return_tensors="pt", + **add_special_tokens, + ).to( + self.device, self.model.dtype + ) # TODO: casting to dtype seems odd for input_ids and attn_mask. 
+ if left_truncate_len: + encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] + encoding["attention_mask"] = encoding["attention_mask"][ + :, -left_truncate_len: + ] + self.tokenizer.padding_side = old_padding_side - # return encoding["input_ids"], encoding["attention_mask"] + return encoding # def tok_decode(self, tokens, skip_special_tokens=True): # return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) - def _model_generate(self, inputs, stop, **gen_kwargs): - # TODO: handle max_length + def _model_generate(self, inputs, max_length, stop, **generation_kwargs): # gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] - if "max_new_tokens" not in gen_kwargs: - gen_kwargs["max_new_tokens"] = 1024 - if "temperature" not in gen_kwargs: - gen_kwargs["temperature"] = 0 - if "top_p" not in gen_kwargs: - gen_kwargs["top_p"] = None - if "num_beams" not in gen_kwargs: - gen_kwargs["num_beams"] = 1 + generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0) + do_sample = generation_kwargs.get("do_sample", None) + + # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies + if generation_kwargs.get("temperature") == 0.0 and do_sample is None: + generation_kwargs["do_sample"] = do_sample = False + + if do_sample is False and generation_kwargs.get("temperature") == 0.0: + generation_kwargs.pop("temperature") stopping_criteria = stop_sequences_criteria( self.tokenizer, @@ -156,15 +164,11 @@ def _model_generate(self, inputs, stop, **gen_kwargs): ) return self.model.generate( **inputs, - # max_length=max_length, + max_length=max_length, stopping_criteria=stopping_criteria, - do_sample=True if gen_kwargs["temperature"] > 0 else False, - temperature=gen_kwargs["temperature"], - top_p=gen_kwargs["top_p"], - num_beams=gen_kwargs["num_beams"], - max_new_tokens=gen_kwargs["max_new_tokens"], + pad_token_id=self.tokenizer.pad_token_id, use_cache=True, - pad_token_id=self.tokenizer.eos_token_id, + **generation_kwargs, ) def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: @@ -257,21 +261,19 @@ def _collate(x): max_ctx_len = self.max_length - max_gen_toks # noqa: F841 # TODO: this assumes we are using a causal LM. is that always valid? shouldn't be - self.tokenizer.padding_side = "left" - inputs = self.processor( # TODO: write this as tok_batch_encode (and allow that to either take a visuals value or None) - images=visuals, text=contexts, return_tensors="pt", padding=True - ).to( - self.device, self.model.dtype - ) # TODO: factor out into a tok_batch_encode bit ; truncate from left using max_ctx_len - - print(inputs) + inputs = self.tok_batch_encode( + contexts, + visuals, + left_truncate_len=max_ctx_len, + truncation=self.truncation, + ).to(self.device, self.model.dtype) context_enc = inputs["input_ids"] if "max_length" not in kwargs: kwargs["max_length"] = context_enc.shape[1] + max_gen_toks - cont = self._model_generate(inputs, stop=until, **gen_kwargs) + cont = self._model_generate(inputs, stop=until, **kwargs) ### essentially same as HFLM beyond this line! 
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index b567a3d4b9..1fd63cdb81 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -1,17 +1,11 @@ import collections -import inspect import logging import os from functools import partial from typing import Dict, List, Mapping, Optional, Union from lm_eval import utils -from lm_eval.api.group import ConfigurableGroup, GroupConfig from lm_eval.api.task import ConfigurableTask, Task -from lm_eval.evaluator_utils import get_subtask_list - - -GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys()) class TaskManager: @@ -86,12 +80,7 @@ def _name_is_registered(self, name) -> bool: return False def _name_is_task(self, name) -> bool: - if self._name_is_registered(name) and (self.task_index[name]["type"] == "task"): - return True - return False - - def _name_is_tag(self, name) -> bool: - if self._name_is_registered(name) and (self.task_index[name]["type"] == "tag"): + if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]): return True return False @@ -152,126 +141,89 @@ def _process_alias(self, config, group=None): config["group_alias"] = None return config - def _class_has_config_in_constructor(self, cls): - constructor = getattr(cls, "__init__", None) - return ( - "config" in inspect.signature(constructor).parameters - if constructor - else False - ) - def _load_individual_task_or_group( self, name_or_config: Optional[Union[str, dict]] = None, parent_name: Optional[str] = None, update_config: Optional[dict] = None, + yaml_path: Optional[str] = None, ) -> Mapping: - def _load_task(config, task): + def load_task(config, task, group=None, yaml_path=None): if "include" in config: + if yaml_path is None: + raise ValueError config = { **utils.load_yaml_config( - yaml_path=None, + yaml_path, yaml_config={"include": config.pop("include")}, mode="full", ), **config, } if self._config_is_python_task(config): - if self._class_has_config_in_constructor(config["class"]): - task_object = config["class"](config=config) - else: - task_object = config["class"]() - if isinstance(task_object, ConfigurableTask): - # very scuffed: set task name here. TODO: fixme? 
- task_object.config.task = config["task"] + task_object = config["class"]() else: + config = self._process_alias(config, group=group) task_object = ConfigurableTask(config=config) - + if group is not None: + task_object = (group, task_object) return {task: task_object} - def _get_group_and_subtask_from_config(config): - group_name = ConfigurableGroup(config=config) - subtask_list = [] - for task in group_name.config["task"]: - if isinstance(task, str) and self._name_is_tag(task): - subtask_list.extend(self._get_tasklist(task)) - else: - subtask_list.append(task) - return group_name, subtask_list - - def _process_group_config(config, update_config=None): - if update_config is not None: - config = {**config, **update_config} - _update_config = { - k: v for k, v in config.items() if k not in GROUP_ONLY_KEYS - } - if not bool(_update_config): - _update_config = None - - group_config = {k: v for k, v in config.items() if k in GROUP_ONLY_KEYS} - return group_config, _update_config - if isinstance(name_or_config, str): if update_config is not None: # Process name_or_config as a dict instead name_or_config = {"task": name_or_config, **update_config} - elif self._name_is_task(name_or_config) or self._name_is_python_task( - name_or_config - ): + elif self._name_is_task(name_or_config): task_config = self._get_config(name_or_config) - return _load_task(task_config, task=name_or_config) + return load_task(task_config, task=name_or_config, group=parent_name) else: + group_name = name_or_config subtask_list = self._get_tasklist(name_or_config) if subtask_list == -1: group_config = self._get_config(name_or_config) - group_config, update_config = _process_group_config(group_config) - group_name, subtask_list = _get_group_and_subtask_from_config( - group_config - ) - else: - if self._name_is_tag(name_or_config): - fn = partial( - self._load_individual_task_or_group, - update_config=name_or_config - if isinstance(name_or_config, dict) - else None, - ) - return dict( - collections.ChainMap(*map(fn, reversed(subtask_list))) - ) - else: - group_name = ConfigurableGroup( - config={"group": name_or_config, "task": subtask_list} - ) + subtask_list = group_config["task"] + + # This checks if we're at the root. 
+ if parent_name is None: + group_config = self._get_config(name_or_config) + if set(group_config.keys()) > {"task", "group"}: + update_config = { + k: v + for k, v in group_config.items() + if k not in ["task", "group"] + } + yaml_path = self._get_yaml_path(group_name) + + if (update_config is not None) and ("group_alias" in update_config): + group_name = update_config["group_alias"] + update_config.pop("group_alias") if isinstance(name_or_config, dict): + if update_config is not None: + name_or_config = { + **name_or_config, + **update_config, + } + if self._config_is_task(name_or_config): - name = name_or_config.pop("task") - if update_config is not None: - name_or_config = {**name_or_config, **update_config} + name = name_or_config["task"] # If the name is registered as a group + # if self._name_is_task(name) is False: if self._name_is_group(name): - group_config = self._get_config(name) - - group_config, update_config = _process_group_config( - group_config, name_or_config - ) - group_name, subtask_list = _get_group_and_subtask_from_config( - group_config - ) - elif self._name_is_tag(name): + group_name = name + update_config = { + k: v for k, v in name_or_config.items() if k != "task" + } subtask_list = self._get_tasklist(name) - fn = partial( - self._load_individual_task_or_group, - update_config=name_or_config, - ) - return dict(collections.ChainMap(*map(fn, reversed(subtask_list)))) + if subtask_list == -1: + subtask_list = self._get_config(name)["task"] else: if self._name_is_registered(name): base_task_config = self._get_config(name) # Check if this is a duplicate. if parent_name is not None: + name_or_config["group"] = parent_name num_duplicate = len( list( filter( @@ -290,21 +242,34 @@ def _process_group_config(config, update_config=None): } else: task_config = name_or_config - return _load_task(task_config, task=name) + return load_task( + task_config, task=name, group=parent_name, yaml_path=yaml_path + ) else: - group_config, update_config = _process_group_config(name_or_config) - group_name, subtask_list = _get_group_and_subtask_from_config( - group_config - ) + group_name = name_or_config["group"] + subtask_list = name_or_config["task"] + if set(name_or_config.keys()) > {"task", "group"}: + update_config = { + k: v + for k, v in name_or_config.items() + if k not in ["task", "group"] + } + + all_subtasks = {} + if parent_name is not None: + all_subtasks = {group_name: (parent_name, None)} fn = partial( self._load_individual_task_or_group, parent_name=group_name, update_config=update_config, + yaml_path=yaml_path, ) - return { - group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list)))) + all_subtasks = { + **all_subtasks, + **dict(collections.ChainMap(*map(fn, subtask_list))), } + return all_subtasks def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict: """Loads a dictionary of task objects from a list @@ -328,11 +293,10 @@ def load_config(self, config: Dict): def _get_task_and_group(self, task_dir: str): """Creates a dictionary of tasks index with the following metadata, - - `type`, that can be either `task`, `python_task`, `group` or `tags`. + - `type`, that can be either `task`, `python_task`, or `group`. `task` refer to regular task configs, `python_task` are special yaml files that only consists of `task` and `class` parameters. - `group` are group configs. `tags` are labels that can be assigned - to tasks to assist in sorting and calling tasks of certain themes. + `group` are group configs. 
- `yaml_path`, path to the yaml file. If the entry is a `group` that was configured through a task config, the yaml_path will be -1 and all subtasks will be listed in `task` (see below) @@ -348,8 +312,6 @@ def _get_task_and_group(self, task_dir: str): :return Dictionary of task names as key and task metadata """ - # TODO: remove group in next release - print_info = True ignore_dirs = [ "__pycache__", ".ipynb_checkpoints", @@ -396,38 +358,20 @@ def _get_task_and_group(self, task_dir: str): "yaml_path": yaml_path, } - # TODO: remove group in next release - for attr in ["tag", "group"]: - if attr in config: - if attr == "group" and print_info: - self.logger.info( - "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. " - "`tag` will be used to allow to call a collection of tasks just like `group`. " - "`group` will be removed in order to not cause confusion with the new ConfigurableGroup " - "which will be the offical way to create groups with addition of group-wide configuations." - ) - print_info = False - # attr = "tag" - - attr_list = config[attr] - if isinstance(attr_list, str): - attr_list = [attr_list] - - for tag in attr_list: - if tag not in tasks_and_groups: - tasks_and_groups[tag] = { - "type": "tag", - "task": [task], - "yaml_path": -1, - } - elif tasks_and_groups[tag]["type"] != "tag": - self.logger.info( - f"The tag {tag} is already registered as a group, this tag will not be registered. " - "This may affect tasks you want to call." - ) - break - else: - tasks_and_groups[tag]["task"].append(task) + if "group" in config: + groups = config["group"] + if isinstance(config["group"], str): + groups = [groups] + + for group in groups: + if group not in tasks_and_groups: + tasks_and_groups[group] = { + "type": "group", + "task": [task], + "yaml_path": -1, + } + else: + tasks_and_groups[group]["task"].append(task) else: self.logger.debug(f"File {f} in {root} could not be loaded") @@ -456,33 +400,6 @@ def get_task_name_from_object(task_object): ) -def _check_duplicates(task_dict: dict) -> List[str]: - """helper function solely used in validating get_task_dict output. - Takes the output of lm_eval.evaluator_utils.get_subtask_list and - returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are - "oversubscribed" to several disjoint groups. - """ - subtask_names = [] - for key, value in task_dict.items(): - subtask_names.extend(value) - - duplicate_tasks = { - task_name for task_name in subtask_names if subtask_names.count(task_name) > 1 - } - - # locate the potentially problematic groups that seem to 'compete' for constituent subtasks - competing_groups = [ - group - for group in task_dict.keys() - if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0 - ] - - if len(duplicate_tasks) > 0: - raise ValueError( - f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs." 
- ) - - def get_task_dict( task_name_list: Union[str, List[Union[str, Dict, Task]]], task_manager: Optional[TaskManager] = None, @@ -500,7 +417,6 @@ def get_task_dict( :return Dictionary of task objects """ - task_name_from_string_dict = {} task_name_from_config_dict = {} task_name_from_object_dict = {} @@ -547,16 +463,8 @@ def get_task_dict( ): raise ValueError - final_task_dict = { + return { **task_name_from_string_dict, **task_name_from_config_dict, **task_name_from_object_dict, } - - # behavior can get odd if one tries to invoke several groups that "compete" for the same task. - # (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask - # and we'd be unsure which to use and report.) - # we explicitly check and error in this case. - _check_duplicates(get_subtask_list(final_task_dict)) - - return final_task_dict From 8d92a6827a929726dfbdc4b250ccb3bc9cfcf03b Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 2 Aug 2024 14:56:12 +0000 Subject: [PATCH 12/60] add aggregation for mmmu --- lm_eval/tasks/mmmu/_template_yaml | 6 +++--- lm_eval/tasks/mmmu/art_and_design.yaml | 4 ++++ lm_eval/tasks/mmmu/business.yaml | 4 ++++ lm_eval/tasks/mmmu/health_and_medicine.yaml | 4 ++++ lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml | 4 ++++ lm_eval/tasks/mmmu/mmmu.yaml | 4 ++++ lm_eval/tasks/mmmu/science.yaml | 4 ++++ lm_eval/tasks/mmmu/tech_and_engineering.yaml | 4 ++++ 8 files changed, 31 insertions(+), 3 deletions(-) diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index 0230299af0..7408fb5fac 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -11,9 +11,9 @@ generation_kwargs: temperature: 1.0 do_sample: false max_gen_toks: 512 - top_p: 0.94 - repetition_penalty: 1.0 - image_aspect_ratio: original + # top_p: 0.94 + # repetition_penalty: 1.0 + # image_aspect_ratio: original metric_list: - metric: mmmu_acc aggregation: mean diff --git a/lm_eval/tasks/mmmu/art_and_design.yaml b/lm_eval/tasks/mmmu/art_and_design.yaml index 700f1df904..9d3d2f002a 100644 --- a/lm_eval/tasks/mmmu/art_and_design.yaml +++ b/lm_eval/tasks/mmmu/art_and_design.yaml @@ -17,3 +17,7 @@ task: include: _template_yaml task_alias: Music dataset_name: Music +aggregate_metric_list: + - metric: mmmu_acc + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/mmmu/business.yaml b/lm_eval/tasks/mmmu/business.yaml index 6075fcb7a8..9b7995e627 100644 --- a/lm_eval/tasks/mmmu/business.yaml +++ b/lm_eval/tasks/mmmu/business.yaml @@ -21,3 +21,7 @@ task: include: _template_yaml task_alias: Marketing dataset_name: Marketing +aggregate_metric_list: + - metric: mmmu_acc + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/mmmu/health_and_medicine.yaml b/lm_eval/tasks/mmmu/health_and_medicine.yaml index 9616c99de2..17a0923b2d 100644 --- a/lm_eval/tasks/mmmu/health_and_medicine.yaml +++ b/lm_eval/tasks/mmmu/health_and_medicine.yaml @@ -21,3 +21,7 @@ task: include: _template_yaml task_alias: Public Health dataset_name: Public_Health +aggregate_metric_list: + - metric: mmmu_acc + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml b/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml index 583997c686..a319401f25 100644 --- a/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml +++ b/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml @@ -17,3 +17,7 @@ task: include: _template_yaml task_alias: Psychology dataset_name: 
Psychology +aggregate_metric_list: + - metric: mmmu_acc + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/mmmu/mmmu.yaml b/lm_eval/tasks/mmmu/mmmu.yaml index 8b490634eb..e98cc83539 100644 --- a/lm_eval/tasks/mmmu/mmmu.yaml +++ b/lm_eval/tasks/mmmu/mmmu.yaml @@ -6,3 +6,7 @@ task: - mmmu_humanities_and_social_science - mmmu_science - mmmu_tech_and_engineering +aggregate_metric_list: + - metric: mmmu_acc + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/mmmu/science.yaml b/lm_eval/tasks/mmmu/science.yaml index 09bfcf76c0..81181872e8 100644 --- a/lm_eval/tasks/mmmu/science.yaml +++ b/lm_eval/tasks/mmmu/science.yaml @@ -21,3 +21,7 @@ task: include: _template_yaml task_alias: Physics dataset_name: Physics +aggregate_metric_list: + - metric: mmmu_acc + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/mmmu/tech_and_engineering.yaml b/lm_eval/tasks/mmmu/tech_and_engineering.yaml index d5bf652d00..1e9e2d3b1a 100644 --- a/lm_eval/tasks/mmmu/tech_and_engineering.yaml +++ b/lm_eval/tasks/mmmu/tech_and_engineering.yaml @@ -29,3 +29,7 @@ task: include: _template_yaml task_alias: Mechanical Engineering dataset_name: Mechanical_Engineering +aggregate_metric_list: + - metric: mmmu_acc + aggregation: mean + weight_by_size: true From f410d35fbc5ee99c58a9cb5c7b2cceefc883ec3a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 7 Aug 2024 19:52:00 +0000 Subject: [PATCH 13/60] rewrite MMMU processing code based on only MMMU authors' repo (doc_to_image still WIP) --- lm_eval/tasks/mmmu/_template_yaml | 13 +- lm_eval/tasks/mmmu/art_and_design.yaml | 2 +- lm_eval/tasks/mmmu/business.yaml | 2 +- lm_eval/tasks/mmmu/health_and_medicine.yaml | 2 +- .../mmmu/humanities_and_social_sciences.yaml | 2 +- lm_eval/tasks/mmmu/mmmu.yaml | 2 +- lm_eval/tasks/mmmu/new_utils.py | 348 ++++++++++++++++++ lm_eval/tasks/mmmu/science.yaml | 2 +- lm_eval/tasks/mmmu/tech_and_engineering.yaml | 2 +- 9 files changed, 360 insertions(+), 15 deletions(-) create mode 100644 lm_eval/tasks/mmmu/new_utils.py diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index 7408fb5fac..e7daa95323 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -1,20 +1,17 @@ dataset_path: MMMU/MMMU validation_split: validation output_type: generate_until -doc_to_image: !function utils.doc_to_image -doc_to_text: !function utils.doc_to_text +doc_to_image: !function new_utils.doc_to_image +doc_to_text: !function new_utils.doc_to_text doc_to_target: "answer" -process_results: !function utils.process_results +process_results: !function new_utils.process_results generation_kwargs: until: - "<|endoftext|>" - temperature: 1.0 + temperature: 0.0 do_sample: false max_gen_toks: 512 - # top_p: 0.94 - # repetition_penalty: 1.0 - # image_aspect_ratio: original metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean higher_is_better: true diff --git a/lm_eval/tasks/mmmu/art_and_design.yaml b/lm_eval/tasks/mmmu/art_and_design.yaml index 9d3d2f002a..c3c57db940 100644 --- a/lm_eval/tasks/mmmu/art_and_design.yaml +++ b/lm_eval/tasks/mmmu/art_and_design.yaml @@ -18,6 +18,6 @@ task: task_alias: Music dataset_name: Music aggregate_metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean weight_by_size: true diff --git a/lm_eval/tasks/mmmu/business.yaml b/lm_eval/tasks/mmmu/business.yaml index 9b7995e627..bb2db84438 100644 --- a/lm_eval/tasks/mmmu/business.yaml +++ b/lm_eval/tasks/mmmu/business.yaml @@ -22,6 +22,6 @@ task: task_alias: 
Marketing dataset_name: Marketing aggregate_metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean weight_by_size: true diff --git a/lm_eval/tasks/mmmu/health_and_medicine.yaml b/lm_eval/tasks/mmmu/health_and_medicine.yaml index 17a0923b2d..66d717cd14 100644 --- a/lm_eval/tasks/mmmu/health_and_medicine.yaml +++ b/lm_eval/tasks/mmmu/health_and_medicine.yaml @@ -22,6 +22,6 @@ task: task_alias: Public Health dataset_name: Public_Health aggregate_metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean weight_by_size: true diff --git a/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml b/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml index a319401f25..32bc2fd3ae 100644 --- a/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml +++ b/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml @@ -18,6 +18,6 @@ task: task_alias: Psychology dataset_name: Psychology aggregate_metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean weight_by_size: true diff --git a/lm_eval/tasks/mmmu/mmmu.yaml b/lm_eval/tasks/mmmu/mmmu.yaml index e98cc83539..3fbf0dde94 100644 --- a/lm_eval/tasks/mmmu/mmmu.yaml +++ b/lm_eval/tasks/mmmu/mmmu.yaml @@ -7,6 +7,6 @@ task: - mmmu_science - mmmu_tech_and_engineering aggregate_metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean weight_by_size: true diff --git a/lm_eval/tasks/mmmu/new_utils.py b/lm_eval/tasks/mmmu/new_utils.py new file mode 100644 index 0000000000..932a00e8a3 --- /dev/null +++ b/lm_eval/tasks/mmmu/new_utils.py @@ -0,0 +1,348 @@ +import ast +import random +import re + +import numpy as np + + +random.seed(42) + + +# source for prompt fstrings: https://github.com/MMMU-Benchmark/MMMU/blob/7787d60648c82a9d40acd656fa541a6c74f58995/eval/configs/llava1.5.yaml#L3 +MULTI_CHOICE_EXAMPLE_FORMAT = """{} + +{} + +Answer with the option's letter from the given choices directly.""" + + +SHORT_ANS_EXAMPLE_FORMAT = """{} + +Answer the question using a single word or phrase.""" + +START_CHR = "A" + + +def doc_to_image(doc): + return "" + + +def doc_to_text(doc): + """Get the prompt for a given document.""" + + if doc["question_type"] == "multiple-choice": + choices_str = "" + + for i, choice in enumerate(ast.literal_eval(doc["options"])): + # add (A) {choice1}\n , (B) {choice2}\n , and so on + # to create the list of formatted choices in the prompt + choices_str += f"\n({chr(ord(START_CHR) + i)}) {choice}" + + choices_str = ( + choices_str.lstrip() + ) # remove the extraneous prepended \n that we added + + prompt = MULTI_CHOICE_EXAMPLE_FORMAT.format(doc["question"], choices_str) + else: + prompt = SHORT_ANS_EXAMPLE_FORMAT.format(doc["question"]) + + for i in range(1, 8): + # replace with . TODO: check this is always the right decision incl. 
for non-HF models + prompt = prompt.replace(f"", "") + print(prompt) + + return prompt + + +def process_results(doc, results): + if doc["question_type"] == "multiple-choice": + # multichoice logic + option_strs = ast.literal_eval(doc["options"]) + option_letters = ["A", "B", "C", "D"] + + all_choices = option_letters[: len(option_strs)] + index2ans = {index: ans for index, ans in zip(option_letters, option_strs)} + + pred = parse_multi_choice_response(results[0], all_choices, index2ans) + print(pred, all_choices, index2ans) + is_correct = eval_multi_choice(doc["answer"], pred) + else: + pred = parse_open_response(results[0]) + is_correct = eval_open(doc["answer"], pred) + # freeform response handling + + return {"acc": float(is_correct)} + + # TODO: it would be better if we could use a Filter for this logic. + + +### Output parsing and answer selection taken from +### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/data_utils.py +### and +### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/eval_utils.py + + +# ----------- Process Multi-choice ------------- +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D. + """ + for char in [",", ".", "!", "?", ";", ":", "'"]: + response = response.strip(char) + response = " " + response + " " # add space to avoid partial match + + index_ans = True + ans_with_brack = False + candidates = [] + for choice in all_choices: # e.g., (A) (B) (C) (D) + if f"({choice})" in response: + candidates.append(choice) + ans_with_brack = True + + if len(candidates) == 0: + for choice in all_choices: # e.g., A B C D + if f" {choice} " in response: + candidates.append(choice) + + # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example + if len(candidates) == 0 and len(response.split()) > 5: + for index, ans in index2ans.items(): + if ans.lower() in response.lower(): + candidates.append(index) + index_ans = False # it's content ans. + + if len(candidates) == 0: # still not get answer, randomly choose one. + pred_index = random.choice(all_choices) + elif len(candidates) > 1: + start_indexes = [] + if index_ans: + if ans_with_brack: + for can in candidates: + index = response.rfind(f"({can})") + start_indexes.append(index) # -1 will be ignored anyway + # start_indexes = [generated_response.index(f'({can})') for can in candidates] + else: + for can in candidates: + index = response.rfind(f" {can} ") + start_indexes.append(index) + else: + for can in candidates: + index = response.lower().rfind(index2ans[can].lower()) + start_indexes.append(index) + # get the last one + pred_index = candidates[np.argmax(start_indexes)] + else: # if only one candidate, use it. + pred_index = candidates[0] + + print(response, all_choices, index2ans, pred_index) + + return pred_index + + +# ----------- Process Open ------------- +def check_is_number(string): + """ + Check if the given string a number. + """ + try: + float(string.replace(",", "")) + return True + except ValueError: + # check if there's comma inside + return False + + +def normalize_str(string): + """ + Normalize the str to lower case and make them float numbers if possible. + """ + # check if characters in the string + + # if number, numerize it. 
+ string = string.strip() + + is_number = check_is_number(string) + + if is_number: + string = string.replace(",", "") + string = float(string) + # leave 2 decimal + string = round(string, 2) + return [string] + else: # it's likely to be a string + # lower it + string = string.lower() + if len(string) == 1: + return [" " + string, string + " "] # avoid trivial matches + return [string] + + +def extract_numbers(string): + """ + Exact all forms of numbers from a string with regex. + """ + # Pattern for numbers with commas + pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" + # Pattern for scientific notation + pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" + # Pattern for simple numbers without commas + pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" + + # Extract numbers with commas + numbers_with_commas = re.findall(pattern_commas, string) + # Extract numbers in scientific notation + numbers_scientific = re.findall(pattern_scientific, string) + # Extract simple numbers without commas + numbers_simple = re.findall(pattern_simple, string) + + # Combine all extracted numbers + all_numbers = numbers_with_commas + numbers_scientific + numbers_simple + return all_numbers + + +def parse_open_response(response): + """ + Parse the prediction from the generated response. + Return a list of predicted strings or numbers. + """ + + # content = content.strip("\n").strip(".").strip(" ") + def get_key_subresponses(response): + key_responses = [] + response = response.strip().strip(".").lower() + sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response) + indicators_of_keys = [ + "could be ", + "so ", + "is ", + "thus ", + "therefore ", + "final ", + "answer ", + "result ", + ] + key_responses = [] + for index, resp in enumerate(sub_responses): + # if last one, accept it's an equation (the entire response can be just one sentence with equation) + if index == len(sub_responses) - 1: + indicators_of_keys.extend(["="]) + shortest_key_response = None # the shortest response that may contain the answer (tail part of the response) + for indicator in indicators_of_keys: + if indicator in resp: + if not shortest_key_response: + shortest_key_response = resp.split(indicator)[-1].strip() + else: + if len(resp.split(indicator)[-1].strip()) < len( + shortest_key_response + ): + shortest_key_response = resp.split(indicator)[-1].strip() + # key_responses.append(resp.split(indicator)[1].strip()) + + if shortest_key_response: + # and it's not trivial + if shortest_key_response.strip() not in [ + ":", + ",", + ".", + "!", + "?", + ";", + ":", + "'", + ]: + key_responses.append(shortest_key_response) + if len(key_responses) == 0: # did not found any + return [response] + return key_responses + + # pdb.set_trace() + key_responses = get_key_subresponses(response) + + pred_list = key_responses.copy() # keep the original string response + for resp in key_responses: + pred_list.extend(extract_numbers(resp)) + + tmp_pred_list = [] + for i in range(len(pred_list)): + tmp_pred_list.extend(normalize_str(pred_list[i])) + pred_list = tmp_pred_list + + # remove duplicates + pred_list = list(set(pred_list)) + + return pred_list + + +# ----------- Evaluation ------------- + + +def eval_multi_choice(gold_i, pred_i): + """ + Evaluate a multiple choice instance. 
+ """ + correct = False + # only they are exactly the same, we consider it as correct + if isinstance(gold_i, list): + for answer in gold_i: + if answer == pred_i: + correct = True + break + else: # gold_i is a string + if gold_i == pred_i: + correct = True + return correct + + +def eval_open(gold_i, pred_i): + """ + Evaluate an open question instance + """ + correct = False + if isinstance(gold_i, list): + # use float to avoid trivial matches + norm_answers = [] + for answer in gold_i: + norm_answers.extend(normalize_str(answer)) + else: + norm_answers = normalize_str(gold_i) + for pred in pred_i: # pred is already normalized in parse response phase + if isinstance(pred, str): # if it's a string, then find if ans in the pred_i + for norm_ans in norm_answers: + # only see if the string answer in the string pred + if isinstance(norm_ans, str) and norm_ans in pred: + if not correct: + correct = True + break + else: # it's a float number + if pred in norm_answers: + if not correct: + correct = True + break + return correct + + +# ----------- Batch Evaluation ------------- +def evaluate(samples): + """ + Batch evaluation for multiple choice and open questions. + """ + pred_correct = 0 + judge_dict = dict() + for sample in samples: + gold_i = sample["answer"] + pred_i = sample["parsed_pred"] + if sample["question_type"] == "multiple-choice": + correct = eval_multi_choice(gold_i, pred_i) + else: # open question + correct = eval_open(gold_i, pred_i) + + if correct: + judge_dict[sample["id"]] = "Correct" + pred_correct += 1 + else: + judge_dict[sample["id"]] = "Wrong" + + if len(samples) == 0: + return {"acc": 0} + return judge_dict, {"acc": pred_correct / len(samples)} diff --git a/lm_eval/tasks/mmmu/science.yaml b/lm_eval/tasks/mmmu/science.yaml index 81181872e8..cc4bce9bfe 100644 --- a/lm_eval/tasks/mmmu/science.yaml +++ b/lm_eval/tasks/mmmu/science.yaml @@ -22,6 +22,6 @@ task: task_alias: Physics dataset_name: Physics aggregate_metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean weight_by_size: true diff --git a/lm_eval/tasks/mmmu/tech_and_engineering.yaml b/lm_eval/tasks/mmmu/tech_and_engineering.yaml index 1e9e2d3b1a..438d7c2d3f 100644 --- a/lm_eval/tasks/mmmu/tech_and_engineering.yaml +++ b/lm_eval/tasks/mmmu/tech_and_engineering.yaml @@ -30,6 +30,6 @@ task: task_alias: Mechanical Engineering dataset_name: Mechanical_Engineering aggregate_metric_list: - - metric: mmmu_acc + - metric: acc aggregation: mean weight_by_size: true From ebf54d843f6a8e8818937439cbd5401b7ab91f2e Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 8 Aug 2024 13:20:11 +0000 Subject: [PATCH 14/60] implemented doc_to_image --- lm_eval/tasks/mmmu/new_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/mmmu/new_utils.py b/lm_eval/tasks/mmmu/new_utils.py index 932a00e8a3..f2d281d405 100644 --- a/lm_eval/tasks/mmmu/new_utils.py +++ b/lm_eval/tasks/mmmu/new_utils.py @@ -24,7 +24,10 @@ def doc_to_image(doc): - return "" + question = doc["question"] + image_list = re.findall(r"", question) + image_list = [image_list.strip("<>").replace(" ", "_") for image in image_list] + return [doc[image].convert("RGB") for image in image_list] def doc_to_text(doc): From 941b502c397903285eb41a7adf786243493e615b Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 8 Aug 2024 13:51:40 +0000 Subject: [PATCH 15/60] update doc_to_image to accept list of features --- lm_eval/api/task.py | 17 +- lm_eval/tasks/mmmu/_template_yaml | 6 +- lm_eval/tasks/mmmu/new_utils.py | 351 
------------------------------ lm_eval/tasks/mmmu/utils.py | 343 +++++++++++++---------------- 4 files changed, 165 insertions(+), 552 deletions(-) delete mode 100644 lm_eval/tasks/mmmu/new_utils.py diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 3423ccbd04..2fa82c2053 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1278,13 +1278,20 @@ def doc_to_choice(self, doc: Any) -> List[str]: else: raise TypeError - def doc_to_image(self, doc: Any) -> Union[int, str, list]: - if self.config.doc_to_image is None: - return None - else: + def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]: + if doc_to_image is not None: + doc_to_image = doc_to_image + elif self.config.doc_to_image is not None: doc_to_image = self.config.doc_to_image + else: + return None - if isinstance(self.config.doc_to_image, str): + if isinstance(doc_to_image, list): + image_feature = [ + self.doc_to_image(doc, feature) for feature in doc_to_image + ] + return [feature for feature in image_feature if feature is not None] + elif isinstance(doc_to_image, str): if doc_to_image in self.features: return doc[doc_to_image] else: diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index e7daa95323..bf13112818 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -1,10 +1,10 @@ dataset_path: MMMU/MMMU validation_split: validation output_type: generate_until -doc_to_image: !function new_utils.doc_to_image -doc_to_text: !function new_utils.doc_to_text +doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] +doc_to_text: !function utils.doc_to_text doc_to_target: "answer" -process_results: !function new_utils.process_results +process_results: !function utils.process_results generation_kwargs: until: - "<|endoftext|>" diff --git a/lm_eval/tasks/mmmu/new_utils.py b/lm_eval/tasks/mmmu/new_utils.py deleted file mode 100644 index f2d281d405..0000000000 --- a/lm_eval/tasks/mmmu/new_utils.py +++ /dev/null @@ -1,351 +0,0 @@ -import ast -import random -import re - -import numpy as np - - -random.seed(42) - - -# source for prompt fstrings: https://github.com/MMMU-Benchmark/MMMU/blob/7787d60648c82a9d40acd656fa541a6c74f58995/eval/configs/llava1.5.yaml#L3 -MULTI_CHOICE_EXAMPLE_FORMAT = """{} - -{} - -Answer with the option's letter from the given choices directly.""" - - -SHORT_ANS_EXAMPLE_FORMAT = """{} - -Answer the question using a single word or phrase.""" - -START_CHR = "A" - - -def doc_to_image(doc): - question = doc["question"] - image_list = re.findall(r"", question) - image_list = [image_list.strip("<>").replace(" ", "_") for image in image_list] - return [doc[image].convert("RGB") for image in image_list] - - -def doc_to_text(doc): - """Get the prompt for a given document.""" - - if doc["question_type"] == "multiple-choice": - choices_str = "" - - for i, choice in enumerate(ast.literal_eval(doc["options"])): - # add (A) {choice1}\n , (B) {choice2}\n , and so on - # to create the list of formatted choices in the prompt - choices_str += f"\n({chr(ord(START_CHR) + i)}) {choice}" - - choices_str = ( - choices_str.lstrip() - ) # remove the extraneous prepended \n that we added - - prompt = MULTI_CHOICE_EXAMPLE_FORMAT.format(doc["question"], choices_str) - else: - prompt = SHORT_ANS_EXAMPLE_FORMAT.format(doc["question"]) - - for i in range(1, 8): - # replace with . TODO: check this is always the right decision incl. 
for non-HF models - prompt = prompt.replace(f"", "") - print(prompt) - - return prompt - - -def process_results(doc, results): - if doc["question_type"] == "multiple-choice": - # multichoice logic - option_strs = ast.literal_eval(doc["options"]) - option_letters = ["A", "B", "C", "D"] - - all_choices = option_letters[: len(option_strs)] - index2ans = {index: ans for index, ans in zip(option_letters, option_strs)} - - pred = parse_multi_choice_response(results[0], all_choices, index2ans) - print(pred, all_choices, index2ans) - is_correct = eval_multi_choice(doc["answer"], pred) - else: - pred = parse_open_response(results[0]) - is_correct = eval_open(doc["answer"], pred) - # freeform response handling - - return {"acc": float(is_correct)} - - # TODO: it would be better if we could use a Filter for this logic. - - -### Output parsing and answer selection taken from -### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/data_utils.py -### and -### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/eval_utils.py - - -# ----------- Process Multi-choice ------------- -def parse_multi_choice_response(response, all_choices, index2ans): - """ - Parse the prediction from the generated response. - Return the predicted index e.g., A, B, C, D. - """ - for char in [",", ".", "!", "?", ";", ":", "'"]: - response = response.strip(char) - response = " " + response + " " # add space to avoid partial match - - index_ans = True - ans_with_brack = False - candidates = [] - for choice in all_choices: # e.g., (A) (B) (C) (D) - if f"({choice})" in response: - candidates.append(choice) - ans_with_brack = True - - if len(candidates) == 0: - for choice in all_choices: # e.g., A B C D - if f" {choice} " in response: - candidates.append(choice) - - # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example - if len(candidates) == 0 and len(response.split()) > 5: - for index, ans in index2ans.items(): - if ans.lower() in response.lower(): - candidates.append(index) - index_ans = False # it's content ans. - - if len(candidates) == 0: # still not get answer, randomly choose one. - pred_index = random.choice(all_choices) - elif len(candidates) > 1: - start_indexes = [] - if index_ans: - if ans_with_brack: - for can in candidates: - index = response.rfind(f"({can})") - start_indexes.append(index) # -1 will be ignored anyway - # start_indexes = [generated_response.index(f'({can})') for can in candidates] - else: - for can in candidates: - index = response.rfind(f" {can} ") - start_indexes.append(index) - else: - for can in candidates: - index = response.lower().rfind(index2ans[can].lower()) - start_indexes.append(index) - # get the last one - pred_index = candidates[np.argmax(start_indexes)] - else: # if only one candidate, use it. - pred_index = candidates[0] - - print(response, all_choices, index2ans, pred_index) - - return pred_index - - -# ----------- Process Open ------------- -def check_is_number(string): - """ - Check if the given string a number. - """ - try: - float(string.replace(",", "")) - return True - except ValueError: - # check if there's comma inside - return False - - -def normalize_str(string): - """ - Normalize the str to lower case and make them float numbers if possible. - """ - # check if characters in the string - - # if number, numerize it. 
- string = string.strip() - - is_number = check_is_number(string) - - if is_number: - string = string.replace(",", "") - string = float(string) - # leave 2 decimal - string = round(string, 2) - return [string] - else: # it's likely to be a string - # lower it - string = string.lower() - if len(string) == 1: - return [" " + string, string + " "] # avoid trivial matches - return [string] - - -def extract_numbers(string): - """ - Exact all forms of numbers from a string with regex. - """ - # Pattern for numbers with commas - pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" - # Pattern for scientific notation - pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" - # Pattern for simple numbers without commas - pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" - - # Extract numbers with commas - numbers_with_commas = re.findall(pattern_commas, string) - # Extract numbers in scientific notation - numbers_scientific = re.findall(pattern_scientific, string) - # Extract simple numbers without commas - numbers_simple = re.findall(pattern_simple, string) - - # Combine all extracted numbers - all_numbers = numbers_with_commas + numbers_scientific + numbers_simple - return all_numbers - - -def parse_open_response(response): - """ - Parse the prediction from the generated response. - Return a list of predicted strings or numbers. - """ - - # content = content.strip("\n").strip(".").strip(" ") - def get_key_subresponses(response): - key_responses = [] - response = response.strip().strip(".").lower() - sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response) - indicators_of_keys = [ - "could be ", - "so ", - "is ", - "thus ", - "therefore ", - "final ", - "answer ", - "result ", - ] - key_responses = [] - for index, resp in enumerate(sub_responses): - # if last one, accept it's an equation (the entire response can be just one sentence with equation) - if index == len(sub_responses) - 1: - indicators_of_keys.extend(["="]) - shortest_key_response = None # the shortest response that may contain the answer (tail part of the response) - for indicator in indicators_of_keys: - if indicator in resp: - if not shortest_key_response: - shortest_key_response = resp.split(indicator)[-1].strip() - else: - if len(resp.split(indicator)[-1].strip()) < len( - shortest_key_response - ): - shortest_key_response = resp.split(indicator)[-1].strip() - # key_responses.append(resp.split(indicator)[1].strip()) - - if shortest_key_response: - # and it's not trivial - if shortest_key_response.strip() not in [ - ":", - ",", - ".", - "!", - "?", - ";", - ":", - "'", - ]: - key_responses.append(shortest_key_response) - if len(key_responses) == 0: # did not found any - return [response] - return key_responses - - # pdb.set_trace() - key_responses = get_key_subresponses(response) - - pred_list = key_responses.copy() # keep the original string response - for resp in key_responses: - pred_list.extend(extract_numbers(resp)) - - tmp_pred_list = [] - for i in range(len(pred_list)): - tmp_pred_list.extend(normalize_str(pred_list[i])) - pred_list = tmp_pred_list - - # remove duplicates - pred_list = list(set(pred_list)) - - return pred_list - - -# ----------- Evaluation ------------- - - -def eval_multi_choice(gold_i, pred_i): - """ - Evaluate a multiple choice instance. 
- """ - correct = False - # only they are exactly the same, we consider it as correct - if isinstance(gold_i, list): - for answer in gold_i: - if answer == pred_i: - correct = True - break - else: # gold_i is a string - if gold_i == pred_i: - correct = True - return correct - - -def eval_open(gold_i, pred_i): - """ - Evaluate an open question instance - """ - correct = False - if isinstance(gold_i, list): - # use float to avoid trivial matches - norm_answers = [] - for answer in gold_i: - norm_answers.extend(normalize_str(answer)) - else: - norm_answers = normalize_str(gold_i) - for pred in pred_i: # pred is already normalized in parse response phase - if isinstance(pred, str): # if it's a string, then find if ans in the pred_i - for norm_ans in norm_answers: - # only see if the string answer in the string pred - if isinstance(norm_ans, str) and norm_ans in pred: - if not correct: - correct = True - break - else: # it's a float number - if pred in norm_answers: - if not correct: - correct = True - break - return correct - - -# ----------- Batch Evaluation ------------- -def evaluate(samples): - """ - Batch evaluation for multiple choice and open questions. - """ - pred_correct = 0 - judge_dict = dict() - for sample in samples: - gold_i = sample["answer"] - pred_i = sample["parsed_pred"] - if sample["question_type"] == "multiple-choice": - correct = eval_multi_choice(gold_i, pred_i) - else: # open question - correct = eval_open(gold_i, pred_i) - - if correct: - judge_dict[sample["id"]] = "Correct" - pred_correct += 1 - else: - judge_dict[sample["id"]] = "Wrong" - - if len(samples) == 0: - return {"acc": 0} - return judge_dict, {"acc": pred_correct / len(samples)} diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py index 5b55086b11..9032c88384 100644 --- a/lm_eval/tasks/mmmu/utils.py +++ b/lm_eval/tasks/mmmu/utils.py @@ -5,185 +5,91 @@ import numpy as np -MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly." -OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase." +random.seed(42) -def replace_images_tokens(input_string): - for i in range(1, 8): - question_text = f"" - query_text = "" - if question_text in input_string: - input_string = input_string.replace(question_text, query_text) - return input_string - - -def parse_options(options): - option_letters = [chr(ord("A") + i) for i in range(len(options))] - choices_str = "\n".join( - [ - f"{option_letter}. 
{option}" - for option_letter, option in zip(option_letters, options) - ] - ) - return choices_str +# source for prompt fstrings: https://github.com/MMMU-Benchmark/MMMU/blob/7787d60648c82a9d40acd656fa541a6c74f58995/eval/configs/llava1.5.yaml#L3 +MULTI_CHOICE_EXAMPLE_FORMAT = """{} +{} -def construct_prompt(doc): - question = doc["question"] - if doc["question_type"] == "multiple-choice": - # Weirdly, data["options"] is a string in MMMU Huggingface dataset - parsed_options = parse_options(ast.literal_eval(doc["options"])) - # parsed_options already prepends a newline so no need to add space here - question = f"{question}\n{parsed_options}\n{MULTI_CHOICE_PROMPT}" - else: - question = f"{question}\n{OPEN_ENDED_PROMPT}" - return question +Answer with the option's letter from the given choices directly.""" -def doc_to_text(doc): - question = construct_prompt(doc) - return replace_images_tokens(question) +SHORT_ANS_EXAMPLE_FORMAT = """{} +Answer the question using a single word or phrase.""" -def doc_to_image(doc): - prompt = construct_prompt(doc) - image_tokens = re.findall(r"", prompt) - # Remove <> and swap space as _ - image_tokens = [ - image_token.strip("<>").replace(" ", "_") for image_token in image_tokens - ] - visual = [doc[image_token].convert("RGB") for image_token in image_tokens] - return visual +START_CHR = "A" -def process_results(doc, results): - pred = results[0] +# def doc_to_image(doc): +# question = doc["question"] +# image_list = re.findall(r"", question) +# image_list = [image_list.strip("<>").replace(" ", "_") for image in image_list] +# return [doc[image].convert("RGB") for image in image_list] + + +def doc_to_text(doc): + """Get the prompt for a given document.""" + if doc["question_type"] == "multiple-choice": - index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["options"])) - parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans) - else: - parsed_pred = parse_open_response(pred) - - sample_dict = { - "id": doc["id"], - "subdomain": extract_subset_name(doc["id"]), - "question_type": doc["question_type"], - "answer": doc["answer"], - "parsed_pred": parsed_pred, - } - _, result_dict = evaluate_mmmu([sample_dict]) - return result_dict - - -def extract_subset_name(input_string): - # Define a regex pattern to match "validation_" at the beginning and "_" at the end - split = input_string.split("_")[0] - pattern = re.compile(rf"^{split}_(.+?)_\d+$") - match = pattern.search(input_string) - if match: - return match.group(1) - else: - raise ValueError(f'No match found in "{input_string}"') + choices_str = "" + for i, choice in enumerate(ast.literal_eval(doc["options"])): + # add (A) {choice1}\n , (B) {choice2}\n , and so on + # to create the list of formatted choices in the prompt + choices_str += f"\n({chr(ord(START_CHR) + i)}) {choice}" -################## -# Helper functions written by official MMMU repo. 
-################## + choices_str = ( + choices_str.lstrip() + ) # remove the extraneous prepended \n that we added + prompt = MULTI_CHOICE_EXAMPLE_FORMAT.format(doc["question"], choices_str) + else: + prompt = SHORT_ANS_EXAMPLE_FORMAT.format(doc["question"]) -def calculate_ins_level_acc(results): - """Calculate the instruction level accuracy for given Subject results - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L246 - """ - acc = 0 - ins_num = 0 - for cat_results in results.values(): - acc += cat_results["mmmu_acc"] * cat_results["num_example"] - ins_num += cat_results["num_example"] - if ins_num == 0: - return 0 - return acc / ins_num + for i in range(1, 8): + # replace with . TODO: check this is always the right decision incl. for non-HF models + prompt = prompt.replace(f"", "") + print(prompt) + return prompt -def eval_multi_choice(gold_i, pred_i): - """ - Evaluate a multiple choice instance. - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L175 - """ - correct = False - # only they are exactly the same, we consider it as correct - if isinstance(gold_i, list): - for answer in gold_i: - if answer == pred_i: - correct = True - break - else: # gold_i is a string - if gold_i == pred_i: - correct = True - return correct +def process_results(doc, results): + if doc["question_type"] == "multiple-choice": + # multichoice logic + option_strs = ast.literal_eval(doc["options"]) + option_letters = ["A", "B", "C", "D"] -def eval_open(gold_i, pred_i): - """ - Evaluate an open question instance - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191 - """ - correct = False - if isinstance(gold_i, list): - # use float to avoid trivial matches - norm_answers = [] - for answer in gold_i: - norm_answers.extend(normalize_str(answer)) + all_choices = option_letters[: len(option_strs)] + index2ans = {index: ans for index, ans in zip(option_letters, option_strs)} + + pred = parse_multi_choice_response(results[0], all_choices, index2ans) + print(pred, all_choices, index2ans) + is_correct = eval_multi_choice(doc["answer"], pred) else: - norm_answers = normalize_str(gold_i) - for pred in pred_i: # pred is already normalized in parse response phase - if isinstance(pred, str): # if it's a string, then find if ans in the pred_i - for norm_ans in norm_answers: - # only see if the string answer in the string pred - if isinstance(norm_ans, str) and norm_ans in pred: - if not correct: - correct = True - break - else: # it's a float number - if pred in norm_answers: - if not correct: - correct = True - break - return correct + pred = parse_open_response(results[0]) + is_correct = eval_open(doc["answer"], pred) + # freeform response handling + return {"acc": float(is_correct)} -def evaluate_mmmu(samples): - """ - Batch evaluation for multiple choice and open questions. - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L219 - """ - pred_correct = 0 - judge_dict = dict() - for sample in samples: - gold_i = sample["answer"] - pred_i = sample["parsed_pred"] - if sample["question_type"] == "multiple-choice": - correct = eval_multi_choice(gold_i, pred_i) - else: # open question - correct = eval_open(gold_i, pred_i) + # TODO: it would be better if we could use a Filter for this logic. 
- if correct: - judge_dict[sample["id"]] = "Correct" - pred_correct += 1 - else: - judge_dict[sample["id"]] = "Wrong" - if len(samples) == 0: - return {"mmmu_acc": 0} - return judge_dict, {"mmmu_acc": pred_correct / len(samples)} +### Output parsing and answer selection taken from +### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/data_utils.py +### and +### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/eval_utils.py +# ----------- Process Multi-choice ------------- def parse_multi_choice_response(response, all_choices, index2ans): """ Parse the prediction from the generated response. Return the predicted index e.g., A, B, C, D. - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10 """ for char in [",", ".", "!", "?", ";", ":", "'"]: response = response.strip(char) @@ -199,12 +105,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): if len(candidates) == 0: for choice in all_choices: # e.g., A B C D - if f"{choice} " in response: - candidates.append(choice) - - if len(candidates) == 0: - for choice in all_choices: # e.g., A. B. C. D. - if f"{choice}." in response: + if f" {choice} " in response: candidates.append(choice) # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example @@ -237,37 +138,15 @@ def parse_multi_choice_response(response, all_choices, index2ans): else: # if only one candidate, use it. pred_index = candidates[0] - return pred_index - + print(response, all_choices, index2ans, pred_index) -def extract_numbers(string): - """ - Exact all forms of numbers from a string with regex. - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L100 - """ - # Pattern for numbers with commas - pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" - # Pattern for scientific notation - pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" - # Pattern for simple numbers without commas - pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" - - # Extract numbers with commas - numbers_with_commas = re.findall(pattern_commas, string) - # Extract numbers in scientific notation - numbers_scientific = re.findall(pattern_scientific, string) - # Extract simple numbers without commas - numbers_simple = re.findall(pattern_simple, string) - - # Combine all extracted numbersz - all_numbers = numbers_with_commas + numbers_scientific + numbers_simple - return all_numbers + return pred_index +# ----------- Process Open ------------- def check_is_number(string): """ Check if the given string a number. - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L65 """ try: float(string.replace(",", "")) @@ -280,7 +159,6 @@ def check_is_number(string): def normalize_str(string): """ Normalize the str to lower case and make them float numbers if possible. - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L76 """ # check if characters in the string @@ -303,11 +181,33 @@ def normalize_str(string): return [string] +def extract_numbers(string): + """ + Exact all forms of numbers from a string with regex. 
+ """ + # Pattern for numbers with commas + pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b" + # Pattern for scientific notation + pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+" + # Pattern for simple numbers without commas + pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])" + + # Extract numbers with commas + numbers_with_commas = re.findall(pattern_commas, string) + # Extract numbers in scientific notation + numbers_scientific = re.findall(pattern_scientific, string) + # Extract simple numbers without commas + numbers_simple = re.findall(pattern_simple, string) + + # Combine all extracted numbers + all_numbers = numbers_with_commas + numbers_scientific + numbers_simple + return all_numbers + + def parse_open_response(response): """ Parse the prediction from the generated response. Return a list of predicted strings or numbers. - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L122 """ # content = content.strip("\n").strip(".").strip(" ") @@ -377,18 +277,75 @@ def get_key_subresponses(response): return pred_list -def get_multi_choice_info(options): +# ----------- Evaluation ------------- + + +def eval_multi_choice(gold_i, pred_i): """ - Given the list of options for multiple choice question - Return the index2ans and all_choices - https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/data_utils.py#L54 + Evaluate a multiple choice instance. """ + correct = False + # only they are exactly the same, we consider it as correct + if isinstance(gold_i, list): + for answer in gold_i: + if answer == pred_i: + correct = True + break + else: # gold_i is a string + if gold_i == pred_i: + correct = True + return correct - start_chr = "A" - all_choices = [] - index2ans = {} - for i, option in enumerate(options): - index2ans[chr(ord(start_chr) + i)] = option - all_choices.append(chr(ord(start_chr) + i)) - return index2ans, all_choices +def eval_open(gold_i, pred_i): + """ + Evaluate an open question instance + """ + correct = False + if isinstance(gold_i, list): + # use float to avoid trivial matches + norm_answers = [] + for answer in gold_i: + norm_answers.extend(normalize_str(answer)) + else: + norm_answers = normalize_str(gold_i) + for pred in pred_i: # pred is already normalized in parse response phase + if isinstance(pred, str): # if it's a string, then find if ans in the pred_i + for norm_ans in norm_answers: + # only see if the string answer in the string pred + if isinstance(norm_ans, str) and norm_ans in pred: + if not correct: + correct = True + break + else: # it's a float number + if pred in norm_answers: + if not correct: + correct = True + break + return correct + + +# ----------- Batch Evaluation ------------- +def evaluate(samples): + """ + Batch evaluation for multiple choice and open questions. 
+ """ + pred_correct = 0 + judge_dict = dict() + for sample in samples: + gold_i = sample["answer"] + pred_i = sample["parsed_pred"] + if sample["question_type"] == "multiple-choice": + correct = eval_multi_choice(gold_i, pred_i) + else: # open question + correct = eval_open(gold_i, pred_i) + + if correct: + judge_dict[sample["id"]] = "Correct" + pred_correct += 1 + else: + judge_dict[sample["id"]] = "Wrong" + + if len(samples) == 0: + return {"acc": 0} + return judge_dict, {"acc": pred_correct / len(samples)} From 8e4c1d69a8974a7fc48cfe0cbda326fca5828057 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Thu, 8 Aug 2024 14:24:48 +0000 Subject: [PATCH 16/60] update functions --- lm_eval/tasks/mmmu/_template_yaml | 1 + lm_eval/tasks/mmmu/utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index bf13112818..4fa6042138 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -2,6 +2,7 @@ dataset_path: MMMU/MMMU validation_split: validation output_type: generate_until doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] +# doc_to_image: !function utils.doc_to_image doc_to_text: !function utils.doc_to_text doc_to_target: "answer" process_results: !function utils.process_results diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py index 9032c88384..36e9a19d46 100644 --- a/lm_eval/tasks/mmmu/utils.py +++ b/lm_eval/tasks/mmmu/utils.py @@ -23,11 +23,11 @@ START_CHR = "A" -# def doc_to_image(doc): -# question = doc["question"] -# image_list = re.findall(r"", question) -# image_list = [image_list.strip("<>").replace(" ", "_") for image in image_list] -# return [doc[image].convert("RGB") for image in image_list] +def doc_to_image(doc): + question = doc["question"] + image_list = re.findall(r"", question) + image_list = [image.strip("<>").replace(" ", "_") for image in image_list] + return [doc[image].convert("RGB") for image in image_list] def doc_to_text(doc): From 63bcbc511a7f18e3a3cad36191bc0e9fddb7bccc Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Sat, 10 Aug 2024 21:44:59 +0000 Subject: [PATCH 17/60] readd image processed --- lm_eval/models/hf_vlms.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index c4fca5b4f8..04ea5bedfd 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -124,12 +124,18 @@ def tok_batch_encode( add_special_tokens = {"add_special_tokens": False or self.add_bos_token} encoding = self.processor( - strings, + # images=visuals, + text=strings, + images=None, truncation=truncation, padding="longest", return_tensors="pt", **add_special_tokens, - ).to( + ) + if encoding["pixel_values"] is None: + encoding.pop("pixel_values") + + encoding.to( self.device, self.model.dtype ) # TODO: casting to dtype seems odd for input_ids and attn_mask. if left_truncate_len: @@ -232,7 +238,6 @@ def _collate(x): visuals = [ arg["visual"] for arg in aux_arguments ] # TODO: I think *fully* flattening is just wrong for bs>1 ?? 
- ### this part onward: same as HFLM ### # we assume all gen kwargs in the batch are the same From 15dda350ca8d66a039a61a5beca99bdc0172a002 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Sun, 11 Aug 2024 23:01:26 +0000 Subject: [PATCH 18/60] update args process --- lm_eval/models/hf_vlms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 04ea5bedfd..81a24d2bf4 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -132,7 +132,7 @@ def tok_batch_encode( return_tensors="pt", **add_special_tokens, ) - if encoding["pixel_values"] is None: + if "pixel_values" in encoding and encoding["pixel_values"] is None: encoding.pop("pixel_values") encoding.to( From d811a3a1ef7404ff4b41be4a2c14a92d855e5bc2 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 23 Aug 2024 17:25:41 +0000 Subject: [PATCH 19/60] bugfix for repeated images fed to model --- lm_eval/models/hf_vlms.py | 17 +++++++++------- lm_eval/tasks/mmmu/_template_yaml | 2 +- lm_eval/tasks/mmmu/utils.py | 34 +++++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 81a24d2bf4..1f28ae9a39 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -106,7 +106,7 @@ def _create_tokenizer( def tok_batch_encode( self, strings: List[str], # note that input signature of this fn is different - visuals, # TODO: typehint on this + visuals=None, # TODO: typehint on this padding_side: str = "left", left_truncate_len: int = None, truncation: bool = False, @@ -124,16 +124,15 @@ def tok_batch_encode( add_special_tokens = {"add_special_tokens": False or self.add_bos_token} encoding = self.processor( - # images=visuals, + images=visuals, text=strings, - images=None, truncation=truncation, padding="longest", return_tensors="pt", **add_special_tokens, ) - if "pixel_values" in encoding and encoding["pixel_values"] is None: - encoding.pop("pixel_values") + # if "pixel_values" in encoding and encoding["pixel_values"] is None: # TODO: what case does this handle? + # encoding.pop("pixel_values") encoding.to( self.device, self.model.dtype @@ -179,10 +178,13 @@ def _model_generate(self, inputs, max_length, stop, **generation_kwargs): def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: raise NotImplementedError( - "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks" + "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ", + "this is because we do not support measuring the loglikelihood a model assigns to an image.", ) - def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + def loglikelihood( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[Tuple[float, bool]]: raise NotImplementedError( "model type `hf-multimodal` does not support loglikelihood or multiple choice. 
Use 'hf' model type for text-only loglikelihood tasks" ) @@ -190,6 +192,7 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: def generate_until( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[str]: + # TODO: back out to HFLM.generate_until() for all requests without aux_arguments res = [] def _collate(x): diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index 4fa6042138..3f3ebf44bd 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -2,7 +2,7 @@ dataset_path: MMMU/MMMU validation_split: validation output_type: generate_until doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] -# doc_to_image: !function utils.doc_to_image +doc_to_image: !function utils.doc_to_image doc_to_text: !function utils.doc_to_text doc_to_target: "answer" process_results: !function utils.process_results diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py index 36e9a19d46..982f741f36 100644 --- a/lm_eval/tasks/mmmu/utils.py +++ b/lm_eval/tasks/mmmu/utils.py @@ -24,15 +24,36 @@ def doc_to_image(doc): - question = doc["question"] - image_list = re.findall(r"", question) - image_list = [image.strip("<>").replace(" ", "_") for image in image_list] - return [doc[image].convert("RGB") for image in image_list] + # get formatted prompt (incl. multi-choice options) pre- reformatting + input_text = _doc_to_text(doc) + # locate instances in input + image_placeholders = [ + img.replace(" ", "_").replace("<", "").replace(">", "") + for img in re.findall("", input_text) + ] + + # collect visuals (can have dupes of a given image or be out of order) + # E.g. validation_Math_19 contains and but seen as [, , , , ] + visuals = [doc[img] for img in image_placeholders] + + return visuals def doc_to_text(doc): """Get the prompt for a given document.""" + prompt = _doc_to_text(doc) + + for i in range(1, 8): + # replace with . TODO: check this is always the right decision incl. for non-HF models + prompt = prompt.replace(f"", "") + + return prompt + + +def _doc_to_text(doc): + """Helper--get the prompt for a given document but DO NOT yet replace with .""" + if doc["question_type"] == "multiple-choice": choices_str = "" @@ -49,11 +70,6 @@ def doc_to_text(doc): else: prompt = SHORT_ANS_EXAMPLE_FORMAT.format(doc["question"]) - for i in range(1, 8): - # replace with . TODO: check this is always the right decision incl. for non-HF models - prompt = prompt.replace(f"", "") - print(prompt) - return prompt From 2242ed3a68fa281fa0740f31f83cb2ddbb8dca6a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 3 Sep 2024 20:03:27 +0000 Subject: [PATCH 20/60] push WIP loglikelihood code --- lm_eval/api/task.py | 34 ++-- lm_eval/models/hf_vlms.py | 238 +++++++++++++++++++++++++- lm_eval/tasks/mmmu/_template.yaml | 14 ++ lm_eval/tasks/mmmu/_template_yaml | 33 ++-- lm_eval/tasks/mmmu/test_mcqa_utils.py | 16 ++ 5 files changed, 303 insertions(+), 32 deletions(-) create mode 100644 lm_eval/tasks/mmmu/_template.yaml create mode 100644 lm_eval/tasks/mmmu/test_mcqa_utils.py diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 2fa82c2053..ee6b083286 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1333,11 +1333,15 @@ def construct_requests( # in other words normalizing by subtracting the unconditional logprob of each choice. 
aux_arguments = [("", f"{choice}") for choice in choices] + arguments.extend(aux_arguments) + elif self.OUTPUT_TYPE == "generate_until": arguments = (ctx, deepcopy(self.config.generation_kwargs)) multimodal_arg = {} - if self.doc_to_image: + if ( + self.config.doc_to_image + ): # TODO: ensure that non-multimodal tasks aren't getting visual args multimodal_arg = { **multimodal_arg, **{"visual": self.doc_to_image(doc)}, @@ -1349,25 +1353,17 @@ def construct_requests( else: arguments = arguments + (multimodal_arg,) - if isinstance(arguments, type): - if aux_arguments is not None: - all_arg_list = [arguments, aux_arguments] - else: - all_arg_list = [arguments] - request_list = [] - for arg_list in all_arg_list: - request_list.extend( - [ - Instance( - request_type="loglikelihood", - doc=doc, - arguments=arg, - idx=i, - **kwargs, - ) - for i, arg in enumerate(arg_list) - ] + if self.OUTPUT_TYPE == "multiple_choice": + request_list = [ + Instance( + request_type="loglikelihood", + doc=doc, + arguments=arg, + idx=i, + **kwargs, ) + for i, arg in enumerate(arguments) + ] return request_list diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 1f28ae9a39..47683e744e 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -2,13 +2,14 @@ from typing import Dict, List, Optional, Tuple, Union import torch +import torch.nn.functional as F import transformers from tqdm import tqdm from lm_eval.api.instance import Instance from lm_eval.api.registry import register_model from lm_eval.models.huggingface import HFLM -from lm_eval.models.utils import Collator, stop_sequences_criteria +from lm_eval.models.utils import Collator, pad_and_concat, stop_sequences_criteria @register_model("hf-multimodal") @@ -123,6 +124,7 @@ def tok_batch_encode( if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + print(visuals) encoding = self.processor( images=visuals, text=strings, @@ -149,6 +151,25 @@ def tok_batch_encode( # def tok_decode(self, tokens, skip_special_tokens=True): # return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) + def _model_call(self, inps, imgs, attn_mask=None, labels=None): + """ + TODO: update docstring + :param inps: torch.Tensor + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape + [batch, sequence_ctx]. the size of sequence may vary from call to call + :param attn_mask: torch.Tensor, optional + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed + (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM + :param labels: torch.Tensor, optional + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. 
Only passed + (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM + :return + A torch tensor of shape [batch, sequence, vocab] with the + logits returned from the model's decoder + """ + with torch.no_grad(): + return self.model(inps, imgs).logits + def _model_generate(self, inputs, max_length, stop, **generation_kwargs): # gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0) @@ -185,9 +206,220 @@ def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: def loglikelihood( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[Tuple[float, bool]]: - raise NotImplementedError( - "model type `hf-multimodal` does not support loglikelihood or multiple choice. Use 'hf' model type for text-only loglikelihood tasks" + new_reqs = [] + for context, continuation, aux_arguments in [req.args for req in requests]: + if context == "": + raise ValueError("Must get non-empty context for multimodal requests!") + else: + visuals = aux_arguments["visual"] + + context_enc, continuation_enc = self._encode_pair(context, continuation) + # TODO: key to pick for caching images + new_reqs.append( + ( + (visuals, context, continuation), + visuals, + context_enc, + continuation_enc, + ) + ) + + return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm) + + def _loglikelihood_tokens( + self, + requests: List[ + Tuple[Tuple[None, str, str], List[int], List[int], List[int]] + ], # TODO: update typehint to be correct + disable_tqdm: bool = False, + override_bs: int = None, + ) -> List[Tuple[float, bool]]: + res = [] + + def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[2] + req[3] + return -len(toks), tuple(toks) + + def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key to group and lookup one-token continuations""" + # Use with group_by="contexts" (optional)" + # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. + # speeds up some multiple-choice tasks proportionally to the number of choices. + # groups requests by context+continuation[:-1] and infer on one request/group. 
+ return req[-3] + req[-2] + req[-1][:-1] + + re_ord = Collator( + requests, + sort_fn=_collate, + group_by="contexts" # TODO: can't group-by just "contexts" any more, need to incorporate imgs + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + and self.logits_cache + else None, + group_fn=_lookup_one_token_cont, + ) + + # automatic (variable) batch size detection for vectorization + # pull longest context sample from request + n_reordered_requests = len(re_ord) + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else override_bs + if override_bs is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" + and n_reordered_requests > 0 + and not override_bs + else None + ) + + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", ) + for chunk in chunks: + imgs = [] + inps = [] + cont_toks_list = [] + inplens = [] + + padding_len_inp = None + # because vectorizing is annoying, we first convert each (context, continuation) pair to padded + # tensors, then we pack them together into a batch, call the model, and then pick it all apart + # again because vectorizing is annoying + + for _, image_enc, context_enc, continuation_enc in chunk: + # sanity check + assert len(image_enc) > 0 + assert len(context_enc) > 0 + assert len(continuation_enc) > 0 + assert len(continuation_enc) <= self.max_length + + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + # when too long to fit in context, truncate from the left + # TODO: assuming that we won't handle enc-dec Vision2Seq models. Is that a safe assumption? + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1], + dtype=torch.long, + device=self.device, + ) + (inplen,) = inp.shape + + padding_len_inp = ( + max(padding_len_inp, inplen) + if padding_len_inp is not None + else inplen + ) + + inps.append(inp) # [1, inp_length] + cont_toks_list.append(continuation_enc) + inplens.append(inplen) + + # image encoding + image_enc = self.processor( # TODO: factor this call into an image_encode method?? + text="" + * len( + image_enc + ), # we need this to process images for some models, annoyingly + images=image_enc, + return_tensors="pt", # TODO: push tensor conversion into _loglikelihood_tokens + # **add_special_tokens, + )[ + "pixel_values" + ] # TODO: are keys from processors always the same? just pop attention_mask and input_ids keys + + imgs.append(image_enc) + + # TODO: batch input images together... 
+ + # create encoder attn mask and batched conts, if seq2seq + call_kwargs = {} + batched_inps = pad_and_concat( + padding_len_inp, inps, padding_side="right" + ) # [batch, padding_len_inp] + batched_imgs = torch.cat(imgs, dim=0) # TODO: might be wrong, fix + + multi_logits = F.log_softmax( + self._model_call(batched_inps, batched_imgs, **call_kwargs), dim=-1 + ) # [batch, padding_length (inp or cont), vocab] + + for ( + request_str, + image_encs, + ctx_tokens, + _, + ), logits, inplen, cont_toks in zip( + chunk, multi_logits, inplens, cont_toks_list + ): + # Slice to original seq length + contlen = len(cont_toks) + # take only logits in the continuation + # (discard context toks if decoder-only ; discard right-padding) + # also discards + checks for "virtual tokens" in the causal LM's input window + # from prompt/prefix tuning tokens, if applicable + ctx_len = ( + inplen + (logits.shape[0] - padding_len_inp) + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + else None + ) + logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len) + logits = logits.unsqueeze(0) # [1, seq, vocab] + + # Check if per-token argmax is exactly equal to continuation + greedy_tokens = logits.argmax(dim=-1) + + # check for one-token continuation cache hits. + # noop in case group_by != "contexts" or no cache hit and returns the + # original args. Otherwise, expands the logits batch dimension and yields each + # batch along with matching continuation tokens and prompt strings. + # logits -> [1, seq, vocab] + for request_str, cont_toks, logits in re_ord.get_cache( + req_str=request_str, + cxt_toks=ctx_tokens, + cont_toks=cont_toks, + logits=logits, + ): + cont_toks = torch.tensor( + cont_toks, dtype=torch.long, device=self.device + ).unsqueeze(0) # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + + # Obtain log-probs at the corresponding continuation token indices + # last_token_slice = logits[:, -1, :].squeeze(0).tolist() + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) # [1, seq] + + # Answer: (log prob, is-exact-match) + answer = (float(logits.sum()), bool(max_equal)) + + res.append(answer) + + self.cache_hook.add_partial("loglikelihood", request_str, answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) def generate_until( self, requests: List[Instance], disable_tqdm: bool = False diff --git a/lm_eval/tasks/mmmu/_template.yaml b/lm_eval/tasks/mmmu/_template.yaml new file mode 100644 index 0000000000..4a9ba16f4f --- /dev/null +++ b/lm_eval/tasks/mmmu/_template.yaml @@ -0,0 +1,14 @@ +task: MMMU_loglikelihood_art +dataset_path: MMMU/MMMU +validation_split: validation +dataset_name: History +output_type: multiple_choice +doc_to_image: !function utils.doc_to_image +doc_to_text: !function utils.doc_to_text +doc_to_choice: !function test_mcqa_utils.doc_to_choice +process_docs: !function test_mcqa_utils.process_docs +doc_to_target: !function test_mcqa_utils.doc_to_target +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index 3f3ebf44bd..98549ebd5d 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -1,17 +1,30 @@ +# dataset_path: MMMU/MMMU +# validation_split: validation +# output_type: generate_until +# doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] +# doc_to_image: !function utils.doc_to_image +# doc_to_text: !function utils.doc_to_text 
+# doc_to_target: "answer" +# process_results: !function utils.process_results +# generation_kwargs: +# until: +# - "<|endoftext|>" +# temperature: 0.0 +# do_sample: false +# max_gen_toks: 512 +# metric_list: +# - metric: acc +# aggregation: mean +# higher_is_better: true +### Test MCQA YAML ### dataset_path: MMMU/MMMU validation_split: validation -output_type: generate_until -doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] +output_type: multiple_choice doc_to_image: !function utils.doc_to_image doc_to_text: !function utils.doc_to_text -doc_to_target: "answer" -process_results: !function utils.process_results -generation_kwargs: - until: - - "<|endoftext|>" - temperature: 0.0 - do_sample: false - max_gen_toks: 512 +doc_to_choice: !function test_mcqa_utils.doc_to_choice +process_docs: !function test_mcqa_utils.process_docs +doc_to_target: !function test_mcqa_utils.doc_to_target metric_list: - metric: acc aggregation: mean diff --git a/lm_eval/tasks/mmmu/test_mcqa_utils.py b/lm_eval/tasks/mmmu/test_mcqa_utils.py new file mode 100644 index 0000000000..84bf7c60ca --- /dev/null +++ b/lm_eval/tasks/mmmu/test_mcqa_utils.py @@ -0,0 +1,16 @@ +import ast + + +def process_docs(dataset): + return dataset.filter(lambda x: x["question_type"] == "multiple-choice") + + +def doc_to_choice(doc): + return ["A", "B", "C", "D", "E", "F", "G", "H", "I"][ + : len(ast.literal_eval(doc["options"])) + ] + + +def doc_to_target(doc): + print(doc) + return ["A", "B", "C", "D", "E", "F", "G", "H", "I"].index(doc["answer"]) From be14ac14418445253a335ce63150571e23c24d8b Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Mon, 9 Sep 2024 12:44:19 +0000 Subject: [PATCH 21/60] commit most recent code (generative ; qwen2-vl testing) --- lm_eval/models/hf_vlms.py | 28 +++++++++++-------- lm_eval/tasks/mmmu/_template_yaml | 46 +++++++++++++++---------------- lm_eval/tasks/mmmu/utils.py | 2 +- 3 files changed, 41 insertions(+), 35 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 47683e744e..042ea31ff7 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -124,7 +124,6 @@ def tok_batch_encode( if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: add_special_tokens = {"add_special_tokens": False or self.add_bos_token} - print(visuals) encoding = self.processor( images=visuals, text=strings, @@ -457,15 +456,7 @@ def _collate(x): ### Up to here: was identical to non-multimodal HFLM generate_until ### - for idx, _chunk in enumerate(chunks): - if idx == 0: - zero_chunk = _chunk - chunk = _chunk - elif idx == 69: - chunk = zero_chunk - else: - chunk = _chunk - chunk = _chunk + for chunk in chunks: contexts, all_gen_kwargs, aux_arguments = zip( *chunk ) # TODO: can we cut down further on number of distinct things we pass around? @@ -473,6 +464,12 @@ def _collate(x): visuals = [ arg["visual"] for arg in aux_arguments ] # TODO: I think *fully* flattening is just wrong for bs>1 ?? + + if not isinstance(contexts, list): + contexts = list( + contexts + ) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list. + # TODO: could we upstream this workaround? ### this part onward: same as HFLM ### # we assume all gen kwargs in the batch are the same @@ -509,6 +506,7 @@ def _collate(x): max_ctx_len = self.max_length - max_gen_toks # noqa: F841 # TODO: this assumes we are using a causal LM. is that always valid? 
shouldn't be + print(visuals, contexts) inputs = self.tok_batch_encode( contexts, visuals, @@ -523,6 +521,12 @@ def _collate(x): cont = self._model_generate(inputs, stop=until, **kwargs) + del inputs + torch.cuda.empty_cache() + import gc + + gc.collect() + ### essentially same as HFLM beyond this line! cont_toks_list = cont.tolist() @@ -541,7 +545,9 @@ def _collate(x): s = s.split(term)[0] res.append(s) - self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s) + self.cache_hook.add_partial( + "generate_until", (context, gen_kwargs), s + ) # TODO: cache key for multimodal input should be what? pbar.update(1) # reorder this group of results back to original unsorted form res = re_ords.get_original(res) diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index 98549ebd5d..f826f185bd 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -1,31 +1,31 @@ -# dataset_path: MMMU/MMMU -# validation_split: validation -# output_type: generate_until -# doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] -# doc_to_image: !function utils.doc_to_image -# doc_to_text: !function utils.doc_to_text -# doc_to_target: "answer" -# process_results: !function utils.process_results -# generation_kwargs: -# until: -# - "<|endoftext|>" -# temperature: 0.0 -# do_sample: false -# max_gen_toks: 512 -# metric_list: -# - metric: acc -# aggregation: mean -# higher_is_better: true -### Test MCQA YAML ### dataset_path: MMMU/MMMU validation_split: validation -output_type: multiple_choice +output_type: generate_until +doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] doc_to_image: !function utils.doc_to_image doc_to_text: !function utils.doc_to_text -doc_to_choice: !function test_mcqa_utils.doc_to_choice -process_docs: !function test_mcqa_utils.process_docs -doc_to_target: !function test_mcqa_utils.doc_to_target +doc_to_target: "answer" +process_results: !function utils.process_results +generation_kwargs: + until: + - "<|endoftext|>" + temperature: 0.0 + do_sample: false + max_gen_toks: 512 metric_list: - metric: acc aggregation: mean higher_is_better: true +### Test MCQA YAML ### +# dataset_path: MMMU/MMMU +# validation_split: validation +# output_type: multiple_choice +# doc_to_image: !function utils.doc_to_image +# doc_to_text: !function utils.doc_to_text +# doc_to_choice: !function test_mcqa_utils.doc_to_choice +# process_docs: !function test_mcqa_utils.process_docs +# doc_to_target: !function test_mcqa_utils.doc_to_target +# metric_list: +# - metric: acc +# aggregation: mean +# higher_is_better: true diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py index 982f741f36..5af06043f4 100644 --- a/lm_eval/tasks/mmmu/utils.py +++ b/lm_eval/tasks/mmmu/utils.py @@ -46,7 +46,7 @@ def doc_to_text(doc): for i in range(1, 8): # replace with . TODO: check this is always the right decision incl. 
for non-HF models - prompt = prompt.replace(f"", "") + prompt = prompt.replace(f"", "<|image_pad|>") return prompt From 7516b885f961e6ef51473d04052597367f02ffe3 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Mon, 9 Sep 2024 16:42:17 +0000 Subject: [PATCH 22/60] preliminary image_token_id handling --- lm_eval/models/hf_vlms.py | 59 +++++++++++++++++++++++++++++-------- lm_eval/tasks/mmmu/utils.py | 32 ++------------------ 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 042ea31ff7..892961df04 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -6,12 +6,19 @@ import transformers from tqdm import tqdm +from lm_eval import utils from lm_eval.api.instance import Instance from lm_eval.api.registry import register_model from lm_eval.models.huggingface import HFLM from lm_eval.models.utils import Collator, pad_and_concat, stop_sequences_criteria +DEFAULT_IMAGE_PLACEHOLDER = "" + + +eval_logger = utils.eval_logger + + @register_model("hf-multimodal") class HFMultimodalLM(HFLM): """ @@ -20,6 +27,37 @@ class HFMultimodalLM(HFLM): AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq # TODO: what's the right way to handle this. maybe phase out the direct class-equality checks in HFLM? + # TODO: init method taking all kwargs, args, and `image_token_id` and setting image_token_id + + def __init__( + self, + pretrained: Union[str, transformers.PreTrainedModel], + image_token_id: Optional[int] = None, + **kwargs, + ): + # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer + # modify init behavior. + super().__init__(pretrained, **kwargs) + + # HF AutoModelForVision2Seq models have an `image_token_id` value in their configs + # denoting the token which indicates a location where an image will be substituted in. + # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL + # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! + self.image_token_id = ( + int(image_token_id) if image_token_id else self.config.image_token_id + ) + assert ( + self.image_token_id is not None + ), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one." + # get the string this token ID corresponds to + self.image_token = self.tok_decode( + [self.image_token_id], skip_special_tokens=False + ) + if image_token_id is not None: + eval_logger.info( + f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!" + ) + def _create_tokenizer( self, pretrained: Union[str, transformers.PreTrainedModel], @@ -70,14 +108,6 @@ def _create_tokenizer( self.tokenizer = self.processor.tokenizer - # def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: - # """ - # Method to apply a chat template to a list of chat history between user and model. 
- # """ - # return self.tokenizer.apply_chat_template( - # chat_history, tokenize=False, add_generation_prompt=True - # ) - # def tok_encode( # self, string: str, left_truncate_len=None, add_special_tokens=None # ) -> List[int]: @@ -114,7 +144,11 @@ def tok_batch_encode( ) -> Dict[ str, torch.Tensor ]: # TODO: note that this return signature differs from HFLM tok_batch_encode. - # TODO: we should allow + # NOTE: here, we replace tags with our model's corresponding image_token string value. + strings = [ + string.replace(DEFAULT_IMAGE_PLACEHOLDER, self.image_token) + for string in strings + ] # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. old_padding_side = self.tokenizer.padding_side @@ -147,9 +181,6 @@ def tok_batch_encode( return encoding - # def tok_decode(self, tokens, skip_special_tokens=True): - # return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) - def _model_call(self, inps, imgs, attn_mask=None, labels=None): """ TODO: update docstring @@ -212,6 +243,8 @@ def loglikelihood( else: visuals = aux_arguments["visual"] + # TODO: write a multimodal _encode_pair that handles this + # TODO: rename these tokenization methods so we can still use the old ones context_enc, continuation_enc = self._encode_pair(context, continuation) # TODO: key to pick for caching images new_reqs.append( @@ -469,7 +502,7 @@ def _collate(x): contexts = list( contexts ) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list. - # TODO: could we upstream this workaround? + # TODO: could we upstream this workaround to HF? ### this part onward: same as HFLM ### # we assume all gen kwargs in the batch are the same diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py index 5af06043f4..b191b9a1cd 100644 --- a/lm_eval/tasks/mmmu/utils.py +++ b/lm_eval/tasks/mmmu/utils.py @@ -46,7 +46,7 @@ def doc_to_text(doc): for i in range(1, 8): # replace with . TODO: check this is always the right decision incl. for non-HF models - prompt = prompt.replace(f"", "<|image_pad|>") + prompt = prompt.replace(f"", "") return prompt @@ -83,12 +83,12 @@ def process_results(doc, results): index2ans = {index: ans for index, ans in zip(option_letters, option_strs)} pred = parse_multi_choice_response(results[0], all_choices, index2ans) - print(pred, all_choices, index2ans) + # print(pred, all_choices, index2ans) is_correct = eval_multi_choice(doc["answer"], pred) else: + # freeform response handling pred = parse_open_response(results[0]) is_correct = eval_open(doc["answer"], pred) - # freeform response handling return {"acc": float(is_correct)} @@ -339,29 +339,3 @@ def eval_open(gold_i, pred_i): correct = True break return correct - - -# ----------- Batch Evaluation ------------- -def evaluate(samples): - """ - Batch evaluation for multiple choice and open questions. 
- """ - pred_correct = 0 - judge_dict = dict() - for sample in samples: - gold_i = sample["answer"] - pred_i = sample["parsed_pred"] - if sample["question_type"] == "multiple-choice": - correct = eval_multi_choice(gold_i, pred_i) - else: # open question - correct = eval_open(gold_i, pred_i) - - if correct: - judge_dict[sample["id"]] = "Correct" - pred_correct += 1 - else: - judge_dict[sample["id"]] = "Wrong" - - if len(samples) == 0: - return {"acc": 0} - return judge_dict, {"acc": pred_correct / len(samples)} From 5a65d104d70a322a4efc07c66f9bea55072c5448 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Mon, 9 Sep 2024 17:38:30 +0000 Subject: [PATCH 23/60] small mmmu update: some qs have >4 mcqa options --- lm_eval/tasks/mmmu/_template_yaml | 2 ++ lm_eval/tasks/mmmu/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index f826f185bd..2aea21ca0e 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -16,6 +16,8 @@ metric_list: - metric: acc aggregation: mean higher_is_better: true +metadata: + version: 0.0 ### Test MCQA YAML ### # dataset_path: MMMU/MMMU # validation_split: validation diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py index b191b9a1cd..87a3022098 100644 --- a/lm_eval/tasks/mmmu/utils.py +++ b/lm_eval/tasks/mmmu/utils.py @@ -77,7 +77,7 @@ def process_results(doc, results): if doc["question_type"] == "multiple-choice": # multichoice logic option_strs = ast.literal_eval(doc["options"]) - option_letters = ["A", "B", "C", "D"] + option_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I"] all_choices = option_letters[: len(option_strs)] index2ans = {index: ans for index, ans in zip(option_letters, option_strs)} @@ -154,7 +154,7 @@ def parse_multi_choice_response(response, all_choices, index2ans): else: # if only one candidate, use it. pred_index = candidates[0] - print(response, all_choices, index2ans, pred_index) + # print(response, all_choices, index2ans, pred_index) return pred_index From 54d317d59566e2432dcf56db8f027ccd1439b34a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Tue, 10 Sep 2024 15:48:02 +0000 Subject: [PATCH 24/60] push updated modeling code --- lm_eval/models/hf_vlms.py | 181 ++++++++++++++++++++---------- lm_eval/tasks/mmmu/_template_yaml | 1 - 2 files changed, 123 insertions(+), 59 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 892961df04..aaded14f55 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -39,12 +39,20 @@ def __init__( # modify init behavior. super().__init__(pretrained, **kwargs) + assert ( + self.batch_size != "auto" + ), "Batch size 'auto' is not yet supported for hf-multimodal models." + + # TODO: phi-3.5 "image placeholders" are , , ... in order. how to handle this case + # HF AutoModelForVision2Seq models have an `image_token_id` value in their configs # denoting the token which indicates a location where an image will be substituted in. # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! 
self.image_token_id = ( - int(image_token_id) if image_token_id else self.config.image_token_id + int(image_token_id) + if image_token_id + else getattr(self.config, "image_token_id", None) ) assert ( self.image_token_id is not None @@ -88,8 +96,8 @@ def _create_tokenizer( ) else: assert isinstance( - tokenizer, transformers.PreTrainedTokenizer - ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + tokenizer, transformers.ProcessorMixin + ) # TODO: check this condition return tokenizer # Get tokenizer based on 'pretrained' @@ -134,10 +142,70 @@ def _create_tokenizer( # return encoding - def tok_batch_encode( + def tok_multimodal_encode( + self, string, images, left_truncate_len=None, add_special_tokens=None + ): + """Helper function which encodes an image + string combo using AutoProcessor""" + # We inherit special token kwarg setup from HFLM.tok_encode + # special_tokens_kwargs = {} + + # by default for CausalLM - false or self.add_bos_token is set + # if add_special_tokens is None: + # special_tokens_kwargs = {"add_special_tokens": False or self.add_bos_token} + # otherwise the method explicitly defines the value + # else: + # special_tokens_kwargs = {"add_special_tokens": add_special_tokens} + + # encode text+images + # TODO: why does (Qwen2-VL) processor error when attempting to add special tokens to text? + encoding = self.processor( + text=string, images=images, return_tensors=None + ) # , **special_tokens_kwargs) + + # remove (and store) our tokenized text + text_encoding = encoding.pop("input_ids") + encoding.pop("attention_mask") + + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + text_encoding = text_encoding[-left_truncate_len:] + + return text_encoding, encoding # image_encoding is a dict + + def _encode_multimodal_pair(self, context, continuation, images): + """Helper function to perform the role of TemplateLM._encode_pair + Except allowing for image input to also be processed alongside `context`. + + This method is a bit messy due to the need to defer conversion of image and text token input + into PyTorch tensors until the main inference loop. + """ + + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + + # TODO: replace default placeholder with self.image_token, for contexts + + whole_enc, image_enc = self.tok_multimodal_encode( + context + continuation, images + ) + context_enc, _ = self.tok_multimodal_encode(context, images) + + # tok_multimodal_encode returns List[List[int]] for tokenized text. Get rid of the batch dim + # since we only are encoding a single string. 
+ # TODO: this is a bit hacky, it'd be nice to make this generally cleaner + whole_enc, context_enc = whole_enc[0], context_enc[0] + + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + + return context_enc, continuation_enc, image_enc + + def tok_batch_multimodal_encode( self, strings: List[str], # note that input signature of this fn is different - visuals=None, # TODO: typehint on this + images, # TODO: typehint on this padding_side: str = "left", left_truncate_len: int = None, truncation: bool = False, @@ -154,20 +222,16 @@ def tok_batch_encode( old_padding_side = self.tokenizer.padding_side self.tokenizer.padding_side = padding_side - add_special_tokens = {} - if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + # add_special_tokens = {"add_special_tokens": False or self.add_bos_token} encoding = self.processor( - images=visuals, + images=images, text=strings, truncation=truncation, padding="longest", return_tensors="pt", - **add_special_tokens, + # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added? ) - # if "pixel_values" in encoding and encoding["pixel_values"] is None: # TODO: what case does this handle? - # encoding.pop("pixel_values") encoding.to( self.device, self.model.dtype @@ -181,27 +245,15 @@ def tok_batch_encode( return encoding - def _model_call(self, inps, imgs, attn_mask=None, labels=None): + def _model_multimodal_call(self, inps, imgs, attn_mask=None, labels=None): """ TODO: update docstring - :param inps: torch.Tensor - A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape - [batch, sequence_ctx]. the size of sequence may vary from call to call - :param attn_mask: torch.Tensor, optional - A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed - (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM - :param labels: torch.Tensor, optional - A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed - (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM - :return - A torch tensor of shape [batch, sequence, vocab] with the - logits returned from the model's decoder """ + # note: imgs is a dict. with torch.no_grad(): - return self.model(inps, imgs).logits + return self.model(inps, **imgs).logits - def _model_generate(self, inputs, max_length, stop, **generation_kwargs): - # gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + def _model_multimodal_generate(self, inputs, max_length, stop, **generation_kwargs): generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0) do_sample = generation_kwargs.get("do_sample", None) @@ -227,12 +279,33 @@ def _model_generate(self, inputs, max_length, stop, **generation_kwargs): **generation_kwargs, ) + def _batch_images(self, image_encs): + """ + Helper function: batch together image encodings across examples in a batch. + # TODO: for variable-sized images, this may break down. 
+ """ + batched_imgs = {} + for key in image_encs[0].keys(): + batched_imgs[key] = torch.cat( + [ + torch.tensor( + image_enc[key], device=self.device, dtype=self.model.dtype + ) + for image_enc in image_encs + ], + dim=0, + ) + print(batched_imgs) + return batched_imgs + def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: raise NotImplementedError( "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ", "this is because we do not support measuring the loglikelihood a model assigns to an image.", ) + # TODO: support this, but for only text-only input + def loglikelihood( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[Tuple[float, bool]]: @@ -245,14 +318,16 @@ def loglikelihood( # TODO: write a multimodal _encode_pair that handles this # TODO: rename these tokenization methods so we can still use the old ones - context_enc, continuation_enc = self._encode_pair(context, continuation) + context_enc, continuation_enc, image_enc = self._encode_multimodal_pair( + context, continuation, visuals + ) # TODO: key to pick for caching images new_reqs.append( ( - (visuals, context, continuation), - visuals, + (context, continuation, visuals), context_enc, continuation_enc, + image_enc, ) ) @@ -268,6 +343,7 @@ def _loglikelihood_tokens( ) -> List[Tuple[float, bool]]: res = [] + # TODO: **improve multimodal collation.** We currently ignore image size when ordering docs. ideally we'd take them into account def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): """Defines the key for the sorted method""" # the negative sign on len(toks) sorts descending - this has a few advantages: @@ -276,8 +352,7 @@ def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): # padded context length. this is useful to simplify the batching logic and more importantly to make # automatic adaptive batches much much easier to implement # - any OOMs will happen right away rather than near the end - - toks = req[2] + req[3] + toks = req[1] + req[2] return -len(toks), tuple(toks) def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): @@ -286,7 +361,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. # speeds up some multiple-choice tasks proportionally to the number of choices. # groups requests by context+continuation[:-1] and infer on one request/group. 
- return req[-3] + req[-2] + req[-1][:-1] + return req[-1] + req[-3] + req[-2][:-1] re_ord = Collator( requests, @@ -320,7 +395,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): pbar = tqdm( total=len(requests), disable=(disable_tqdm or (self.rank != 0)), - desc="Running loglikelihood requests", + desc="Running loglikelihood requests with text+image input", ) for chunk in chunks: imgs = [] @@ -333,7 +408,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): # tensors, then we pack them together into a batch, call the model, and then pick it all apart # again because vectorizing is annoying - for _, image_enc, context_enc, continuation_enc in chunk: + for _, context_enc, continuation_enc, image_enc in chunk: # sanity check assert len(image_enc) > 0 assert len(context_enc) > 0 @@ -366,39 +441,28 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): cont_toks_list.append(continuation_enc) inplens.append(inplen) - # image encoding - image_enc = self.processor( # TODO: factor this call into an image_encode method?? - text="" - * len( - image_enc - ), # we need this to process images for some models, annoyingly - images=image_enc, - return_tensors="pt", # TODO: push tensor conversion into _loglikelihood_tokens - # **add_special_tokens, - )[ - "pixel_values" - ] # TODO: are keys from processors always the same? just pop attention_mask and input_ids keys - imgs.append(image_enc) - # TODO: batch input images together... - # create encoder attn mask and batched conts, if seq2seq call_kwargs = {} batched_inps = pad_and_concat( padding_len_inp, inps, padding_side="right" ) # [batch, padding_len_inp] - batched_imgs = torch.cat(imgs, dim=0) # TODO: might be wrong, fix + # batch our examples' image inputs together + batched_imgs = self._batch_images( + imgs + ) # TODO: fix/test for bs>1 case with differently-sized imgs! multi_logits = F.log_softmax( - self._model_call(batched_inps, batched_imgs, **call_kwargs), dim=-1 + self._model_multimodal_call(batched_inps, batched_imgs, **call_kwargs), + dim=-1, ) # [batch, padding_length (inp or cont), vocab] for ( request_str, - image_encs, ctx_tokens, _, + image_encs, ), logits, inplen, cont_toks in zip( chunk, multi_logits, inplens, cont_toks_list ): @@ -446,7 +510,9 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): res.append(answer) - self.cache_hook.add_partial("loglikelihood", request_str, answer) + self.cache_hook.add_partial( + "loglikelihood", request_str, answer + ) # TODO: how to add images into the cache key? pbar.update(1) pbar.close() @@ -539,8 +605,7 @@ def _collate(x): max_ctx_len = self.max_length - max_gen_toks # noqa: F841 # TODO: this assumes we are using a causal LM. is that always valid? 
shouldn't be - print(visuals, contexts) - inputs = self.tok_batch_encode( + inputs = self.tok_batch_multimodal_encode( contexts, visuals, left_truncate_len=max_ctx_len, @@ -552,7 +617,7 @@ def _collate(x): if "max_length" not in kwargs: kwargs["max_length"] = context_enc.shape[1] + max_gen_toks - cont = self._model_generate(inputs, stop=until, **kwargs) + cont = self._model_multimodal_generate(inputs, stop=until, **kwargs) del inputs torch.cuda.empty_cache() diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index 2aea21ca0e..143fe1658b 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -1,7 +1,6 @@ dataset_path: MMMU/MMMU validation_split: validation output_type: generate_until -doc_to_image: ["image_1", "image_2", "image_3", "image_4", "image_5", "image_6", "image_7"] doc_to_image: !function utils.doc_to_image doc_to_text: !function utils.doc_to_text doc_to_target: "answer" From 40a48c2acf99f2c8fa411dba43337ce8ae213b88 Mon Sep 17 00:00:00 2001 From: Baber Abbasi Date: Wed, 11 Sep 2024 16:25:07 +0000 Subject: [PATCH 25/60] use processor.apply_chat_template --- lm_eval/models/hf_vlms.py | 40 +++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index aaded14f55..af58443706 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -42,17 +42,21 @@ def __init__( assert ( self.batch_size != "auto" ), "Batch size 'auto' is not yet supported for hf-multimodal models." - + self.chat_applied: bool = False # TODO: phi-3.5 "image placeholders" are , , ... in order. how to handle this case # HF AutoModelForVision2Seq models have an `image_token_id` value in their configs # denoting the token which indicates a location where an image will be substituted in. # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! + # TODO: llava-1.6 uses image_token_index self.image_token_id = ( int(image_token_id) if image_token_id - else getattr(self.config, "image_token_id", None) + else ( + getattr(self.config, "image_token_id", None) + or getattr(self.config, "image_token_index", None) + ) ) assert ( self.image_token_id is not None @@ -202,6 +206,29 @@ def _encode_multimodal_pair(self, context, continuation, images): return context_enc, continuation_enc, image_enc + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + self.chat_applied = True + for content in chat_history: + c = [] + text = content["content"] + + # Count and remove image placeholders + image_count = text.count(DEFAULT_IMAGE_PLACEHOLDER) + text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "") + + # Add image entries + for _ in range(image_count): + c.append({"type": "image", "image": None}) + + # Add single text entry at the end + c.append({"type": "text", "text": text}) + + content["content"] = c + + return self.processor.apply_chat_template( + chat_history, add_generation_prompt=True + ) + def tok_batch_multimodal_encode( self, strings: List[str], # note that input signature of this fn is different @@ -213,10 +240,11 @@ def tok_batch_multimodal_encode( str, torch.Tensor ]: # TODO: note that this return signature differs from HFLM tok_batch_encode. # NOTE: here, we replace tags with our model's corresponding image_token string value. 
- strings = [ - string.replace(DEFAULT_IMAGE_PLACEHOLDER, self.image_token) - for string in strings - ] + if not self.chat_applied: + strings = [ + string.replace(DEFAULT_IMAGE_PLACEHOLDER, self.image_token) + for string in strings + ] # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. old_padding_side = self.tokenizer.padding_side From 8148fe4ea9b9e046532ffc151935f382108c80b5 Mon Sep 17 00:00:00 2001 From: Baber Date: Thu, 12 Sep 2024 12:05:38 +0500 Subject: [PATCH 26/60] add mathvista draft --- lm_eval/mathvista/testmini.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 lm_eval/mathvista/testmini.yaml diff --git a/lm_eval/mathvista/testmini.yaml b/lm_eval/mathvista/testmini.yaml new file mode 100644 index 0000000000..81c8e8aca0 --- /dev/null +++ b/lm_eval/mathvista/testmini.yaml @@ -0,0 +1,19 @@ +#dataset_path: AI4Math/MathVista +#test_split: testmini +#output_type: generate_until +#doc_to_image: !function utils.doc_to_image +#doc_to_text: " Question: {{question}}\nAnswer:" +#doc_to_target: "answer" +#process_results: !function utils.process_results +#generation_kwargs: +# until: +# - "<|endoftext|>" +# temperature: 0.0 +# do_sample: false +# max_gen_toks: 512 +#metric_list: +# - metric: acc +# aggregation: mean +# higher_is_better: true +#metadata: +# version: 0.0 From 38b6fe3079352c518dc8befccb91dc2ee4a00227 Mon Sep 17 00:00:00 2001 From: Baber Date: Thu, 12 Sep 2024 12:17:21 +0500 Subject: [PATCH 27/60] nit --- lm_eval/mathvista/testmini.yaml | 39 +++++++++++++++++---------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/lm_eval/mathvista/testmini.yaml b/lm_eval/mathvista/testmini.yaml index 81c8e8aca0..abaf81a665 100644 --- a/lm_eval/mathvista/testmini.yaml +++ b/lm_eval/mathvista/testmini.yaml @@ -1,19 +1,20 @@ -#dataset_path: AI4Math/MathVista -#test_split: testmini -#output_type: generate_until -#doc_to_image: !function utils.doc_to_image -#doc_to_text: " Question: {{question}}\nAnswer:" -#doc_to_target: "answer" -#process_results: !function utils.process_results -#generation_kwargs: -# until: -# - "<|endoftext|>" -# temperature: 0.0 -# do_sample: false -# max_gen_toks: 512 -#metric_list: -# - metric: acc -# aggregation: mean -# higher_is_better: true -#metadata: -# version: 0.0 +dataset_path: AI4Math/MathVista +test_split: testmini +output_type: generate_until +doc_to_image: !function utils.doc_to_image +doc_to_text: " {{query}}" +doc_to_target: "answer" +# TODO: add process_results. 
multiple-choice question_type need to be handled differently +process_results: !function utils.process_results +generation_kwargs: + until: + - "<|endoftext|>" + temperature: 0.0 + do_sample: false + max_gen_toks: 512 +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 From d348106b748badae4e9ae966064786515e6dab6e Mon Sep 17 00:00:00 2001 From: Baber Date: Thu, 12 Sep 2024 12:19:10 +0500 Subject: [PATCH 28/60] nit --- lm_eval/{ => tasks}/mathvista/testmini.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lm_eval/{ => tasks}/mathvista/testmini.yaml (100%) diff --git a/lm_eval/mathvista/testmini.yaml b/lm_eval/tasks/mathvista/testmini.yaml similarity index 100% rename from lm_eval/mathvista/testmini.yaml rename to lm_eval/tasks/mathvista/testmini.yaml From 295a825ba8225f932a78c06fb997f914dd2a2571 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 12 Sep 2024 14:44:56 +0000 Subject: [PATCH 29/60] ensure no footguns in text<>multimodal LM<>task incompatibility --- lm_eval/api/task.py | 4 ++++ lm_eval/evaluator.py | 20 ++++++++++++++++++++ lm_eval/models/hf_vlms.py | 37 ++++++++++++++++--------------------- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 5d3bd83998..8dadf94857 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -740,6 +740,10 @@ def __init__( ) self.OUTPUT_TYPE = self.config.output_type + if self.config.doc_to_image is not None: + # mark the task as requiring multimodality. + self.MULTIMODAL = True + if self.config.dataset_path is not None: self.DATASET_PATH = self.config.dataset_path diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 52a8d00d74..6b513bd0be 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -414,8 +414,28 @@ def evaluate( for task_output in eval_tasks ): raise ValueError("log_samples must be True for 'bypass' metric-only tasks") + + # validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa. + incompatible_tasks = [] + for task_output in eval_tasks: + task: Task = task_output.task + + if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False): + incompatible_tasks.append(task_output.task_name) + + if not getattr(lm, "MULTIMODAL", False): + raise ValueError( + f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the 'hf-multimodal' model type." + ) + else: + raise ValueError( + f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks." + ) + # end multimodality validation check + for task_output in eval_tasks: task: Task = task_output.task + limit = get_sample_size(task, limit) task.build_all_requests( limit=limit, diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index af58443706..cf2ec8251f 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -25,9 +25,8 @@ class HFMultimodalLM(HFLM): An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics. """ - AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq # TODO: what's the right way to handle this. maybe phase out the direct class-equality checks in HFLM? 
- - # TODO: init method taking all kwargs, args, and `image_token_id` and setting image_token_id + AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq + MULTIMODAL = True # flag to indicate, for now, that this model type can run multimodal requests def __init__( self, @@ -49,7 +48,6 @@ def __init__( # denoting the token which indicates a location where an image will be substituted in. # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! - # TODO: llava-1.6 uses image_token_index self.image_token_id = ( int(image_token_id) if image_token_id @@ -238,7 +236,7 @@ def tok_batch_multimodal_encode( truncation: bool = False, ) -> Dict[ str, torch.Tensor - ]: # TODO: note that this return signature differs from HFLM tok_batch_encode. + ]: # note that this return signature differs from HFLM tok_batch_encode. # NOTE: here, we replace tags with our model's corresponding image_token string value. if not self.chat_applied: strings = [ @@ -332,20 +330,22 @@ def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: "this is because we do not support measuring the loglikelihood a model assigns to an image.", ) - # TODO: support this, but for only text-only input - def loglikelihood( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[Tuple[float, bool]]: + raise NotImplementedError( + "'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!" + ) + new_reqs = [] for context, continuation, aux_arguments in [req.args for req in requests]: if context == "": - raise ValueError("Must get non-empty context for multimodal requests!") + raise ValueError( + "Must get non-empty context for multimodal requests! You might be trying to run 'loglikelihood_rolling', which is not supported in the multimodal case." + ) else: visuals = aux_arguments["visual"] - # TODO: write a multimodal _encode_pair that handles this - # TODO: rename these tokenization methods so we can still use the old ones context_enc, continuation_enc, image_enc = self._encode_multimodal_pair( context, continuation, visuals ) @@ -540,7 +540,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): self.cache_hook.add_partial( "loglikelihood", request_str, answer - ) # TODO: how to add images into the cache key? + ) # TODO: choose convention for adding images into the cache key pbar.update(1) pbar.close() @@ -550,7 +550,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): def generate_until( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[str]: - # TODO: back out to HFLM.generate_until() for all requests without aux_arguments + # TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs) res = [] def _collate(x): @@ -584,13 +584,9 @@ def _collate(x): ### Up to here: was identical to non-multimodal HFLM generate_until ### for chunk in chunks: - contexts, all_gen_kwargs, aux_arguments = zip( - *chunk - ) # TODO: can we cut down further on number of distinct things we pass around? + contexts, all_gen_kwargs, aux_arguments = zip(*chunk) - visuals = [ - arg["visual"] for arg in aux_arguments - ] # TODO: I think *fully* flattening is just wrong for bs>1 ?? 
+ visuals = [arg["visual"] for arg in aux_arguments] if not isinstance(contexts, list): contexts = list( @@ -631,7 +627,7 @@ def _collate(x): ### end stuff that's entirely copied verbatim from HFLM ### - max_ctx_len = self.max_length - max_gen_toks # noqa: F841 # TODO: this assumes we are using a causal LM. is that always valid? shouldn't be + max_ctx_len = self.max_length - max_gen_toks inputs = self.tok_batch_multimodal_encode( contexts, @@ -657,8 +653,7 @@ def _collate(x): cont_toks_list = cont.tolist() for cont_toks, context in zip(cont_toks_list, contexts): - # discard context + left-padding toks if using causal decoder-only LM - # if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: # TODO: ensure this holds for VLMs + # discard context + left-padding toks if using causal decoder-only VLM cont_toks = cont_toks[context_enc.shape[1] :] s = self.tok_decode(cont_toks) From 2c9fd797126bc72d8190a45fd889bcfcbf5f7021 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 12 Sep 2024 15:04:38 +0000 Subject: [PATCH 30/60] add notification to readme regarding launch of prototype! --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e973cdb7e8..53a34c39eb 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ *Latest News 📣* +- [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` model type and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features. - [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.** - [2024/07] New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group. From 80e1711a9ca15eb3f73099f1c5cb07c52dbe0499 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 12 Sep 2024 16:51:52 +0000 Subject: [PATCH 31/60] fix compatibility check --- lm_eval/evaluator.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 6b513bd0be..e8ddec67e3 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -422,15 +422,15 @@ def evaluate( if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False): incompatible_tasks.append(task_output.task_name) - - if not getattr(lm, "MULTIMODAL", False): - raise ValueError( - f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the 'hf-multimodal' model type." - ) - else: - raise ValueError( - f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks." 
- ) + if len(incompatible_tasks) > 0: + if not getattr(lm, "MULTIMODAL", False): + raise ValueError( + f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the 'hf-multimodal' model type." + ) + else: + raise ValueError( + f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks." + ) # end multimodality validation check for task_output in eval_tasks: From d84b9fcf6aee02a894dfc3d17d087c9948eb7da9 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 12 Sep 2024 19:46:58 +0000 Subject: [PATCH 32/60] reorganize mmmu configs --- lm_eval/tasks/mmmu/_art_and_design.yaml | 13 +++++++ lm_eval/tasks/mmmu/_business.yaml | 14 ++++++++ lm_eval/tasks/mmmu/_health_and_medicine.yaml | 14 ++++++++ .../mmmu/_humanities_and_social_sciences.yaml | 13 +++++++ lm_eval/tasks/mmmu/_mmmu.yaml | 14 ++++++++ lm_eval/tasks/mmmu/_science.yaml | 14 ++++++++ lm_eval/tasks/mmmu/_tech_and_engineering.yaml | 16 +++++++++ lm_eval/tasks/mmmu/_template.yaml | 14 -------- lm_eval/tasks/mmmu/_template_yaml | 13 ------- lm_eval/tasks/mmmu/art_and_design.yaml | 23 ------------ lm_eval/tasks/mmmu/business.yaml | 27 -------------- lm_eval/tasks/mmmu/health_and_medicine.yaml | 27 -------------- .../mmmu/humanities_and_social_sciences.yaml | 23 ------------ lm_eval/tasks/mmmu/mmmu.yaml | 12 ------- lm_eval/tasks/mmmu/mmmu_accounting.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_agriculture.yaml | 4 +++ .../mmmu_architecture_and_engineering.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_art.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_art_theory.yaml | 4 +++ .../mmmu/mmmu_basic_medical_science.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_biology.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_chemistry.yaml | 4 +++ .../tasks/mmmu/mmmu_clinical_medicine.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_computer_science.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_design.yaml | 4 +++ ...u_diagnostics_and_laboratory_medicine.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_economics.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_electronics.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_finance.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_geography.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_history.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_literature.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_manage.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_marketing.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_materials.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_math.yaml | 4 +++ .../mmmu/mmmu_mechanical_engineering.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_music.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_pharmacy.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_physics.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_psychology.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_public_health.yaml | 4 +++ lm_eval/tasks/mmmu/mmmu_sociology.yaml | 4 +++ lm_eval/tasks/mmmu/science.yaml | 27 -------------- lm_eval/tasks/mmmu/tech_and_engineering.yaml | 35 ------------------- lm_eval/tasks/mmmu/test_mcqa_utils.py | 16 --------- 47 files changed, 218 insertions(+), 217 deletions(-) create mode 100644 lm_eval/tasks/mmmu/_art_and_design.yaml create mode 100644 lm_eval/tasks/mmmu/_business.yaml create mode 100644 lm_eval/tasks/mmmu/_health_and_medicine.yaml create mode 100644 lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml create mode 100644 lm_eval/tasks/mmmu/_mmmu.yaml create mode 100644 lm_eval/tasks/mmmu/_science.yaml create mode 100644 
lm_eval/tasks/mmmu/_tech_and_engineering.yaml delete mode 100644 lm_eval/tasks/mmmu/_template.yaml delete mode 100644 lm_eval/tasks/mmmu/art_and_design.yaml delete mode 100644 lm_eval/tasks/mmmu/business.yaml delete mode 100644 lm_eval/tasks/mmmu/health_and_medicine.yaml delete mode 100644 lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml delete mode 100644 lm_eval/tasks/mmmu/mmmu.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_accounting.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_agriculture.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_art.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_art_theory.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_biology.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_chemistry.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_computer_science.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_design.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_economics.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_electronics.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_finance.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_geography.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_history.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_literature.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_manage.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_marketing.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_materials.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_math.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_music.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_pharmacy.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_physics.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_psychology.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_public_health.yaml create mode 100644 lm_eval/tasks/mmmu/mmmu_sociology.yaml delete mode 100644 lm_eval/tasks/mmmu/science.yaml delete mode 100644 lm_eval/tasks/mmmu/tech_and_engineering.yaml delete mode 100644 lm_eval/tasks/mmmu/test_mcqa_utils.py diff --git a/lm_eval/tasks/mmmu/_art_and_design.yaml b/lm_eval/tasks/mmmu/_art_and_design.yaml new file mode 100644 index 0000000000..b0dda1876f --- /dev/null +++ b/lm_eval/tasks/mmmu/_art_and_design.yaml @@ -0,0 +1,13 @@ +group: mmmu_val_art_and_design +group_alias: Art and Design +task: + - mmmu_val_art + - mmmu_val_art_theory + - mmmu_val_design + - mmmu_val_music +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_business.yaml b/lm_eval/tasks/mmmu/_business.yaml new file mode 100644 index 0000000000..497948e5e5 --- /dev/null +++ b/lm_eval/tasks/mmmu/_business.yaml @@ -0,0 +1,14 @@ +group: mmmu_val_business +group_alias: Business +task: + - mmmu_val_accounting + - mmmu_val_economics + - mmmu_val_finance + - mmmu_val_manage + - mmmu_val_marketing +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_health_and_medicine.yaml b/lm_eval/tasks/mmmu/_health_and_medicine.yaml new file mode 100644 index 0000000000..04709d3d46 --- /dev/null +++ 
b/lm_eval/tasks/mmmu/_health_and_medicine.yaml @@ -0,0 +1,14 @@ +group: mmmu_val_health_and_medicine +group_alias: Health and Medicine +task: + - mmmu_val_basic_medical_science + - mmmu_val_clinical_medicine + - mmmu_val_diagnostics_and_laboratory_medicine + - mmmu_val_pharmacy + - mmmu_val_public_health +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml b/lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml new file mode 100644 index 0000000000..b8d70a9ea3 --- /dev/null +++ b/lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml @@ -0,0 +1,13 @@ +group: mmmu_val_humanities_and_social_science +group_alias: Humanities and Social Science +task: + - mmmu_val_history + - mmmu_val_literature + - mmmu_val_sociology + - mmmu_val_psychology +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_mmmu.yaml b/lm_eval/tasks/mmmu/_mmmu.yaml new file mode 100644 index 0000000000..6bcd234bbc --- /dev/null +++ b/lm_eval/tasks/mmmu/_mmmu.yaml @@ -0,0 +1,14 @@ +group: mmmu_val +task: + - mmmu_val_art_and_design + - mmmu_val_business + - mmmu_val_health_and_medicine + - mmmu_val_humanities_and_social_science + - mmmu_val_science + - mmmu_val_tech_and_engineering +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_science.yaml b/lm_eval/tasks/mmmu/_science.yaml new file mode 100644 index 0000000000..9b41585a65 --- /dev/null +++ b/lm_eval/tasks/mmmu/_science.yaml @@ -0,0 +1,14 @@ +group: mmmu_val_science +group_alias: Science +task: + - mmmu_val_biology + - mmmu_val_chemistry + - mmmu_val_geography + - mmmu_val_math + - mmmu_val_physics +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_tech_and_engineering.yaml b/lm_eval/tasks/mmmu/_tech_and_engineering.yaml new file mode 100644 index 0000000000..956f9906b3 --- /dev/null +++ b/lm_eval/tasks/mmmu/_tech_and_engineering.yaml @@ -0,0 +1,16 @@ +group: mmmu_val_tech_and_engineering +group_alias: Tech and Engineering +task: + - mmmu_val_agriculture + - mmmu_val_architecture_and_engineering + - mmmu_val_computer_science + - mmmu_val_electronics + - mmmu_val_energy_and_power + - mmmu_val_materials + - mmmu_val_mechanical_engineering +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_template.yaml b/lm_eval/tasks/mmmu/_template.yaml deleted file mode 100644 index 4a9ba16f4f..0000000000 --- a/lm_eval/tasks/mmmu/_template.yaml +++ /dev/null @@ -1,14 +0,0 @@ -task: MMMU_loglikelihood_art -dataset_path: MMMU/MMMU -validation_split: validation -dataset_name: History -output_type: multiple_choice -doc_to_image: !function utils.doc_to_image -doc_to_text: !function utils.doc_to_text -doc_to_choice: !function test_mcqa_utils.doc_to_choice -process_docs: !function test_mcqa_utils.process_docs -doc_to_target: !function test_mcqa_utils.doc_to_target -metric_list: - - metric: acc - aggregation: mean - higher_is_better: true diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml index 143fe1658b..f92ce60d6b 100644 --- a/lm_eval/tasks/mmmu/_template_yaml +++ b/lm_eval/tasks/mmmu/_template_yaml @@ -17,16 +17,3 @@ metric_list: higher_is_better: true metadata: version: 0.0 -### 
Test MCQA YAML ### -# dataset_path: MMMU/MMMU -# validation_split: validation -# output_type: multiple_choice -# doc_to_image: !function utils.doc_to_image -# doc_to_text: !function utils.doc_to_text -# doc_to_choice: !function test_mcqa_utils.doc_to_choice -# process_docs: !function test_mcqa_utils.process_docs -# doc_to_target: !function test_mcqa_utils.doc_to_target -# metric_list: -# - metric: acc -# aggregation: mean -# higher_is_better: true diff --git a/lm_eval/tasks/mmmu/art_and_design.yaml b/lm_eval/tasks/mmmu/art_and_design.yaml deleted file mode 100644 index c3c57db940..0000000000 --- a/lm_eval/tasks/mmmu/art_and_design.yaml +++ /dev/null @@ -1,23 +0,0 @@ -group: mmmu_art_and_design -group_alias: Art and Design -task: - - task: mmmu_art - include: _template_yaml - task_alias: Art - dataset_name: Art - - task: mmmu_art_theory - include: _template_yaml - task_alias: Art Theory - dataset_name: Art_Theory - - task: mmmu_design - include: _template_yaml - task_alias: Design - dataset_name: Design - - task: mmmu_music - include: _template_yaml - task_alias: Music - dataset_name: Music -aggregate_metric_list: - - metric: acc - aggregation: mean - weight_by_size: true diff --git a/lm_eval/tasks/mmmu/business.yaml b/lm_eval/tasks/mmmu/business.yaml deleted file mode 100644 index bb2db84438..0000000000 --- a/lm_eval/tasks/mmmu/business.yaml +++ /dev/null @@ -1,27 +0,0 @@ -group: mmmu_business -group_alias: Business -task: - - task: mmmu_accounting - include: _template_yaml - task_alias: Accounting - dataset_name: Accounting - - task: mmmu_economics - include: _template_yaml - task_alias: Economics - dataset_name: Economics - - task: mmmu_finance - include: _template_yaml - task_alias: Finance - dataset_name: Finance - - task: mmmu_manage - include: _template_yaml - task_alias: Manage - dataset_name: Manage - - task: mmmu_marketing - include: _template_yaml - task_alias: Marketing - dataset_name: Marketing -aggregate_metric_list: - - metric: acc - aggregation: mean - weight_by_size: true diff --git a/lm_eval/tasks/mmmu/health_and_medicine.yaml b/lm_eval/tasks/mmmu/health_and_medicine.yaml deleted file mode 100644 index 66d717cd14..0000000000 --- a/lm_eval/tasks/mmmu/health_and_medicine.yaml +++ /dev/null @@ -1,27 +0,0 @@ -group: mmmu_health_and_medicine -group_alias: Health and Medicine -task: - - task: mmmu_basic_medical_science - include: _template_yaml - task_alias: Basic Medical Science - dataset_name: Basic_Medical_Science - - task: mmmu_clinical_medicine - include: _template_yaml - task_alias: Clinical Medicine - dataset_name: Clinical_Medicine - - task: mmmu_diagnostics_and_laboratory_medicine - include: _template_yaml - task_alias: Diagnostics and Laboratory Medicine - dataset_name: Diagnostics_and_Laboratory_Medicine - - task: mmmu_pharmacy - include: _template_yaml - task_alias: Pharmacy - dataset_name: Pharmacy - - task: mmmu_public_health - include: _template_yaml - task_alias: Public Health - dataset_name: Public_Health -aggregate_metric_list: - - metric: acc - aggregation: mean - weight_by_size: true diff --git a/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml b/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml deleted file mode 100644 index 32bc2fd3ae..0000000000 --- a/lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml +++ /dev/null @@ -1,23 +0,0 @@ -group: mmmu_humanities_and_social_science -group_alias: Humanities and Social Science -task: - - task: mmmu_history - include: _template_yaml - task_alias: History - dataset_name: History - - task: 
mmmu_literature - include: _template_yaml - task_alias: Literature - dataset_name: Literature - - task: mmmu_sociology - include: _template_yaml - task_alias: Sociology - dataset_name: Sociology - - task: mmmu_psychology - include: _template_yaml - task_alias: Psychology - dataset_name: Psychology -aggregate_metric_list: - - metric: acc - aggregation: mean - weight_by_size: true diff --git a/lm_eval/tasks/mmmu/mmmu.yaml b/lm_eval/tasks/mmmu/mmmu.yaml deleted file mode 100644 index 3fbf0dde94..0000000000 --- a/lm_eval/tasks/mmmu/mmmu.yaml +++ /dev/null @@ -1,12 +0,0 @@ -group: mmmu -task: - - mmmu_art_and_design - - mmmu_business - - mmmu_health_and_medicine - - mmmu_humanities_and_social_science - - mmmu_science - - mmmu_tech_and_engineering -aggregate_metric_list: - - metric: acc - aggregation: mean - weight_by_size: true diff --git a/lm_eval/tasks/mmmu/mmmu_accounting.yaml b/lm_eval/tasks/mmmu/mmmu_accounting.yaml new file mode 100644 index 0000000000..3c4aa41d9c --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_accounting.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_accounting +include: _template_yaml +task_alias: Accounting +dataset_name: Accounting diff --git a/lm_eval/tasks/mmmu/mmmu_agriculture.yaml b/lm_eval/tasks/mmmu/mmmu_agriculture.yaml new file mode 100644 index 0000000000..387f33f7a9 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_agriculture.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_agriculture +include: _template_yaml +task_alias: Agriculture +dataset_name: Agriculture diff --git a/lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml b/lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml new file mode 100644 index 0000000000..829ee3a8f8 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_architecture_and_engineering +include: _template_yaml +task_alias: Architecture and Engineering +dataset_name: Architecture_and_Engineering diff --git a/lm_eval/tasks/mmmu/mmmu_art.yaml b/lm_eval/tasks/mmmu/mmmu_art.yaml new file mode 100644 index 0000000000..ff812ab3c1 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_art.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_art +include: _template_yaml +task_alias: Art +dataset_name: Art diff --git a/lm_eval/tasks/mmmu/mmmu_art_theory.yaml b/lm_eval/tasks/mmmu/mmmu_art_theory.yaml new file mode 100644 index 0000000000..d6331871a5 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_art_theory.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_art_theory +include: _template_yaml +task_alias: Art Theory +dataset_name: Art_Theory diff --git a/lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml b/lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml new file mode 100644 index 0000000000..d7486f1ee3 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_basic_medical_science +include: _template_yaml +task_alias: Basic Medical Science +dataset_name: Basic_Medical_Science diff --git a/lm_eval/tasks/mmmu/mmmu_biology.yaml b/lm_eval/tasks/mmmu/mmmu_biology.yaml new file mode 100644 index 0000000000..49d36f380f --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_biology.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_biology +include: _template_yaml +task_alias: Biology +dataset_name: Biology diff --git a/lm_eval/tasks/mmmu/mmmu_chemistry.yaml b/lm_eval/tasks/mmmu/mmmu_chemistry.yaml new file mode 100644 index 0000000000..cb096531c2 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_chemistry.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_chemistry +include: _template_yaml +task_alias: Chemistry +dataset_name: Chemistry diff --git 
a/lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml b/lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml new file mode 100644 index 0000000000..a0833d165f --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_clinical_medicine +include: _template_yaml +task_alias: Clinical Medicine +dataset_name: Clinical_Medicine diff --git a/lm_eval/tasks/mmmu/mmmu_computer_science.yaml b/lm_eval/tasks/mmmu/mmmu_computer_science.yaml new file mode 100644 index 0000000000..53c91a61b3 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_computer_science.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_computer_science +include: _template_yaml +task_alias: Computer Science +dataset_name: Computer_Science diff --git a/lm_eval/tasks/mmmu/mmmu_design.yaml b/lm_eval/tasks/mmmu/mmmu_design.yaml new file mode 100644 index 0000000000..0367dd65ee --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_design.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_design +include: _template_yaml +task_alias: Design +dataset_name: Design diff --git a/lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml b/lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml new file mode 100644 index 0000000000..2ec1b7dca6 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_diagnostics_and_laboratory_medicine +include: _template_yaml +task_alias: Diagnostics and Laboratory Medicine +dataset_name: Diagnostics_and_Laboratory_Medicine diff --git a/lm_eval/tasks/mmmu/mmmu_economics.yaml b/lm_eval/tasks/mmmu/mmmu_economics.yaml new file mode 100644 index 0000000000..3d6465da74 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_economics.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_economics +include: _template_yaml +task_alias: Economics +dataset_name: Economics diff --git a/lm_eval/tasks/mmmu/mmmu_electronics.yaml b/lm_eval/tasks/mmmu/mmmu_electronics.yaml new file mode 100644 index 0000000000..02df22f8a3 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_electronics.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_electronics +include: _template_yaml +task_alias: Electronics +dataset_name: Electronics diff --git a/lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml b/lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml new file mode 100644 index 0000000000..add24ebed7 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_energy_and_power +include: _template_yaml +task_alias: Energy and Power +dataset_name: Energy_and_Power diff --git a/lm_eval/tasks/mmmu/mmmu_finance.yaml b/lm_eval/tasks/mmmu/mmmu_finance.yaml new file mode 100644 index 0000000000..c6b6f042c1 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_finance.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_finance +include: _template_yaml +task_alias: Finance +dataset_name: Finance diff --git a/lm_eval/tasks/mmmu/mmmu_geography.yaml b/lm_eval/tasks/mmmu/mmmu_geography.yaml new file mode 100644 index 0000000000..b5f9022136 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_geography.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_geography +include: _template_yaml +task_alias: Geography +dataset_name: Geography diff --git a/lm_eval/tasks/mmmu/mmmu_history.yaml b/lm_eval/tasks/mmmu/mmmu_history.yaml new file mode 100644 index 0000000000..435388fd74 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_history.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_history +include: _template_yaml +task_alias: History +dataset_name: History diff --git a/lm_eval/tasks/mmmu/mmmu_literature.yaml b/lm_eval/tasks/mmmu/mmmu_literature.yaml new file mode 100644 index 
0000000000..9e0a1ba503 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_literature.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_literature +include: _template_yaml +task_alias: Literature +dataset_name: Literature diff --git a/lm_eval/tasks/mmmu/mmmu_manage.yaml b/lm_eval/tasks/mmmu/mmmu_manage.yaml new file mode 100644 index 0000000000..bd19924114 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_manage.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_manage +include: _template_yaml +task_alias: Manage +dataset_name: Manage diff --git a/lm_eval/tasks/mmmu/mmmu_marketing.yaml b/lm_eval/tasks/mmmu/mmmu_marketing.yaml new file mode 100644 index 0000000000..53848171be --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_marketing.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_marketing +include: _template_yaml +task_alias: Marketing +dataset_name: Marketing diff --git a/lm_eval/tasks/mmmu/mmmu_materials.yaml b/lm_eval/tasks/mmmu/mmmu_materials.yaml new file mode 100644 index 0000000000..ee46944316 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_materials.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_materials +include: _template_yaml +task_alias: Materials +dataset_name: Materials diff --git a/lm_eval/tasks/mmmu/mmmu_math.yaml b/lm_eval/tasks/mmmu/mmmu_math.yaml new file mode 100644 index 0000000000..e6817689ab --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_math.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_math +include: _template_yaml +task_alias: Math +dataset_name: Math diff --git a/lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml b/lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml new file mode 100644 index 0000000000..4b9d95be85 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_mechanical_engineering +include: _template_yaml +task_alias: Mechanical Engineering +dataset_name: Mechanical_Engineering diff --git a/lm_eval/tasks/mmmu/mmmu_music.yaml b/lm_eval/tasks/mmmu/mmmu_music.yaml new file mode 100644 index 0000000000..cf4032dea3 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_music.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_music +include: _template_yaml +task_alias: Music +dataset_name: Music diff --git a/lm_eval/tasks/mmmu/mmmu_pharmacy.yaml b/lm_eval/tasks/mmmu/mmmu_pharmacy.yaml new file mode 100644 index 0000000000..04a4f1ff67 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_pharmacy.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_pharmacy +include: _template_yaml +task_alias: Pharmacy +dataset_name: Pharmacy diff --git a/lm_eval/tasks/mmmu/mmmu_physics.yaml b/lm_eval/tasks/mmmu/mmmu_physics.yaml new file mode 100644 index 0000000000..53371cad73 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_physics.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_physics +include: _template_yaml +task_alias: Physics +dataset_name: Physics diff --git a/lm_eval/tasks/mmmu/mmmu_psychology.yaml b/lm_eval/tasks/mmmu/mmmu_psychology.yaml new file mode 100644 index 0000000000..8471d4a18f --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_psychology.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_psychology +include: _template_yaml +task_alias: Psychology +dataset_name: Psychology diff --git a/lm_eval/tasks/mmmu/mmmu_public_health.yaml b/lm_eval/tasks/mmmu/mmmu_public_health.yaml new file mode 100644 index 0000000000..aefc0590a3 --- /dev/null +++ b/lm_eval/tasks/mmmu/mmmu_public_health.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_public_health +include: _template_yaml +task_alias: Public Health +dataset_name: Public_Health diff --git a/lm_eval/tasks/mmmu/mmmu_sociology.yaml b/lm_eval/tasks/mmmu/mmmu_sociology.yaml new file mode 100644 index 0000000000..5da664a8e7 --- /dev/null +++ 
b/lm_eval/tasks/mmmu/mmmu_sociology.yaml @@ -0,0 +1,4 @@ +task: mmmu_val_sociology +include: _template_yaml +task_alias: Sociology +dataset_name: Sociology diff --git a/lm_eval/tasks/mmmu/science.yaml b/lm_eval/tasks/mmmu/science.yaml deleted file mode 100644 index cc4bce9bfe..0000000000 --- a/lm_eval/tasks/mmmu/science.yaml +++ /dev/null @@ -1,27 +0,0 @@ -group: mmmu_science -group_alias: Science -task: - - task: mmmu_biology - include: _template_yaml - task_alias: Biology - dataset_name: Biology - - task: mmmu_chemistry - include: _template_yaml - task_alias: Chemistry - dataset_name: Chemistry - - task: mmmu_geography - include: _template_yaml - task_alias: Geography - dataset_name: Geography - - task: mmmu_math - include: _template_yaml - task_alias: Math - dataset_name: Math - - task: mmmu_physics - include: _template_yaml - task_alias: Physics - dataset_name: Physics -aggregate_metric_list: - - metric: acc - aggregation: mean - weight_by_size: true diff --git a/lm_eval/tasks/mmmu/tech_and_engineering.yaml b/lm_eval/tasks/mmmu/tech_and_engineering.yaml deleted file mode 100644 index 438d7c2d3f..0000000000 --- a/lm_eval/tasks/mmmu/tech_and_engineering.yaml +++ /dev/null @@ -1,35 +0,0 @@ -group: mmmu_tech_and_engineering -group_alias: Tech and Engineering -task: - - task: mmmu_agriculture - include: _template_yaml - task_alias: Agriculture - dataset_name: Agriculture - - task: mmmu_architecture_and_engineering - include: _template_yaml - task_alias: Architecture and Engineering - dataset_name: Architecture_and_Engineering - - task: mmmu_computer_science - include: _template_yaml - task_alias: Computer Science - dataset_name: Computer_Science - - task: mmmu_electronics - include: _template_yaml - task_alias: Electronics - dataset_name: Electronics - - task: mmmu_energy_and_power - include: _template_yaml - task_alias: Energy and Power - dataset_name: Energy_and_Power - - task: mmmu_materials - include: _template_yaml - task_alias: Materials - dataset_name: Materials - - task: mmmu_mechanical_engineering - include: _template_yaml - task_alias: Mechanical Engineering - dataset_name: Mechanical_Engineering -aggregate_metric_list: - - metric: acc - aggregation: mean - weight_by_size: true diff --git a/lm_eval/tasks/mmmu/test_mcqa_utils.py b/lm_eval/tasks/mmmu/test_mcqa_utils.py deleted file mode 100644 index 84bf7c60ca..0000000000 --- a/lm_eval/tasks/mmmu/test_mcqa_utils.py +++ /dev/null @@ -1,16 +0,0 @@ -import ast - - -def process_docs(dataset): - return dataset.filter(lambda x: x["question_type"] == "multiple-choice") - - -def doc_to_choice(doc): - return ["A", "B", "C", "D", "E", "F", "G", "H", "I"][ - : len(ast.literal_eval(doc["options"])) - ] - - -def doc_to_target(doc): - print(doc) - return ["A", "B", "C", "D", "E", "F", "G", "H", "I"].index(doc["answer"]) From 207778fe290dd3dc07284477e02b11c3f5e368f4 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 01:01:03 +0500 Subject: [PATCH 33/60] chat_template=None --- lm_eval/evaluator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index e8ddec67e3..0f703b7087 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -294,7 +294,9 @@ def _adjust_config(task_dict): model_source=model, model_args=model_args, system_instruction=system_instruction, - chat_template=lm.chat_template(apply_chat_template), + # TODO: change this back + # chat_template=lm.chat_template(apply_chat_template), + chat_template=None, fewshot_as_multiturn=fewshot_as_multiturn, ) From 
45d0f3c59074dfc0ee6f375f77ba9c22b3afabe9 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 01:05:36 +0500 Subject: [PATCH 34/60] add interleave chat_template --- lm_eval/models/hf_vlms.py | 48 +++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index cf2ec8251f..ed399db623 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -32,6 +32,7 @@ def __init__( self, pretrained: Union[str, transformers.PreTrainedModel], image_token_id: Optional[int] = None, + interleave: bool = False, **kwargs, ): # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer @@ -47,6 +48,7 @@ def __init__( # HF AutoModelForVision2Seq models have an `image_token_id` value in their configs # denoting the token which indicates a location where an image will be substituted in. # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL + self.interleave = interleave # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! self.image_token_id = ( int(image_token_id) @@ -206,22 +208,44 @@ def _encode_multimodal_pair(self, context, continuation, images): def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: self.chat_applied = True - for content in chat_history: - c = [] - text = content["content"] + if not self.interleave: + for content in chat_history: + c = [] + text = content["content"] - # Count and remove image placeholders - image_count = text.count(DEFAULT_IMAGE_PLACEHOLDER) - text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "") + # Count and remove image placeholders + image_count = text.count(DEFAULT_IMAGE_PLACEHOLDER) + text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "") - # Add image entries - for _ in range(image_count): - c.append({"type": "image", "image": None}) + # Add image entries + for _ in range(image_count): + c.append({"type": "image", "image": None}) - # Add single text entry at the end - c.append({"type": "text", "text": text}) + # Add single text entry at the end + c.append({"type": "text", "text": text}) - content["content"] = c + content["content"] = c + else: + for content in chat_history: + c = [] + text = content["content"] + expected_image_count = text.count(DEFAULT_IMAGE_PLACEHOLDER) + if expected_image_count > 1: + print("hello") + actual_image_count = 0 + + text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER) + + for i, part in enumerate(text_parts): + if part: # Add non-empty text parts + c.append({"type": "text", "text": part}) + if ( + i < len(text_parts) - 1 + ): # Add image placeholder after each split except the last + c.append({"type": "image"}) + actual_image_count += 1 + + content["content"] = c return self.processor.apply_chat_template( chat_history, add_generation_prompt=True From 5d55e6c4c1e95e81131d41487ffe419d2626d4ce Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 01:08:44 +0500 Subject: [PATCH 35/60] add condition --- lm_eval/models/hf_vlms.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index ed399db623..c7796e10d7 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -247,6 +247,11 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: content["content"] = c + if actual_image_count != expected_image_count: + raise ValueError( + f"Mismatch in image placeholder count. 
Expected: {expected_image_count}, Actual: {actual_image_count}" + ) + return self.processor.apply_chat_template( chat_history, add_generation_prompt=True ) From af8382bec0138d711c2bc676f2bb8fb5021b9871 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 09:19:07 +0500 Subject: [PATCH 36/60] add max_images; interleave=true --- lm_eval/models/hf_vlms.py | 27 ++++++++++++++++++++------- lm_eval/models/utils.py | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index c7796e10d7..f5221add3a 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -10,7 +10,12 @@ from lm_eval.api.instance import Instance from lm_eval.api.registry import register_model from lm_eval.models.huggingface import HFLM -from lm_eval.models.utils import Collator, pad_and_concat, stop_sequences_criteria +from lm_eval.models.utils import ( + Collator, + pad_and_concat, + replace_placeholders, + stop_sequences_criteria, +) DEFAULT_IMAGE_PLACEHOLDER = "" @@ -32,7 +37,9 @@ def __init__( self, pretrained: Union[str, transformers.PreTrainedModel], image_token_id: Optional[int] = None, - interleave: bool = False, + interleave: bool = True, + # TODO: hamdle whitespace in image placeholder (replacement) + max_images: Optional[int] = 999, **kwargs, ): # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer @@ -49,6 +56,7 @@ def __init__( # denoting the token which indicates a location where an image will be substituted in. # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL self.interleave = interleave + self.max_images = max_images # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! self.image_token_id = ( int(image_token_id) @@ -214,7 +222,9 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: text = content["content"] # Count and remove image placeholders - image_count = text.count(DEFAULT_IMAGE_PLACEHOLDER) + image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "") # Add image entries @@ -230,17 +240,16 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: c = [] text = content["content"] expected_image_count = text.count(DEFAULT_IMAGE_PLACEHOLDER) - if expected_image_count > 1: - print("hello") actual_image_count = 0 text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER) for i, part in enumerate(text_parts): + # TODO: concatenate text parts (esp. if skipping images)? if part: # Add non-empty text parts c.append({"type": "text", "text": part}) if ( - i < len(text_parts) - 1 + (i < len(text_parts) - 1) and i < self.max_images ): # Add image placeholder after each split except the last c.append({"type": "image"}) actual_image_count += 1 @@ -269,7 +278,9 @@ def tok_batch_multimodal_encode( # NOTE: here, we replace tags with our model's corresponding image_token string value. 
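The substitution the NOTE above refers to is handled by `replace_placeholders`, the helper this commit adds to `lm_eval/models/utils.py` (shown just below): it swaps at most `max_count` occurrences of the generic image placeholder for the model's own image token and silently drops any excess placeholders. A small usage illustration; the placeholder and token strings here are invented for the example:

    from lm_eval.models.utils import replace_placeholders

    replace_placeholders("[IMG] a [IMG] b",
                         placeholder="[IMG]", replacement="<img_tok>", max_count=1)
    # -> '<img_tok> a  b'
    # The second placeholder is dropped, leaving a doubled space; that leftover
    # whitespace is the issue flagged by the TODO in the class __init__ above.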
if not self.chat_applied: strings = [ - string.replace(DEFAULT_IMAGE_PLACEHOLDER, self.image_token) + replace_placeholders( + string, DEFAULT_IMAGE_PLACEHOLDER, self.image_token, self.max_images + ) for string in strings ] @@ -279,6 +290,8 @@ def tok_batch_multimodal_encode( # add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + images = [img[: self.max_images] for img in images] + encoding = self.processor( images=images, text=strings, diff --git a/lm_eval/models/utils.py b/lm_eval/models/utils.py index 8a81e5deca..7afbf085ee 100644 --- a/lm_eval/models/utils.py +++ b/lm_eval/models/utils.py @@ -664,3 +664,21 @@ def configure_pad_token( tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) return tokenizer + + +def replace_placeholders(string, placeholder, replacement, max_count): + count = 0 + result = "" + start = 0 + while True: + index = string.find(placeholder, start) + if index == -1: + result += string[start:] + break + if count < max_count: + result += string[start:index] + replacement + count += 1 + else: + result += string[start:index] + start = index + len(placeholder) + return result From f1185a2e2161795e5771a11dad1d02b6c636b147 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 09:30:19 +0500 Subject: [PATCH 37/60] nit --- lm_eval/models/hf_vlms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index f5221add3a..559c99e47f 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -239,7 +239,9 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: for content in chat_history: c = [] text = content["content"] - expected_image_count = text.count(DEFAULT_IMAGE_PLACEHOLDER) + expected_image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) actual_image_count = 0 text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER) From c67d8107da3889eb858c87dd471f8af0aa5dd12c Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 10:05:49 +0500 Subject: [PATCH 38/60] testmini_mcq --- lm_eval/tasks/mathvista/testmini.yaml | 22 +++++++++------------- lm_eval/tasks/mathvista/utils.py | 5 +++++ 2 files changed, 14 insertions(+), 13 deletions(-) create mode 100644 lm_eval/tasks/mathvista/utils.py diff --git a/lm_eval/tasks/mathvista/testmini.yaml b/lm_eval/tasks/mathvista/testmini.yaml index abaf81a665..b650eb6ea2 100644 --- a/lm_eval/tasks/mathvista/testmini.yaml +++ b/lm_eval/tasks/mathvista/testmini.yaml @@ -1,20 +1,16 @@ dataset_path: AI4Math/MathVista -test_split: testmini -output_type: generate_until +test_split: testmini_mcq +output_type: multiple_choice +process_docs: !function utils.process_docs doc_to_image: !function utils.doc_to_image -doc_to_text: " {{query}}" -doc_to_target: "answer" -# TODO: add process_results. 
multiple-choice question_type need to be handled differently -process_results: !function utils.process_results -generation_kwargs: - until: - - "<|endoftext|>" - temperature: 0.0 - do_sample: false - max_gen_toks: 512 +doc_to_text: "{{query}}" +doc_to_choice: '{{ ["A", "B", "C", "D", "E", "F"][:choices.length] }}' +doc_to_target: "{{choices.index(answer)}}" metric_list: - metric: acc aggregation: mean higher_is_better: true metadata: - version: 0.0 + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mathvista/utils.py b/lm_eval/tasks/mathvista/utils.py new file mode 100644 index 0000000000..19c64035ea --- /dev/null +++ b/lm_eval/tasks/mathvista/utils.py @@ -0,0 +1,5 @@ +import datasets + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + return dataset.filter(lambda x: x["question_type"].strip() == "multi_choice") From 5848698eaa8fc7073560d55eac23482a01dce88b Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 10:11:47 +0500 Subject: [PATCH 39/60] nit --- lm_eval/tasks/mathvista/testmini.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/mathvista/testmini.yaml b/lm_eval/tasks/mathvista/testmini.yaml index b650eb6ea2..52b5384041 100644 --- a/lm_eval/tasks/mathvista/testmini.yaml +++ b/lm_eval/tasks/mathvista/testmini.yaml @@ -1,5 +1,6 @@ dataset_path: AI4Math/MathVista -test_split: testmini_mcq +task: mathvista_mcq +test_split: testmini output_type: multiple_choice process_docs: !function utils.process_docs doc_to_image: !function utils.doc_to_image From 294dc01fb3b1a45aa356d905e926a6bd9c0ded94 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 11:56:08 +0500 Subject: [PATCH 40/60] pass image string; convert img --- lm_eval/models/hf_vlms.py | 42 +++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 559c99e47f..b0c61a6651 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -37,9 +37,11 @@ def __init__( self, pretrained: Union[str, transformers.PreTrainedModel], image_token_id: Optional[int] = None, + image_string="", interleave: bool = True, # TODO: hamdle whitespace in image placeholder (replacement) max_images: Optional[int] = 999, + convert_img_format=False, **kwargs, ): # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer @@ -57,26 +59,30 @@ def __init__( # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL self.interleave = interleave self.max_images = max_images + self.rgb = convert_img_format # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! - self.image_token_id = ( - int(image_token_id) - if image_token_id - else ( - getattr(self.config, "image_token_id", None) - or getattr(self.config, "image_token_index", None) + if not image_string: + self.image_token_id = ( + int(image_token_id) + if image_token_id + else ( + getattr(self.config, "image_token_id", None) + or getattr(self.config, "image_token_index", None) + ) ) - ) - assert ( - self.image_token_id is not None - ), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one." 
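By this point in the series the user-facing knobs for the prototype are in place: `image_token_id` or `image_string` to control the substituted token, plus `interleave`, `max_images`, and `convert_img_format`, all ordinary constructor kwargs that can be passed through `--model_args`. A hypothetical smoke test via the Python entry point, assuming the standard `lm_eval.simple_evaluate` API; the checkpoint id and the `limit` value are placeholders, not anything prescribed by the patch:

    import lm_eval

    results = lm_eval.simple_evaluate(
        model="hf-multimodal",
        model_args="pretrained=<vision2seq-checkpoint>,max_images=2,convert_img_format=True",
        tasks=["mmmu_val"],
        limit=8,  # tiny subset, just to exercise the pipeline
    )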
- # get the string this token ID corresponds to - self.image_token = self.tok_decode( - [self.image_token_id], skip_special_tokens=False - ) - if image_token_id is not None: - eval_logger.info( - f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!" + assert ( + self.image_token_id is not None + ), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one." + # get the string this token ID corresponds to + self.image_token = self.tok_decode( + [self.image_token_id], skip_special_tokens=False ) + if image_token_id is not None: + eval_logger.info( + f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!" + ) + else: + self.image_token = image_string def _create_tokenizer( self, @@ -293,6 +299,8 @@ def tok_batch_multimodal_encode( # add_special_tokens = {"add_special_tokens": False or self.add_bos_token} images = [img[: self.max_images] for img in images] + if self.rgb: + images = [[img.convert("RGB") for img in sublist] for sublist in images] encoding = self.processor( images=images, From b45d2954f1be45d0c7b925fc50066972e9a0d47d Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 12:48:56 +0500 Subject: [PATCH 41/60] add vllm --- lm_eval/models/vllm_vlms.py | 286 ++++++++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 lm_eval/models/vllm_vlms.py diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py new file mode 100644 index 0000000000..94b3da6b30 --- /dev/null +++ b/lm_eval/models/vllm_vlms.py @@ -0,0 +1,286 @@ +import copy +from typing import Dict, List, Optional + +import transformers +from more_itertools import distribute +from tqdm import tqdm + +from lm_eval.api.instance import Instance +from lm_eval.api.registry import register_model +from lm_eval.models.utils import Collator, undistribute +from lm_eval.models.vllm_causallms import VLLM + + +try: + import ray + from vllm import LLM, SamplingParams + from vllm.lora.request import LoRARequest # noqa: F401 + from vllm.transformers_utils.tokenizer import get_tokenizer # noqa: F401 +except ModuleNotFoundError: + pass + + +DEFAULT_IMAGE_PLACEHOLDER = "" + + +@register_model("vllm-vlm") +class VLLM_VLM(VLLM): + def __init__( + self, + pretrained: str, + trust_remote_code: Optional[bool] = False, + revision: Optional[str] = None, + interleave: bool = True, + max_images: int = 999, + **kwargs, + ): + super().__init__( + pretrained=pretrained, + trust_remote_code=trust_remote_code, + revision=revision, + **kwargs, + ) + self.interleave = interleave + self.max_images = max_images + self.processor = transformers.AutoProcessor.from_pretrained( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + ) + self.chat_applied: bool = False + + def tok_batch_multimodal_encode( + self, + strings: List[str], # note that input signature of this fn is different + images, # TODO: typehint on this + left_truncate_len: int = None, + truncation: bool = False, + ): + outputs = [] + for x, i in zip(strings, images): + inputs = { + "prompt": x, + "multi_modal_data": {"image": i}, + } + outputs.append(inputs) + 
return outputs + + def _model_generate( + self, + requests: List[List[dict]] = None, + generate: bool = False, + max_tokens: int = None, + stop: Optional[List[str]] = None, + **kwargs, + ): + if generate: + kwargs = self.modify_gen_kwargs(kwargs) + sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs) + else: + sampling_params = SamplingParams( + temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False + ) + if self.data_parallel_size > 1: + # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote + # also seems to only work with decorator and not with ray.remote() fn + # see https://github.com/vllm-project/vllm/issues/973 + # note: this has changed on 0.3.3, and it only works now if num_gpus are set. + # but then tensor_parallel breaks + @ray.remote + def run_inference_one_model( + model_args: dict, sampling_params, requests: List[List[dict]] + ): + llm = LLM(**model_args) + return llm.generate(requests, sampling_params=sampling_params) + + # dispatch requests to all self.data_parallel_size workers, in interleaved fashion + # interleaved important to balance context lengths across workers + requests = [list(x) for x in distribute(self.data_parallel_size, requests)] + inputs = ((self.model_args, sampling_params, req) for req in requests) + object_refs = [run_inference_one_model.remote(*x) for x in inputs] + results = ray.get(object_refs) + # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required. + ray.shutdown() + # flatten results + return undistribute(results) + + if self.lora_request is not None: + outputs = self.model.generate( + requests, + sampling_params=sampling_params, + use_tqdm=True if self.batch_size == "auto" else False, + lora_request=self.lora_request, + ) + else: + outputs = self.model.generate( + requests, + sampling_params=sampling_params, + use_tqdm=True if self.batch_size == "auto" else False, + ) + return outputs + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + self.chat_applied = True + if not self.interleave: + for content in chat_history: + c = [] + text = content["content"] + + # Count and remove image placeholders + image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) + text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "") + + # Add image entries + for _ in range(image_count): + c.append({"type": "image", "image": None}) + + # Add single text entry at the end + c.append({"type": "text", "text": text}) + + content["content"] = c + else: + for content in chat_history: + c = [] + text = content["content"] + expected_image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) + actual_image_count = 0 + + text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER) + + for i, part in enumerate(text_parts): + # TODO: concatenate text parts (esp. if skipping images)? + if part: # Add non-empty text parts + c.append({"type": "text", "text": part}) + if ( + (i < len(text_parts) - 1) and i < self.max_images + ): # Add image placeholder after each split except the last + c.append({"type": "image"}) + actual_image_count += 1 + + content["content"] = c + + if actual_image_count != expected_image_count: + raise ValueError( + f"Mismatch in image placeholder count. 
Expected: {expected_image_count}, Actual: {actual_image_count}" + ) + + return self.processor.apply_chat_template( + chat_history, add_generation_prompt=True + ) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + # TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs) + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running generate_until requests with text+image input", + ) + # TODO: port auto-batch sizing into this. + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = Collator( + [reg.args for reg in requests], + _collate, + group_by="gen_kwargs", + group_fn=lambda x: x[1], + ) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + + ### Up to here: was identical to non-multimodal HFLM generate_until ### + + for chunk in chunks: + contexts, all_gen_kwargs, aux_arguments = zip(*chunk) + + visuals = [arg["visual"] for arg in aux_arguments] + + if not isinstance(contexts, list): + contexts = list( + contexts + ) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list. + # TODO: could we upstream this workaround to HF? + ### this part onward: same as HFLM ### + + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. + until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" + ) + # add EOS token to stop sequences + eos = self.tokenizer.decode(self.eot_token_id) + if not until: + until = [eos] + else: + until.append(eos) + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + ### end stuff that's entirely copied verbatim from HFLM ### + + max_ctx_len = self.max_length - max_gen_toks + + inputs = self.tok_batch_multimodal_encode( + contexts, + visuals, + left_truncate_len=max_ctx_len, + ) + + cont = self._model_generate(inputs, stop=until, **kwargs) + + ### essentially same as HFLM beyond this line! 
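Each element of `inputs` here is a plain vLLM multimodal request: the templated prompt string plus the PIL images under `multi_modal_data`, exactly as assembled by `tok_batch_multimodal_encode` above. A stand-in example of one such request; the prompt text and image are invented, since in the harness they come from the task template (or chat template) and the doc's `doc_to_image` output:

    from PIL import Image

    request = {
        "prompt": "USER: [image]\nWhat is shown in the picture?\nASSISTANT:",
        "multi_modal_data": {"image": [Image.new("RGB", (64, 64))]},
    }
    # _model_generate forwards a list of such dicts to
    # self.model.generate(requests, sampling_params=...).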
+ + for s, context in zip(cont, contexts): + # discard context + left-padding toks if using causal decoder-only VLM + + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore '' separator, + # for seq2seq case where self.tok_decode(self.eot_token_id) = '' + s = s.split(term)[0] + + res.append(s) + self.cache_hook.add_partial( + "generate_until", (context, gen_kwargs), s + ) # TODO: cache key for multimodal input should be what? + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res From db554e0300667eb3ede721cbe3047c3227bb98c2 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 12:53:56 +0500 Subject: [PATCH 42/60] add init --- lm_eval/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py index 657932cb6b..553be3a5fd 100644 --- a/lm_eval/models/__init__.py +++ b/lm_eval/models/__init__.py @@ -13,6 +13,7 @@ optimum_lm, textsynth, vllm_causallms, + vllm_vlms, ) From 90b460182835114b67d701076e0996af66102af3 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 13:08:26 +0500 Subject: [PATCH 43/60] vlm add multi attr --- lm_eval/evaluator.py | 2 +- lm_eval/models/vllm_vlms.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 0f703b7087..96e9562c54 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -427,7 +427,7 @@ def evaluate( if len(incompatible_tasks) > 0: if not getattr(lm, "MULTIMODAL", False): raise ValueError( - f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the 'hf-multimodal' model type." + f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type." ) else: raise ValueError( diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py index 94b3da6b30..8b09aff9e2 100644 --- a/lm_eval/models/vllm_vlms.py +++ b/lm_eval/models/vllm_vlms.py @@ -25,6 +25,8 @@ @register_model("vllm-vlm") class VLLM_VLM(VLLM): + MULTIMODAL = True + def __init__( self, pretrained: str, From 94731a75e0ace38fcac369e9829642f7072552c4 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 13:13:05 +0500 Subject: [PATCH 44/60] fixup --- lm_eval/models/vllm_vlms.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py index 8b09aff9e2..31d4452b4a 100644 --- a/lm_eval/models/vllm_vlms.py +++ b/lm_eval/models/vllm_vlms.py @@ -264,22 +264,12 @@ def _collate(x): cont = self._model_generate(inputs, stop=until, **kwargs) - ### essentially same as HFLM beyond this line! 
- - for s, context in zip(cont, contexts): - # discard context + left-padding toks if using causal decoder-only VLM - - # use secondary stop seqs to cut off should-have-been-stopped content post-hoc - for term in until: - if len(term) > 0: - # ignore '' separator, - # for seq2seq case where self.tok_decode(self.eot_token_id) = '' - s = s.split(term)[0] - - res.append(s) + for output, context in zip(cont, contexts): + generated_text = output.outputs[0].text + res.append(generated_text) self.cache_hook.add_partial( - "generate_until", (context, gen_kwargs), s - ) # TODO: cache key for multimodal input should be what? + "generate_until", (context, gen_kwargs), generated_text + ) pbar.update(1) # reorder this group of results back to original unsorted form res = re_ords.get_original(res) From d5cbc48713e1e4be620d09f59c5e614d785cdc47 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 13:47:06 +0500 Subject: [PATCH 45/60] pass max images to vllm model init --- lm_eval/models/vllm_vlms.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py index 31d4452b4a..891047a3b1 100644 --- a/lm_eval/models/vllm_vlms.py +++ b/lm_eval/models/vllm_vlms.py @@ -33,9 +33,12 @@ def __init__( trust_remote_code: Optional[bool] = False, revision: Optional[str] = None, interleave: bool = True, + # TODO: handle max_images and limit_mm_per_prompt better max_images: int = 999, + limit_mm_per_prompt: str = "image=1", **kwargs, ): + kwargs["limit_mm_per_prompt"] = limit_mm_per_prompt super().__init__( pretrained=pretrained, trust_remote_code=trust_remote_code, @@ -58,6 +61,8 @@ def tok_batch_multimodal_encode( left_truncate_len: int = None, truncation: bool = False, ): + images = [img[: self.max_images] for img in images] + outputs = [] for x, i in zip(strings, images): inputs = { @@ -262,7 +267,7 @@ def _collate(x): left_truncate_len=max_ctx_len, ) - cont = self._model_generate(inputs, stop=until, **kwargs) + cont = self._model_generate(inputs, stop=until, generate=True, **kwargs) for output, context in zip(cont, contexts): generated_text = output.outputs[0].text From 3b4ce83b1d31f5bf8e2a6a9ea0aa7211deb94f19 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 13:57:32 +0500 Subject: [PATCH 46/60] nit --- lm_eval/models/vllm_vlms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py index 891047a3b1..27202e9305 100644 --- a/lm_eval/models/vllm_vlms.py +++ b/lm_eval/models/vllm_vlms.py @@ -9,6 +9,7 @@ from lm_eval.api.registry import register_model from lm_eval.models.utils import Collator, undistribute from lm_eval.models.vllm_causallms import VLLM +from lm_eval.utils import simple_parse_args_string try: @@ -38,7 +39,7 @@ def __init__( limit_mm_per_prompt: str = "image=1", **kwargs, ): - kwargs["limit_mm_per_prompt"] = limit_mm_per_prompt + kwargs["limit_mm_per_prompt"] = simple_parse_args_string(limit_mm_per_prompt) super().__init__( pretrained=pretrained, trust_remote_code=trust_remote_code, From 357cf640f6cc2be86a75baf8c463f85b04eafea3 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 15:34:08 +0500 Subject: [PATCH 47/60] encoding to device --- lm_eval/models/hf_vlms.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index b0c61a6651..0c014d871b 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -5,6 +5,7 @@ import torch.nn.functional as 
F import transformers from tqdm import tqdm +from transformers import BatchEncoding from lm_eval import utils from lm_eval.api.instance import Instance @@ -276,14 +277,15 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: def tok_batch_multimodal_encode( self, strings: List[str], # note that input signature of this fn is different - images, # TODO: typehint on this + images: List[List], # TODO: images are pil.I padding_side: str = "left", left_truncate_len: int = None, truncation: bool = False, - ) -> Dict[ - str, torch.Tensor + ) -> Union[ + BatchEncoding, Dict[str, torch.Tensor] ]: # note that this return signature differs from HFLM tok_batch_encode. # NOTE: here, we replace tags with our model's corresponding image_token string value. + # Moves the encodings to device if not self.chat_applied: strings = [ replace_placeholders( @@ -313,7 +315,7 @@ def tok_batch_multimodal_encode( encoding.to( self.device, self.model.dtype - ) # TODO: casting to dtype seems odd for input_ids and attn_mask. + ) # TODO: This only casts the pixel values. Should they always be float16? if left_truncate_len: encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] encoding["attention_mask"] = encoding["attention_mask"][ @@ -686,7 +688,7 @@ def _collate(x): visuals, left_truncate_len=max_ctx_len, truncation=self.truncation, - ).to(self.device, self.model.dtype) + ) context_enc = inputs["input_ids"] From f9cf90e9f905286adb44421d94e54e2cefc015c3 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 12:22:09 +0000 Subject: [PATCH 48/60] fix HFMultimodalLM.chat_template ? --- lm_eval/models/hf_vlms.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index cf2ec8251f..6316a7e843 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -118,32 +118,6 @@ def _create_tokenizer( self.tokenizer = self.processor.tokenizer - # def tok_encode( - # self, string: str, left_truncate_len=None, add_special_tokens=None - # ) -> List[int]: - # """ """ - # # default for None - empty dict, use predefined tokenizer param - # # used for all models except for CausalLM or predefined value - # special_tokens_kwargs = {} - - # # by default for CausalLM - false or self.add_bos_token is set - # if add_special_tokens is None: - # if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - # special_tokens_kwargs = { - # "add_special_tokens": False or self.add_bos_token - # } - # # otherwise the method explicitly defines the value - # else: - # special_tokens_kwargs = {"add_special_tokens": add_special_tokens} - - # encoding = self.tokenizer.encode(string, **special_tokens_kwargs) - - # # left-truncate the encoded context to be at most `left_truncate_len` tokens long - # if left_truncate_len: - # encoding = encoding[-left_truncate_len:] - - # return encoding - def tok_multimodal_encode( self, string, images, left_truncate_len=None, add_special_tokens=None ): @@ -227,6 +201,18 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: chat_history, add_generation_prompt=True ) + def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]: + if hasattr(self.processor, "apply_chat_template"): + _tokenizer = self.tokenizer + self.tokenizer = self.processor + + selected_template = super().chat_template(chat_template) + + self.tokenizer = _tokenizer + return selected_template + else: + return 
super().chat_template(chat_template) + def tok_batch_multimodal_encode( self, strings: List[str], # note that input signature of this fn is different @@ -629,6 +615,7 @@ def _collate(x): max_ctx_len = self.max_length - max_gen_toks + print(contexts, visuals) inputs = self.tok_batch_multimodal_encode( contexts, visuals, From 8349e12a0c0ae9e63ed82cbbe9fbd8f4d360bc0e Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 17:39:12 +0500 Subject: [PATCH 49/60] add mmmu readme --- lm_eval/tasks/mmmu/README.md | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 lm_eval/tasks/mmmu/README.md diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md new file mode 100644 index 0000000000..a01c528f13 --- /dev/null +++ b/lm_eval/tasks/mmmu/README.md @@ -0,0 +1,78 @@ +# MMMU Benchmark + +### Paper + +Title: `MMMU: A Massive Multi-discipline MultimodalUnderstanding and Reasoning Benchmark for Expert AGI` + +Abstract: `MMMU is a new benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning.` + +`The benchmark is composed of 30 tasks, for a total of 900 mixed image+text examples (some with multiple images in context)` + +Homepage: `https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu` + +Note: Some questions have multiple images in context. To control for this use `max_images=N` in model init. + +### Citation + +``` +@inproceedings{yue2023mmmu, + title={MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI}, + author={Xiang Yue and Yuansheng Ni and Kai Zhang and Tianyu Zheng and Ruoqi Liu and Ge Zhang and Samuel Stevens and Dongfu Jiang and Weiming Ren and Yuxuan Sun and Cong Wei and Botao Yu and Ruibin Yuan and Renliang Sun and Ming Yin and Boyuan Zheng and Zhenzhu Yang and Yibo Liu and Wenhao Huang and Huan Sun and Yu Su and Wenhu Chen}, + booktitle={Proceedings of CVPR}, + year={2024}, + } +``` + +### Groups, Tags, and Tasks + +#### Groups + +* `mmmu_val` +* `mmmu_val_art_and_design` +* `mmmu_val_business` +* `mmmu_val_health_and_medicine` +* `mmmu_val_humanities_and_social_science` +* `mmmu_val_science` +* `mmmu_val_tech_and_engineering` + +#### Tags + + +#### Tasks + +* `mmmu_val_accounting` +* `mmmu_val_agriculture` +* `mmmu_val_architecture_and_engineering.yaml` +* `mmmu_val_art` +* `mmmu_val_art_theory` +* `mmmu_val_basic_medical_science` +* `mmmu_val_biology` +* `mmmu_val_chemistry` +* `mmmu_val_computer_science` +* `mmmu_val_clinical_medicine` +* `mmmu_val_design` +* `mmmu_val_diagnostics_and_laboratory_medicine` +* `mmmu_val_electronics` +* `mmmu_val_energy_and_power` +* `mmmu_val_economics` +* `mmmu_val_finance` +* `mmmu_val_geography` +* `mmmu_val_history` +* ... + +### Variants + + + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
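+
+### Example usage
+
+To make the `max_images=N` note above concrete, a run over the validation group with the `hf-multimodal` model type looks roughly like the sketch below (the checkpoint, `max_images` value, and batch size are illustrative placeholders, not recommendations):
+
+```
+lm_eval --model hf-multimodal \
+    --model_args pretrained=Qwen/Qwen2-VL-2B-Instruct,max_images=2 \
+    --tasks mmmu_val \
+    --batch_size 2 \
+    --apply_chat_template
+```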
From 05f0dd6e5fb0bb6ea1f90a5494b340b23574cd5a Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 13:16:24 +0000 Subject: [PATCH 50/60] remove erroneous prints --- lm_eval/models/hf_vlms.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index c086c03364..fe391eb62c 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -361,7 +361,6 @@ def _batch_images(self, image_encs): ], dim=0, ) - print(batched_imgs) return batched_imgs def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: @@ -669,7 +668,6 @@ def _collate(x): max_ctx_len = self.max_length - max_gen_toks - print(contexts, visuals) inputs = self.tok_batch_multimodal_encode( contexts, visuals, From 9ddb2eca634d4cab215c2b2b66c15ec63f17bd82 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 13:25:10 +0000 Subject: [PATCH 51/60] use HFMultimodalLM.chat_template ; restore tasks/__init__.py --- lm_eval/evaluator.py | 4 +--- lm_eval/models/vllm_vlms.py | 7 +------ lm_eval/tasks/__init__.py | 14 ++++++-------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 96e9562c54..4f4c8f7fba 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -294,9 +294,7 @@ def _adjust_config(task_dict): model_source=model, model_args=model_args, system_instruction=system_instruction, - # TODO: change this back - # chat_template=lm.chat_template(apply_chat_template), - chat_template=None, + chat_template=lm.chat_template(apply_chat_template), fewshot_as_multiturn=fewshot_as_multiturn, ) diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py index 27202e9305..5a1260901c 100644 --- a/lm_eval/models/vllm_vlms.py +++ b/lm_eval/models/vllm_vlms.py @@ -183,7 +183,7 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: def generate_until( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[str]: - # TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs) + # TODO: support text-only reqs res = [] def _collate(x): @@ -214,8 +214,6 @@ def _collate(x): ) chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) - ### Up to here: was identical to non-multimodal HFLM generate_until ### - for chunk in chunks: contexts, all_gen_kwargs, aux_arguments = zip(*chunk) @@ -226,7 +224,6 @@ def _collate(x): contexts ) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list. # TODO: could we upstream this workaround to HF? - ### this part onward: same as HFLM ### # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. 
@@ -258,8 +255,6 @@ def _collate(x): else: max_gen_toks = self.max_gen_toks - ### end stuff that's entirely copied verbatim from HFLM ### - max_ctx_len = self.max_length - max_gen_toks inputs = self.tok_batch_multimodal_encode( diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index c7935cb3f0..7eeec8bbc6 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -253,13 +253,12 @@ def _load_individual_task_or_group( name_or_config: Optional[Union[str, dict]] = None, parent_name: Optional[str] = None, update_config: Optional[dict] = None, - yaml_path: Optional[str] = None, ) -> Mapping: - def _load_task(config, task, yaml_path=None): + def _load_task(config, task): if "include" in config: config = { **utils.load_yaml_config( - yaml_path=yaml_path, + yaml_path=None, yaml_config={"include": config.pop("include")}, mode="full", ), @@ -275,6 +274,7 @@ def _load_task(config, task, yaml_path=None): task_object.config.task = config["task"] else: task_object = ConfigurableTask(config=config) + return {task: task_object} def _get_group_and_subtask_from_config(config): @@ -316,7 +316,6 @@ def _process_group_config(config, update_config=None): group_name, subtask_list = _get_group_and_subtask_from_config( group_config ) - yaml_path = self._get_yaml_path(group_name.group) else: if self._name_is_tag(name_or_config): fn = partial( @@ -335,8 +334,7 @@ def _process_group_config(config, update_config=None): if isinstance(name_or_config, dict): if self._config_is_task(name_or_config): - # name = name_or_config.pop("task") - name = name_or_config["task"] + name = name_or_config.pop("task") if update_config is not None: name_or_config = {**name_or_config, **update_config} # If the name is registered as a group @@ -380,7 +378,7 @@ def _process_group_config(config, update_config=None): } else: task_config = name_or_config - return _load_task(task_config, task=name, yaml_path=yaml_path) + return _load_task(task_config, task=name) else: group_config, update_config = _process_group_config(name_or_config) group_name, subtask_list = _get_group_and_subtask_from_config( @@ -391,7 +389,6 @@ def _process_group_config(config, update_config=None): self._load_individual_task_or_group, parent_name=group_name, update_config=update_config, - yaml_path=yaml_path, ) return { group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list)))) @@ -651,4 +648,5 @@ def get_task_dict( # and we'd be unsure which to use and report.) # we explicitly check and error in this case. _check_duplicates(get_subtask_list(final_task_dict)) + return final_task_dict From d1aadbc0113a8f94decd82bdfb50cc191e803cfc Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 13:29:23 +0000 Subject: [PATCH 52/60] add docstring for replace_placeholders in utils --- lm_eval/models/utils.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/lm_eval/models/utils.py b/lm_eval/models/utils.py index 7afbf085ee..8c150b3d76 100644 --- a/lm_eval/models/utils.py +++ b/lm_eval/models/utils.py @@ -666,19 +666,29 @@ def configure_pad_token( return tokenizer -def replace_placeholders(string, placeholder, replacement, max_count): +def replace_placeholders( + string_: str, placeholder: str, replacement: str, max_count: int +): + """ + A utility function used for local multimodal models. It locates all `placeholder` string + occurrences in the given input `string_` and replaces the first `max_count` instances with + `replacement`, and all subsequent occurrences with the empty string. 
+ + This is used to replace placeholder tags by model-specific image tokens like <|image_pad|> + and to allow for only the first `max_count` images to be passed to a model if desired. + """ count = 0 result = "" start = 0 while True: - index = string.find(placeholder, start) + index = string_.find(placeholder, start) if index == -1: - result += string[start:] + result += string_[start:] break if count < max_count: - result += string[start:index] + replacement + result += string_[start:index] + replacement count += 1 else: - result += string[start:index] + result += string_[start:index] start = index + len(placeholder) return result From 19d6874072ba3b6f6122b393a35d5e5374ac80b4 Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 18:32:35 +0500 Subject: [PATCH 53/60] fix `replace_placeholders`; set image_string=None --- lm_eval/models/hf_vlms.py | 3 ++- lm_eval/models/utils.py | 34 ++++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index fe391eb62c..e41e546698 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -38,7 +38,7 @@ def __init__( self, pretrained: Union[str, transformers.PreTrainedModel], image_token_id: Optional[int] = None, - image_string="", + image_string: Optional[str] = None, interleave: bool = True, # TODO: hamdle whitespace in image placeholder (replacement) max_images: Optional[int] = 999, @@ -273,6 +273,7 @@ def tok_batch_multimodal_encode( # NOTE: here, we replace tags with our model's corresponding image_token string value. # Moves the encodings to device if not self.chat_applied: + # TODO: This still keeps the whitespace in the image placeholder, which is not ideal. strings = [ replace_placeholders( string, DEFAULT_IMAGE_PLACEHOLDER, self.image_token, self.max_images diff --git a/lm_eval/models/utils.py b/lm_eval/models/utils.py index 8c150b3d76..72fac03811 100644 --- a/lm_eval/models/utils.py +++ b/lm_eval/models/utils.py @@ -667,7 +667,7 @@ def configure_pad_token( def replace_placeholders( - string_: str, placeholder: str, replacement: str, max_count: int + string: str, default_placeholder: str, image_token: str, max_images: int ): """ A utility function used for local multimodal models. It locates all `placeholder` string @@ -676,19 +676,25 @@ def replace_placeholders( This is used to replace placeholder tags by model-specific image tokens like <|image_pad|> and to allow for only the first `max_count` images to be passed to a model if desired. + + :param string: The original string containing placeholders. + :param default_placeholder: The placeholder text to be replaced. + :param image_token: The token to replace the placeholder with. + :param max_images: The maximum number of replacements to make. + :return: The string with placeholders replaced. 
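+
+    Illustrative example (the `<image>` placeholder and `<|image_pad|>` token are assumed
+    example values here, not requirements):
+
+        replace_placeholders("<image> Describe the scene. <image>", "<image>", "<|image_pad|>", 2)
+        # -> "<|image_pad|> Describe the scene. <|image_pad|>"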
""" count = 0 - result = "" - start = 0 - while True: - index = string_.find(placeholder, start) - if index == -1: - result += string_[start:] - break - if count < max_count: - result += string_[start:index] + replacement + result = [] + + parts = string.split(default_placeholder) + for part in parts[:-1]: # Iterate through all but the last part + result.append(part) + if count < max_images: + result.append(image_token) count += 1 - else: - result += string_[start:index] - start = index + len(placeholder) - return result + elif default_placeholder != image_token: + result.append(default_placeholder) + + # Add the last part of the string + result.append(parts[-1]) + return "".join(result) From 5665bc607249ec8845043b157b8f33a74742b39d Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 13:44:57 +0000 Subject: [PATCH 54/60] fix typo --- lm_eval/models/hf_vlms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index fe391eb62c..2b7258da70 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -40,7 +40,7 @@ def __init__( image_token_id: Optional[int] = None, image_string="", interleave: bool = True, - # TODO: hamdle whitespace in image placeholder (replacement) + # TODO: handle whitespace in image placeholder (replacement) max_images: Optional[int] = 999, convert_img_format=False, **kwargs, @@ -263,7 +263,7 @@ def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str def tok_batch_multimodal_encode( self, strings: List[str], # note that input signature of this fn is different - images: List[List], # TODO: images are pil.I + images: List[List], # TODO: images are pil.Image at the moment, update typehint padding_side: str = "left", left_truncate_len: int = None, truncation: bool = False, From 31e5ab0ecee02f0b8a866a6cd8a1db5a605d77cc Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 14:39:10 +0000 Subject: [PATCH 55/60] cleanup + fix merge conflicts --- README.md | 2 +- lm_eval/models/hf_vlms.py | 3 +-- lm_eval/models/huggingface.py | 25 ++----------------------- 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 53a34c39eb..9f26c811d1 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ *Latest News 📣* -- [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` model type and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features. +- [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` and `vllm-vlm` model types and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features. 
- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.** - [2024/07] New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group. diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 2b7258da70..ba61ca31fd 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -271,7 +271,6 @@ def tok_batch_multimodal_encode( BatchEncoding, Dict[str, torch.Tensor] ]: # note that this return signature differs from HFLM tok_batch_encode. # NOTE: here, we replace tags with our model's corresponding image_token string value. - # Moves the encodings to device if not self.chat_applied: strings = [ replace_placeholders( @@ -299,7 +298,7 @@ def tok_batch_multimodal_encode( # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added? ) - encoding.to( + encoding.to( # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention self.device, self.model.dtype ) # TODO: This only casts the pixel values. Should they always be float16? if left_truncate_len: diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 20f17cb63c..63ad1440bd 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -30,6 +30,7 @@ from lm_eval.models.utils import ( Collator, clear_torch_cache, + configure_pad_token, get_dtype, pad_and_concat, stop_sequences_criteria, @@ -204,30 +205,8 @@ def __init__( self.logits_cache = logits_cache self.vocab_size = self.tokenizer.vocab_size # select (or create) a pad token to use - if self.tokenizer.pad_token: - pass - elif self.tokenizer.unk_token: - self.tokenizer.pad_token_id = self.tokenizer.unk_token_id - elif self.tokenizer.eos_token: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - else: - if getattr(self.config, "model_type", None) == "qwen": - # Qwen's trust_remote_code tokenizer does not allow for adding special tokens - self.tokenizer.pad_token = "<|endoftext|>" - elif ( - self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer" - or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer" - ): - # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) - # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer - # --- - # Note that the world tokenizer class name, might change in the future for the final huggingface merge - # https://github.com/huggingface/transformers/pull/26963 - assert self.tokenizer.pad_token_id == 0 - else: - self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) - # TODO: override this for Gemma self.add_bos_token = add_bos_token if "gemma" in getattr(self.config, "model_type", ""): self.add_bos_token = True From c0b585de4afab4dc4aa631a59f194d14efa7f69c Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 14:53:54 +0000 Subject: [PATCH 56/60] update MMMU readme --- lm_eval/tasks/mmmu/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/lm_eval/tasks/mmmu/README.md 
b/lm_eval/tasks/mmmu/README.md index a01c528f13..814e5344da 100644 --- a/lm_eval/tasks/mmmu/README.md +++ b/lm_eval/tasks/mmmu/README.md @@ -62,6 +62,7 @@ Note: Some questions have multiple images in context. To control for this use `m ### Variants +The `mmmu_val` group implements MMMU using processing code [from the original MMMU authors](https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu) and uses the prompt format found in [the MMMU repository for Llava-1.5](https://github.com/MMMU-Benchmark/MMMU/blob/main/mmmu/configs/llava1.5.yaml). This implementation should give scores on par with or slightly higher than those reported by [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/mmmu) for `mmmu_val` and the MMMU repository code. ### Checklist From 5c0bd546bbc7f76b7c39c8d461414bf5997d58af Mon Sep 17 00:00:00 2001 From: Baber Date: Fri, 13 Sep 2024 22:33:50 +0500 Subject: [PATCH 57/60] del mathvista --- lm_eval/tasks/mathvista/testmini.yaml | 17 ----------------- lm_eval/tasks/mathvista/utils.py | 5 ----- 2 files changed, 22 deletions(-) delete mode 100644 lm_eval/tasks/mathvista/testmini.yaml delete mode 100644 lm_eval/tasks/mathvista/utils.py diff --git a/lm_eval/tasks/mathvista/testmini.yaml b/lm_eval/tasks/mathvista/testmini.yaml deleted file mode 100644 index 52b5384041..0000000000 --- a/lm_eval/tasks/mathvista/testmini.yaml +++ /dev/null @@ -1,17 +0,0 @@ -dataset_path: AI4Math/MathVista -task: mathvista_mcq -test_split: testmini -output_type: multiple_choice -process_docs: !function utils.process_docs -doc_to_image: !function utils.doc_to_image -doc_to_text: "{{query}}" -doc_to_choice: '{{ ["A", "B", "C", "D", "E", "F"][:choices.length] }}' -doc_to_target: "{{choices.index(answer)}}" -metric_list: - - metric: acc - aggregation: mean - higher_is_better: true -metadata: - version: 1.0 -dataset_kwargs: - trust_remote_code: true diff --git a/lm_eval/tasks/mathvista/utils.py b/lm_eval/tasks/mathvista/utils.py deleted file mode 100644 index 19c64035ea..0000000000 --- a/lm_eval/tasks/mathvista/utils.py +++ /dev/null @@ -1,5 +0,0 @@ -import datasets - - -def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: - return dataset.filter(lambda x: x["question_type"].strip() == "multi_choice") From a3bb2f15006ac113655ff52d13c6e6d3ac051c77 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:35:35 -0400 Subject: [PATCH 58/60] add some sample scores --- lm_eval/tasks/mmmu/README.md | 68 ++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md index 814e5344da..1c1c3215b5 100644 --- a/lm_eval/tasks/mmmu/README.md +++ b/lm_eval/tasks/mmmu/README.md @@ -64,6 +64,74 @@ Note: Some questions have multiple images in context. To control for this use `m The `mmmu_val` group implements MMMU using processing code [from the original MMMU authors](https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu) and uses the prompt format found in [the MMMU repository for Llava-1.5](https://github.com/MMMU-Benchmark/MMMU/blob/main/mmmu/configs/llava1.5.yaml). This implementation should give scores on par with or slightly higher than those reported by [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/mmmu) for `mmmu_val` and the MMMU repository code. 
+Scores on several tested models (**all with `--apply_chat_template`**) are: + +Qwen2-VL-2B: +``` +hf-multimodal (pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.3778|± |0.0155| +| - Art and Design | 0|none | |acc |↑ |0.5500|± |0.0415| +| - Business | 0|none | |acc |↑ |0.3600|± |0.0389| +| - Health and Medicine | 0|none | |acc |↑ |0.3667|± |0.0394| +| - Humanities and Social Science| 0|none | |acc |↑ |0.5167|± |0.0438| +| - Science | 0|none | |acc |↑ |0.2467|± |0.0352| +| - Tech and Engineering | 0|none | |acc |↑ |0.3143|± |0.0317| +``` + +Qwen2-VL-7B: +``` +hf-multimodal (pretrained=Qwen/Qwen2-VL-7B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.5056|± |0.0160| +| - Art and Design | 0|none | |acc |↑ |0.6917|± |0.0398| +| - Business | 0|none | |acc |↑ |0.4333|± |0.0406| +| - Health and Medicine | 0|none | |acc |↑ |0.5667|± |0.0401| +| - Humanities and Social Science| 0|none | |acc |↑ |0.6750|± |0.0426| +| - Science | 0|none | |acc |↑ |0.3800|± |0.0392| +| - Tech and Engineering | 0|none | |acc |↑ |0.4000|± |0.0341| +``` + +Idefics2-8B: + +``` +hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True,max_images=2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.4011|± |0.0154| +| - Art and Design | 0|none | |acc |↑ |0.6167|± |0.0436| +| - Business | 0|none | |acc |↑ |0.3200|± |0.0373| +| - Health and Medicine | 0|none | |acc |↑ |0.4000|± |0.0401| +| - Humanities and Social Science| 0|none | |acc |↑ |0.5750|± |0.0424| +| - Science | 0|none | |acc |↑ |0.2600|± |0.0358| +| - Tech and Engineering | 0|none | |acc |↑ |0.3381|± |0.0312| +``` + +Llava-v1.6-Mistral-7B: +``` +hf-multimodal (pretrained=llava-hf/llava-v1.6-mistral-7b-hf,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.3522|± |0.0151| +| - Art and Design | 0|none | |acc |↑ |0.5167|± |0.0440| +| - Business | 0|none | |acc |↑ |0.2667|± |0.0362| +| - Health and Medicine | 0|none | |acc |↑ |0.3867|± |0.0397| +| - Humanities and Social Science| 0|none | |acc |↑ |0.5917|± |0.0433| +| - Science | 0|none | |acc |↑ |0.2200|± |0.0342| +| - Tech and Engineering | 0|none | |acc |↑ |0.2524|± |0.0299| +``` + + ### Checklist From 5f76efd2ae467898b191675fd6d5a8f5fdfd8392 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:38:04 -0400 Subject: [PATCH 59/60] Update README.md --- lm_eval/tasks/mmmu/README.md | 7 +++++-- 
1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md index 1c1c3215b5..e9d0da12f6 100644 --- a/lm_eval/tasks/mmmu/README.md +++ b/lm_eval/tasks/mmmu/README.md @@ -81,6 +81,8 @@ hf-multimodal (pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_at | - Science | 0|none | |acc |↑ |0.2467|± |0.0352| | - Tech and Engineering | 0|none | |acc |↑ |0.3143|± |0.0317| ``` +Author-reported score: 41.1% + Qwen2-VL-7B: ``` @@ -97,9 +99,9 @@ hf-multimodal (pretrained=Qwen/Qwen2-VL-7B-Instruct,attn_implementation=flash_at | - Science | 0|none | |acc |↑ |0.3800|± |0.0392| | - Tech and Engineering | 0|none | |acc |↑ |0.4000|± |0.0341| ``` +Author-reported score: 54.1% Idefics2-8B: - ``` hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True,max_images=2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 ``` @@ -114,6 +116,7 @@ hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_at | - Science | 0|none | |acc |↑ |0.2600|± |0.0358| | - Tech and Engineering | 0|none | |acc |↑ |0.3381|± |0.0312| ``` +Author-reported score: ~43% Llava-v1.6-Mistral-7B: ``` @@ -130,7 +133,7 @@ hf-multimodal (pretrained=llava-hf/llava-v1.6-mistral-7b-hf,attn_implementation= | - Science | 0|none | |acc |↑ |0.2200|± |0.0342| | - Tech and Engineering | 0|none | |acc |↑ |0.2524|± |0.0299| ``` - +Author-reported score: 35.3% ### Checklist From b3e87aea979251aa567d5668504c6fdd544de056 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Fri, 13 Sep 2024 17:44:54 +0000 Subject: [PATCH 60/60] add log msg for image_string value --- lm_eval/models/hf_vlms.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index e14380bd4e..c94c76b26f 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -83,6 +83,9 @@ def __init__( f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!" ) else: + eval_logger.info( + f"A non-default image_token string with string value image_string='{image_string}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!" + ) self.image_token = image_string def _create_tokenizer(