diff --git a/README.md b/README.md
index e973cdb7e8..9f26c811d1 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@
 *Latest News 📣*

+- [2024/09] We are prototyping support for users of LM Evaluation Harness to create and evaluate on tasks with text+image multimodal input and text output, and have just added the `hf-multimodal` and `vllm-vlm` model types and the `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it, and suggest they also check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forked from lm-evaluation-harness, for a broader range of multimodal tasks, models, and features.
- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.**
- [2024/07] New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.
diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py
index 8a1a3bdbaf..8dadf94857 100644
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -75,6 +75,7 @@ class TaskConfig(dict):
    process_docs: Optional[Callable] = None
    doc_to_text: Optional[Union[Callable, str]] = None
    doc_to_target: Optional[Union[Callable, str]] = None
+   doc_to_image: Optional[Union[Callable, str]] = None
    doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
    process_results: Optional[Union[Callable, str]] = None
    use_prompt: Optional[str] = None
@@ -377,6 +378,10 @@ def doc_to_text(self, doc):
    def doc_to_target(self, doc):
        pass

+   # not an abstractmethod because not every language-only task has to implement this
+   def doc_to_image(self, doc):
+       raise NotImplementedError
+
    def build_all_requests(
        self,
        *,
@@ -735,6 +740,10 @@ def __init__(
        )
        self.OUTPUT_TYPE = self.config.output_type

+       if self.config.doc_to_image is not None:
+           # mark the task as requiring multimodality.
+           self.MULTIMODAL = True
+
        if self.config.dataset_path is not None:
            self.DATASET_PATH = self.config.dataset_path

@@ -1042,8 +1051,8 @@ def fewshot_context(
            Whether to apply the chat template to the fewshot context.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-       :param chat_template: Callable
-           Chat template to be applied to the fewshot context.
+       :param chat_template:
+           Callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
        :returns: str
            The fewshot context.
""" @@ -1279,9 +1288,34 @@ def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]: else: raise TypeError + def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]: + if doc_to_image is not None: + doc_to_image = doc_to_image + elif self.config.doc_to_image is not None: + doc_to_image = self.config.doc_to_image + else: + return None + + if isinstance(doc_to_image, list): + image_feature = [ + self.doc_to_image(doc, feature) for feature in doc_to_image + ] + return [feature for feature in image_feature if feature is not None] + elif isinstance(doc_to_image, str): + if doc_to_image in self.features: + return doc[doc_to_image] + else: + return ast.literal_eval(utils.apply_template(doc_to_image, doc)) + elif callable(doc_to_image): + return doc_to_image(doc) + else: + return None + def construct_requests( self, doc: dict, ctx: str, **kwargs ) -> Union[List[Instance], Instance]: + aux_arguments = None + if self.OUTPUT_TYPE == "loglikelihood": arguments = (ctx, self.doc_to_target(doc)) elif self.OUTPUT_TYPE == "loglikelihood_rolling": @@ -1299,6 +1333,37 @@ def construct_requests( # Otherwise they are placed in the continuation arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] + # TODO: we should raise a warning telling users this will at most ~2x runtime. + if "acc_mutual_info" in self._metric_fn_list.keys(): + # if we are calculating multiple choice accuracy + # using mutual information instead of raw loglikelihood as metric, need unconditional lls. + + # here mutual info refers to calculating + # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) + # in other words normalizing by subtracting the unconditional logprob of each choice. + aux_arguments = [("", f"{choice}") for choice in choices] + + arguments.extend(aux_arguments) + + elif self.OUTPUT_TYPE == "generate_until": + arguments = (ctx, deepcopy(self.config.generation_kwargs)) + + multimodal_arg = {} + if ( + self.config.doc_to_image + ): # TODO: ensure that non-multimodal tasks aren't getting visual args + multimodal_arg = { + **multimodal_arg, + **{"visual": self.doc_to_image(doc)}, + } + + if bool(multimodal_arg): + if isinstance(arguments, list): + arguments = [arg + (multimodal_arg,) for arg in arguments] + else: + arguments = arguments + (multimodal_arg,) + + if self.OUTPUT_TYPE == "multiple_choice": request_list = [ Instance( request_type="loglikelihood", @@ -1309,33 +1374,15 @@ def construct_requests( ) for i, arg in enumerate(arguments) ] - # TODO: we should raise a warning telling users this will at most ~2x runtime. - if "acc_mutual_info" in self._metric_fn_list.keys(): - # if we are calculating multiple choice accuracy - # using mutual information instead of raw loglikelihood as metric, need unconditional lls. - # here mutual info refers to calculating - # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) - # in other words normalizing by subtracting the unconditional logprob of each choice. 
- request_list.extend( - [ - Instance( - request_type="loglikelihood", - doc=doc, - arguments=("", "{}".format(choice)), - idx=i, - **kwargs, - ) - for i, choice in enumerate(choices) - ] - ) return request_list - elif self.OUTPUT_TYPE == "generate_until": - arguments = (ctx, deepcopy(self.config.generation_kwargs)) - return Instance( - request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs + request_type=self.OUTPUT_TYPE, + doc=doc, + arguments=arguments, + idx=0, + **kwargs, ) def process_results(self, doc, results): @@ -1547,7 +1594,7 @@ def __repr__(self): f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," - f"num_samples={len(self.eval_docs)})" + f"num_samples={len(self.eval_docs)})", ) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 52a8d00d74..4f4c8f7fba 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -414,8 +414,28 @@ def evaluate( for task_output in eval_tasks ): raise ValueError("log_samples must be True for 'bypass' metric-only tasks") + + # validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa. + incompatible_tasks = [] for task_output in eval_tasks: task: Task = task_output.task + + if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False): + incompatible_tasks.append(task_output.task_name) + if len(incompatible_tasks) > 0: + if not getattr(lm, "MULTIMODAL", False): + raise ValueError( + f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type." + ) + else: + raise ValueError( + f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks." + ) + # end multimodality validation check + + for task_output in eval_tasks: + task: Task = task_output.task + limit = get_sample_size(task, limit) task.build_all_requests( limit=limit, diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py index 0b2441eb87..553be3a5fd 100644 --- a/lm_eval/models/__init__.py +++ b/lm_eval/models/__init__.py @@ -3,6 +3,7 @@ api_models, dummy, gguf, + hf_vlms, huggingface, mamba_lm, nemo_lm, @@ -12,6 +13,7 @@ optimum_lm, textsynth, vllm_causallms, + vllm_vlms, ) diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py new file mode 100644 index 0000000000..c94c76b26f --- /dev/null +++ b/lm_eval/models/hf_vlms.py @@ -0,0 +1,719 @@ +import copy +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import transformers +from tqdm import tqdm +from transformers import BatchEncoding + +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.registry import register_model +from lm_eval.models.huggingface import HFLM +from lm_eval.models.utils import ( + Collator, + pad_and_concat, + replace_placeholders, + stop_sequences_criteria, +) + + +DEFAULT_IMAGE_PLACEHOLDER = "" + + +eval_logger = utils.eval_logger + + +@register_model("hf-multimodal") +class HFMultimodalLM(HFLM): + """ + An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics. 
+ """ + + AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq + MULTIMODAL = True # flag to indicate, for now, that this model type can run multimodal requests + + def __init__( + self, + pretrained: Union[str, transformers.PreTrainedModel], + image_token_id: Optional[int] = None, + image_string: Optional[str] = None, + interleave: bool = True, + # TODO: handle whitespace in image placeholder (replacement) + max_images: Optional[int] = 999, + convert_img_format=False, + **kwargs, + ): + # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer + # modify init behavior. + super().__init__(pretrained, **kwargs) + + assert ( + self.batch_size != "auto" + ), "Batch size 'auto' is not yet supported for hf-multimodal models." + self.chat_applied: bool = False + # TODO: phi-3.5 "image placeholders" are , , ... in order. how to handle this case + + # HF AutoModelForVision2Seq models have an `image_token_id` value in their configs + # denoting the token which indicates a location where an image will be substituted in. + # This can take different string values across models, e.g. for Idefics2 and <|image_pad|> for Qwen2-VL + self.interleave = interleave + self.max_images = max_images + self.rgb = convert_img_format + # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors! + if not image_string: + self.image_token_id = ( + int(image_token_id) + if image_token_id + else ( + getattr(self.config, "image_token_id", None) + or getattr(self.config, "image_token_index", None) + ) + ) + assert ( + self.image_token_id is not None + ), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one." + # get the string this token ID corresponds to + self.image_token = self.tok_decode( + [self.image_token_id], skip_special_tokens=False + ) + if image_token_id is not None: + eval_logger.info( + f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!" + ) + else: + eval_logger.info( + f"A non-default image_token string with string value image_string='{image_string}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!" + ) + self.image_token = image_string + + def _create_tokenizer( + self, + pretrained: Union[str, transformers.PreTrainedModel], + tokenizer: Optional[ + Union[ + str, + transformers.ProcessorMixin, + ] + ], + revision: Optional[str] = "main", + trust_remote_code: Optional[bool] = False, + **kwargs, + ) -> None: + """ + Helper method during initialization. + + For the multimodal variant, we initialize not just + `self.tokenizer` but also `self.processor`. 
+ """ + + if tokenizer: + if isinstance(tokenizer, str): + return transformers.AutoProcessor.from_pretrained( + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + # use_fast=use_fast_tokenizer, + ) + else: + assert isinstance( + tokenizer, transformers.ProcessorMixin + ) # TODO: check this condition + return tokenizer + + # Get tokenizer based on 'pretrained' + if isinstance(pretrained, str): + model_name = pretrained + else: + # get the HF hub name via accessor on model + model_name = self.model.name_or_path + + self.processor = transformers.AutoProcessor.from_pretrained( + model_name, + revision=revision, + trust_remote_code=trust_remote_code, + # use_fast=use_fast_tokenizer, + ) + + self.tokenizer = self.processor.tokenizer + + def tok_multimodal_encode( + self, string, images, left_truncate_len=None, add_special_tokens=None + ): + """Helper function which encodes an image + string combo using AutoProcessor""" + # We inherit special token kwarg setup from HFLM.tok_encode + # special_tokens_kwargs = {} + + # by default for CausalLM - false or self.add_bos_token is set + # if add_special_tokens is None: + # special_tokens_kwargs = {"add_special_tokens": False or self.add_bos_token} + # otherwise the method explicitly defines the value + # else: + # special_tokens_kwargs = {"add_special_tokens": add_special_tokens} + + # encode text+images + # TODO: why does (Qwen2-VL) processor error when attempting to add special tokens to text? + encoding = self.processor( + text=string, images=images, return_tensors=None + ) # , **special_tokens_kwargs) + + # remove (and store) our tokenized text + text_encoding = encoding.pop("input_ids") + encoding.pop("attention_mask") + + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + text_encoding = text_encoding[-left_truncate_len:] + + return text_encoding, encoding # image_encoding is a dict + + def _encode_multimodal_pair(self, context, continuation, images): + """Helper function to perform the role of TemplateLM._encode_pair + Except allowing for image input to also be processed alongside `context`. + + This method is a bit messy due to the need to defer conversion of image and text token input + into PyTorch tensors until the main inference loop. + """ + + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + + # TODO: replace default placeholder with self.image_token, for contexts + + whole_enc, image_enc = self.tok_multimodal_encode( + context + continuation, images + ) + context_enc, _ = self.tok_multimodal_encode(context, images) + + # tok_multimodal_encode returns List[List[int]] for tokenized text. Get rid of the batch dim + # since we only are encoding a single string. 
+ # TODO: this is a bit hacky, it'd be nice to make this generally cleaner + whole_enc, context_enc = whole_enc[0], context_enc[0] + + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + + return context_enc, continuation_enc, image_enc + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + self.chat_applied = True + if not self.interleave: + for content in chat_history: + c = [] + text = content["content"] + + # Count and remove image placeholders + image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) + text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "") + + # Add image entries + for _ in range(image_count): + c.append({"type": "image", "image": None}) + + # Add single text entry at the end + c.append({"type": "text", "text": text}) + + content["content"] = c + else: + for content in chat_history: + c = [] + text = content["content"] + expected_image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) + actual_image_count = 0 + + text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER) + + for i, part in enumerate(text_parts): + # TODO: concatenate text parts (esp. if skipping images)? + if part: # Add non-empty text parts + c.append({"type": "text", "text": part}) + if ( + (i < len(text_parts) - 1) and i < self.max_images + ): # Add image placeholder after each split except the last + c.append({"type": "image"}) + actual_image_count += 1 + + content["content"] = c + + if actual_image_count != expected_image_count: + raise ValueError( + f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}" + ) + + return self.processor.apply_chat_template( + chat_history, add_generation_prompt=True + ) + + def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]: + if hasattr(self.processor, "apply_chat_template"): + _tokenizer = self.tokenizer + self.tokenizer = self.processor + + selected_template = super().chat_template(chat_template) + + self.tokenizer = _tokenizer + return selected_template + else: + return super().chat_template(chat_template) + + def tok_batch_multimodal_encode( + self, + strings: List[str], # note that input signature of this fn is different + images: List[List], # TODO: images are pil.Image at the moment, update typehint + padding_side: str = "left", + left_truncate_len: int = None, + truncation: bool = False, + ) -> Union[ + BatchEncoding, Dict[str, torch.Tensor] + ]: # note that this return signature differs from HFLM tok_batch_encode. + # NOTE: here, we replace tags with our model's corresponding image_token string value. + if not self.chat_applied: + # TODO: This still keeps the whitespace in the image placeholder, which is not ideal. + strings = [ + replace_placeholders( + string, DEFAULT_IMAGE_PLACEHOLDER, self.image_token, self.max_images + ) + for string in strings + ] + + # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. + old_padding_side = self.tokenizer.padding_side + self.tokenizer.padding_side = padding_side + + # add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + + images = [img[: self.max_images] for img in images] + if self.rgb: + images = [[img.convert("RGB") for img in sublist] for sublist in images] + + encoding = self.processor( + images=images, + text=strings, + truncation=truncation, + padding="longest", + return_tensors="pt", + # **add_special_tokens, # TODO: at least some Processors error out when passing this. 
How do we control whether text gets BOS added? + ) + + encoding.to( # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention + self.device, self.model.dtype + ) # TODO: This only casts the pixel values. Should they always be float16? + if left_truncate_len: + encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] + encoding["attention_mask"] = encoding["attention_mask"][ + :, -left_truncate_len: + ] + self.tokenizer.padding_side = old_padding_side + + return encoding + + def _model_multimodal_call(self, inps, imgs, attn_mask=None, labels=None): + """ + TODO: update docstring + """ + # note: imgs is a dict. + with torch.no_grad(): + return self.model(inps, **imgs).logits + + def _model_multimodal_generate(self, inputs, max_length, stop, **generation_kwargs): + generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0) + do_sample = generation_kwargs.get("do_sample", None) + + # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies + if generation_kwargs.get("temperature") == 0.0 and do_sample is None: + generation_kwargs["do_sample"] = do_sample = False + + if do_sample is False and generation_kwargs.get("temperature") == 0.0: + generation_kwargs.pop("temperature") + + stopping_criteria = stop_sequences_criteria( + self.tokenizer, + stop, + inputs["input_ids"].shape[1], + inputs["input_ids"].shape[0], + ) + return self.model.generate( + **inputs, + max_length=max_length, + stopping_criteria=stopping_criteria, + pad_token_id=self.tokenizer.pad_token_id, + use_cache=True, + **generation_kwargs, + ) + + def _batch_images(self, image_encs): + """ + Helper function: batch together image encodings across examples in a batch. + # TODO: for variable-sized images, this may break down. + """ + batched_imgs = {} + for key in image_encs[0].keys(): + batched_imgs[key] = torch.cat( + [ + torch.tensor( + image_enc[key], device=self.device, dtype=self.model.dtype + ) + for image_enc in image_encs + ], + dim=0, + ) + return batched_imgs + + def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: + raise NotImplementedError( + "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ", + "this is because we do not support measuring the loglikelihood a model assigns to an image.", + ) + + def loglikelihood( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[Tuple[float, bool]]: + raise NotImplementedError( + "'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!" + ) + + new_reqs = [] + for context, continuation, aux_arguments in [req.args for req in requests]: + if context == "": + raise ValueError( + "Must get non-empty context for multimodal requests! You might be trying to run 'loglikelihood_rolling', which is not supported in the multimodal case." 
+ ) + else: + visuals = aux_arguments["visual"] + + context_enc, continuation_enc, image_enc = self._encode_multimodal_pair( + context, continuation, visuals + ) + # TODO: key to pick for caching images + new_reqs.append( + ( + (context, continuation, visuals), + context_enc, + continuation_enc, + image_enc, + ) + ) + + return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm) + + def _loglikelihood_tokens( + self, + requests: List[ + Tuple[Tuple[None, str, str], List[int], List[int], List[int]] + ], # TODO: update typehint to be correct + disable_tqdm: bool = False, + override_bs: int = None, + ) -> List[Tuple[float, bool]]: + res = [] + + # TODO: **improve multimodal collation.** We currently ignore image size when ordering docs. ideally we'd take them into account + def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key to group and lookup one-token continuations""" + # Use with group_by="contexts" (optional)" + # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. + # speeds up some multiple-choice tasks proportionally to the number of choices. + # groups requests by context+continuation[:-1] and infer on one request/group. 
+ return req[-1] + req[-3] + req[-2][:-1] + + re_ord = Collator( + requests, + sort_fn=_collate, + group_by="contexts" # TODO: can't group-by just "contexts" any more, need to incorporate imgs + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + and self.logits_cache + else None, + group_fn=_lookup_one_token_cont, + ) + + # automatic (variable) batch size detection for vectorization + # pull longest context sample from request + n_reordered_requests = len(re_ord) + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else override_bs + if override_bs is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" + and n_reordered_requests > 0 + and not override_bs + else None + ) + + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests with text+image input", + ) + for chunk in chunks: + imgs = [] + inps = [] + cont_toks_list = [] + inplens = [] + + padding_len_inp = None + # because vectorizing is annoying, we first convert each (context, continuation) pair to padded + # tensors, then we pack them together into a batch, call the model, and then pick it all apart + # again because vectorizing is annoying + + for _, context_enc, continuation_enc, image_enc in chunk: + # sanity check + assert len(image_enc) > 0 + assert len(context_enc) > 0 + assert len(continuation_enc) > 0 + assert len(continuation_enc) <= self.max_length + + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + # when too long to fit in context, truncate from the left + # TODO: assuming that we won't handle enc-dec Vision2Seq models. Is that a safe assumption? + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1], + dtype=torch.long, + device=self.device, + ) + (inplen,) = inp.shape + + padding_len_inp = ( + max(padding_len_inp, inplen) + if padding_len_inp is not None + else inplen + ) + + inps.append(inp) # [1, inp_length] + cont_toks_list.append(continuation_enc) + inplens.append(inplen) + + imgs.append(image_enc) + + # create encoder attn mask and batched conts, if seq2seq + call_kwargs = {} + batched_inps = pad_and_concat( + padding_len_inp, inps, padding_side="right" + ) # [batch, padding_len_inp] + # batch our examples' image inputs together + batched_imgs = self._batch_images( + imgs + ) # TODO: fix/test for bs>1 case with differently-sized imgs! 
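+            # batched_imgs is a dict of stacked image-processor outputs (e.g. `pixel_values` for most HF processors),
+            # concatenated along the batch dimension and moved to the model's device and dtype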
+ + multi_logits = F.log_softmax( + self._model_multimodal_call(batched_inps, batched_imgs, **call_kwargs), + dim=-1, + ) # [batch, padding_length (inp or cont), vocab] + + for ( + request_str, + ctx_tokens, + _, + image_encs, + ), logits, inplen, cont_toks in zip( + chunk, multi_logits, inplens, cont_toks_list + ): + # Slice to original seq length + contlen = len(cont_toks) + # take only logits in the continuation + # (discard context toks if decoder-only ; discard right-padding) + # also discards + checks for "virtual tokens" in the causal LM's input window + # from prompt/prefix tuning tokens, if applicable + ctx_len = ( + inplen + (logits.shape[0] - padding_len_inp) + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + else None + ) + logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len) + logits = logits.unsqueeze(0) # [1, seq, vocab] + + # Check if per-token argmax is exactly equal to continuation + greedy_tokens = logits.argmax(dim=-1) + + # check for one-token continuation cache hits. + # noop in case group_by != "contexts" or no cache hit and returns the + # original args. Otherwise, expands the logits batch dimension and yields each + # batch along with matching continuation tokens and prompt strings. + # logits -> [1, seq, vocab] + for request_str, cont_toks, logits in re_ord.get_cache( + req_str=request_str, + cxt_toks=ctx_tokens, + cont_toks=cont_toks, + logits=logits, + ): + cont_toks = torch.tensor( + cont_toks, dtype=torch.long, device=self.device + ).unsqueeze(0) # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + + # Obtain log-probs at the corresponding continuation token indices + # last_token_slice = logits[:, -1, :].squeeze(0).tolist() + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) # [1, seq] + + # Answer: (log prob, is-exact-match) + answer = (float(logits.sum()), bool(max_equal)) + + res.append(answer) + + self.cache_hook.add_partial( + "loglikelihood", request_str, answer + ) # TODO: choose convention for adding images into the cache key + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + # TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs) + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running generate_until requests with text+image input", + ) + # TODO: port auto-batch sizing into this. + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. 
+ re_ords = Collator( + [reg.args for reg in requests], + _collate, + group_by="gen_kwargs", + group_fn=lambda x: x[1], + ) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + + ### Up to here: was identical to non-multimodal HFLM generate_until ### + + for chunk in chunks: + contexts, all_gen_kwargs, aux_arguments = zip(*chunk) + + visuals = [arg["visual"] for arg in aux_arguments] + + if not isinstance(contexts, list): + contexts = list( + contexts + ) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list. + # TODO: could we upstream this workaround to HF? + ### this part onward: same as HFLM ### + + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. + until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" + ) + # add EOS token to stop sequences + eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False) + if not until: + until = [eos] + else: + until.append(eos) + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + ### end stuff that's entirely copied verbatim from HFLM ### + + max_ctx_len = self.max_length - max_gen_toks + + inputs = self.tok_batch_multimodal_encode( + contexts, + visuals, + left_truncate_len=max_ctx_len, + truncation=self.truncation, + ) + + context_enc = inputs["input_ids"] + + if "max_length" not in kwargs: + kwargs["max_length"] = context_enc.shape[1] + max_gen_toks + + cont = self._model_multimodal_generate(inputs, stop=until, **kwargs) + + del inputs + torch.cuda.empty_cache() + import gc + + gc.collect() + + ### essentially same as HFLM beyond this line! + + cont_toks_list = cont.tolist() + for cont_toks, context in zip(cont_toks_list, contexts): + # discard context + left-padding toks if using causal decoder-only VLM + cont_toks = cont_toks[context_enc.shape[1] :] + + s = self.tok_decode(cont_toks) + + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore '' separator, + # for seq2seq case where self.tok_decode(self.eot_token_id) = '' + s = s.split(term)[0] + + res.append(s) + self.cache_hook.add_partial( + "generate_until", (context, gen_kwargs), s + ) # TODO: cache key for multimodal input should be what? + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 01a8cf6789..63ad1440bd 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -448,7 +448,16 @@ def _get_backend( Helper method during initialization. Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used. + sets `self.AUTO_MODEL_CLASS` appropriately if not already set. """ + # escape hatch: if we're using a subclass that shouldn't follow + # the default _get_backend logic, + # then skip over the method. 
+ # TODO: this seems very much undesirable in some cases--our code in HFLM + # references AutoModelForCausalLM at times to check for equality + if self.AUTO_MODEL_CLASS is not None: + return + assert backend in ["default", "causal", "seq2seq"] if backend != "default": diff --git a/lm_eval/models/utils.py b/lm_eval/models/utils.py index 8a81e5deca..72fac03811 100644 --- a/lm_eval/models/utils.py +++ b/lm_eval/models/utils.py @@ -664,3 +664,37 @@ def configure_pad_token( tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) return tokenizer + + +def replace_placeholders( + string: str, default_placeholder: str, image_token: str, max_images: int +): + """ + A utility function used for local multimodal models. It locates all `placeholder` string + occurrences in the given input `string_` and replaces the first `max_count` instances with + `replacement`, and all subsequent occurrences with the empty string. + + This is used to replace placeholder tags by model-specific image tokens like <|image_pad|> + and to allow for only the first `max_count` images to be passed to a model if desired. + + :param string: The original string containing placeholders. + :param default_placeholder: The placeholder text to be replaced. + :param image_token: The token to replace the placeholder with. + :param max_images: The maximum number of replacements to make. + :return: The string with placeholders replaced. + """ + count = 0 + result = [] + + parts = string.split(default_placeholder) + for part in parts[:-1]: # Iterate through all but the last part + result.append(part) + if count < max_images: + result.append(image_token) + count += 1 + elif default_placeholder != image_token: + result.append(default_placeholder) + + # Add the last part of the string + result.append(parts[-1]) + return "".join(result) diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py new file mode 100644 index 0000000000..5a1260901c --- /dev/null +++ b/lm_eval/models/vllm_vlms.py @@ -0,0 +1,279 @@ +import copy +from typing import Dict, List, Optional + +import transformers +from more_itertools import distribute +from tqdm import tqdm + +from lm_eval.api.instance import Instance +from lm_eval.api.registry import register_model +from lm_eval.models.utils import Collator, undistribute +from lm_eval.models.vllm_causallms import VLLM +from lm_eval.utils import simple_parse_args_string + + +try: + import ray + from vllm import LLM, SamplingParams + from vllm.lora.request import LoRARequest # noqa: F401 + from vllm.transformers_utils.tokenizer import get_tokenizer # noqa: F401 +except ModuleNotFoundError: + pass + + +DEFAULT_IMAGE_PLACEHOLDER = "" + + +@register_model("vllm-vlm") +class VLLM_VLM(VLLM): + MULTIMODAL = True + + def __init__( + self, + pretrained: str, + trust_remote_code: Optional[bool] = False, + revision: Optional[str] = None, + interleave: bool = True, + # TODO: handle max_images and limit_mm_per_prompt better + max_images: int = 999, + limit_mm_per_prompt: str = "image=1", + **kwargs, + ): + kwargs["limit_mm_per_prompt"] = simple_parse_args_string(limit_mm_per_prompt) + super().__init__( + pretrained=pretrained, + trust_remote_code=trust_remote_code, + revision=revision, + **kwargs, + ) + self.interleave = interleave + self.max_images = max_images + self.processor = transformers.AutoProcessor.from_pretrained( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + ) + self.chat_applied: bool = False + + def tok_batch_multimodal_encode( + self, + strings: List[str], # note that input 
signature of this fn is different + images, # TODO: typehint on this + left_truncate_len: int = None, + truncation: bool = False, + ): + images = [img[: self.max_images] for img in images] + + outputs = [] + for x, i in zip(strings, images): + inputs = { + "prompt": x, + "multi_modal_data": {"image": i}, + } + outputs.append(inputs) + return outputs + + def _model_generate( + self, + requests: List[List[dict]] = None, + generate: bool = False, + max_tokens: int = None, + stop: Optional[List[str]] = None, + **kwargs, + ): + if generate: + kwargs = self.modify_gen_kwargs(kwargs) + sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs) + else: + sampling_params = SamplingParams( + temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False + ) + if self.data_parallel_size > 1: + # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote + # also seems to only work with decorator and not with ray.remote() fn + # see https://github.com/vllm-project/vllm/issues/973 + # note: this has changed on 0.3.3, and it only works now if num_gpus are set. + # but then tensor_parallel breaks + @ray.remote + def run_inference_one_model( + model_args: dict, sampling_params, requests: List[List[dict]] + ): + llm = LLM(**model_args) + return llm.generate(requests, sampling_params=sampling_params) + + # dispatch requests to all self.data_parallel_size workers, in interleaved fashion + # interleaved important to balance context lengths across workers + requests = [list(x) for x in distribute(self.data_parallel_size, requests)] + inputs = ((self.model_args, sampling_params, req) for req in requests) + object_refs = [run_inference_one_model.remote(*x) for x in inputs] + results = ray.get(object_refs) + # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required. + ray.shutdown() + # flatten results + return undistribute(results) + + if self.lora_request is not None: + outputs = self.model.generate( + requests, + sampling_params=sampling_params, + use_tqdm=True if self.batch_size == "auto" else False, + lora_request=self.lora_request, + ) + else: + outputs = self.model.generate( + requests, + sampling_params=sampling_params, + use_tqdm=True if self.batch_size == "auto" else False, + ) + return outputs + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + self.chat_applied = True + if not self.interleave: + for content in chat_history: + c = [] + text = content["content"] + + # Count and remove image placeholders + image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) + text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "") + + # Add image entries + for _ in range(image_count): + c.append({"type": "image", "image": None}) + + # Add single text entry at the end + c.append({"type": "text", "text": text}) + + content["content"] = c + else: + for content in chat_history: + c = [] + text = content["content"] + expected_image_count = min( + self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER) + ) + actual_image_count = 0 + + text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER) + + for i, part in enumerate(text_parts): + # TODO: concatenate text parts (esp. if skipping images)? 
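+                    # e.g. (hypothetical) "Look at [img] and [img]." (where [img] stands for DEFAULT_IMAGE_PLACEHOLDER) becomes
+                    # [text "Look at ", image, text " and ", image, text "."] under this interleaving scheme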
+ if part: # Add non-empty text parts + c.append({"type": "text", "text": part}) + if ( + (i < len(text_parts) - 1) and i < self.max_images + ): # Add image placeholder after each split except the last + c.append({"type": "image"}) + actual_image_count += 1 + + content["content"] = c + + if actual_image_count != expected_image_count: + raise ValueError( + f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}" + ) + + return self.processor.apply_chat_template( + chat_history, add_generation_prompt=True + ) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + # TODO: support text-only reqs + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running generate_until requests with text+image input", + ) + # TODO: port auto-batch sizing into this. + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = Collator( + [reg.args for reg in requests], + _collate, + group_by="gen_kwargs", + group_fn=lambda x: x[1], + ) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + + for chunk in chunks: + contexts, all_gen_kwargs, aux_arguments = zip(*chunk) + + visuals = [arg["visual"] for arg in aux_arguments] + + if not isinstance(contexts, list): + contexts = list( + contexts + ) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list. + # TODO: could we upstream this workaround to HF? + + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. 
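+            # gen_kwargs comes from the task's generation_kwargs config (e.g. until, max_gen_toks, temperature, do_sample)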
+ until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" + ) + # add EOS token to stop sequences + eos = self.tokenizer.decode(self.eot_token_id) + if not until: + until = [eos] + else: + until.append(eos) + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + max_ctx_len = self.max_length - max_gen_toks + + inputs = self.tok_batch_multimodal_encode( + contexts, + visuals, + left_truncate_len=max_ctx_len, + ) + + cont = self._model_generate(inputs, stop=until, generate=True, **kwargs) + + for output, context in zip(cont, contexts): + generated_text = output.outputs[0].text + res.append(generated_text) + self.cache_hook.add_partial( + "generate_until", (context, gen_kwargs), generated_text + ) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md new file mode 100644 index 0000000000..e9d0da12f6 --- /dev/null +++ b/lm_eval/tasks/mmmu/README.md @@ -0,0 +1,150 @@ +# MMMU Benchmark + +### Paper + +Title: `MMMU: A Massive Multi-discipline MultimodalUnderstanding and Reasoning Benchmark for Expert AGI` + +Abstract: `MMMU is a new benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning.` + +`The benchmark is composed of 30 tasks, for a total of 900 mixed image+text examples (some with multiple images in context)` + +Homepage: `https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu` + +Note: Some questions have multiple images in context. To control for this use `max_images=N` in model init. + +### Citation + +``` +@inproceedings{yue2023mmmu, + title={MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI}, + author={Xiang Yue and Yuansheng Ni and Kai Zhang and Tianyu Zheng and Ruoqi Liu and Ge Zhang and Samuel Stevens and Dongfu Jiang and Weiming Ren and Yuxuan Sun and Cong Wei and Botao Yu and Ruibin Yuan and Renliang Sun and Ming Yin and Boyuan Zheng and Zhenzhu Yang and Yibo Liu and Wenhao Huang and Huan Sun and Yu Su and Wenhu Chen}, + booktitle={Proceedings of CVPR}, + year={2024}, + } +``` + +### Groups, Tags, and Tasks + +#### Groups + +* `mmmu_val` +* `mmmu_val_art_and_design` +* `mmmu_val_business` +* `mmmu_val_health_and_medicine` +* `mmmu_val_humanities_and_social_science` +* `mmmu_val_science` +* `mmmu_val_tech_and_engineering` + +#### Tags + + +#### Tasks + +* `mmmu_val_accounting` +* `mmmu_val_agriculture` +* `mmmu_val_architecture_and_engineering.yaml` +* `mmmu_val_art` +* `mmmu_val_art_theory` +* `mmmu_val_basic_medical_science` +* `mmmu_val_biology` +* `mmmu_val_chemistry` +* `mmmu_val_computer_science` +* `mmmu_val_clinical_medicine` +* `mmmu_val_design` +* `mmmu_val_diagnostics_and_laboratory_medicine` +* `mmmu_val_electronics` +* `mmmu_val_energy_and_power` +* `mmmu_val_economics` +* `mmmu_val_finance` +* `mmmu_val_geography` +* `mmmu_val_history` +* ... 
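+#### Example usage
+
+A minimal invocation sketch for running the `mmmu_val` group end-to-end (the model and model arguments below are illustrative; see the result tables further down for the exact settings used in testing):
+
+```
+lm_eval --model hf-multimodal \
+    --model_args pretrained=Qwen/Qwen2-VL-2B-Instruct,convert_img_format=True \
+    --tasks mmmu_val \
+    --apply_chat_template \
+    --batch_size 2
+```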
+ +### Variants + +The `mmmu_val` group implements MMMU using processing code [from the original MMMU authors](https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu) and uses the prompt format found in [the MMMU repository for Llava-1.5](https://github.com/MMMU-Benchmark/MMMU/blob/main/mmmu/configs/llava1.5.yaml). This implementation should give scores on par with or slightly higher than those reported by [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/mmmu) for `mmmu_val` and the MMMU repository code. + +Scores on several tested models (**all with `--apply_chat_template`**) are: + +Qwen2-VL-2B: +``` +hf-multimodal (pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.3778|± |0.0155| +| - Art and Design | 0|none | |acc |↑ |0.5500|± |0.0415| +| - Business | 0|none | |acc |↑ |0.3600|± |0.0389| +| - Health and Medicine | 0|none | |acc |↑ |0.3667|± |0.0394| +| - Humanities and Social Science| 0|none | |acc |↑ |0.5167|± |0.0438| +| - Science | 0|none | |acc |↑ |0.2467|± |0.0352| +| - Tech and Engineering | 0|none | |acc |↑ |0.3143|± |0.0317| +``` +Author-reported score: 41.1% + + +Qwen2-VL-7B: +``` +hf-multimodal (pretrained=Qwen/Qwen2-VL-7B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.5056|± |0.0160| +| - Art and Design | 0|none | |acc |↑ |0.6917|± |0.0398| +| - Business | 0|none | |acc |↑ |0.4333|± |0.0406| +| - Health and Medicine | 0|none | |acc |↑ |0.5667|± |0.0401| +| - Humanities and Social Science| 0|none | |acc |↑ |0.6750|± |0.0426| +| - Science | 0|none | |acc |↑ |0.3800|± |0.0392| +| - Tech and Engineering | 0|none | |acc |↑ |0.4000|± |0.0341| +``` +Author-reported score: 54.1% + +Idefics2-8B: +``` +hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True,max_images=2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.4011|± |0.0154| +| - Art and Design | 0|none | |acc |↑ |0.6167|± |0.0436| +| - Business | 0|none | |acc |↑ |0.3200|± |0.0373| +| - Health and Medicine | 0|none | |acc |↑ |0.4000|± |0.0401| +| - Humanities and Social Science| 0|none | |acc |↑ |0.5750|± |0.0424| +| - Science | 0|none | |acc |↑ |0.2600|± |0.0358| +| - Tech and Engineering | 0|none | |acc |↑ |0.3381|± |0.0312| +``` +Author-reported score: ~43% + +Llava-v1.6-Mistral-7B: +``` +hf-multimodal (pretrained=llava-hf/llava-v1.6-mistral-7b-hf,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +``` +``` +| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr| +|--------------------------------|------:|------|------|------|---|-----:|---|-----:| +|mmmu_val | 0|none | |acc |↑ |0.3522|± |0.0151| +| - Art and 
Design | 0|none | |acc |↑ |0.5167|± |0.0440| +| - Business | 0|none | |acc |↑ |0.2667|± |0.0362| +| - Health and Medicine | 0|none | |acc |↑ |0.3867|± |0.0397| +| - Humanities and Social Science| 0|none | |acc |↑ |0.5917|± |0.0433| +| - Science | 0|none | |acc |↑ |0.2200|± |0.0342| +| - Tech and Engineering | 0|none | |acc |↑ |0.2524|± |0.0299| +``` +Author-reported score: 35.3% + + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/mmmu/_art_and_design.yaml b/lm_eval/tasks/mmmu/_art_and_design.yaml new file mode 100644 index 0000000000..b0dda1876f --- /dev/null +++ b/lm_eval/tasks/mmmu/_art_and_design.yaml @@ -0,0 +1,13 @@ +group: mmmu_val_art_and_design +group_alias: Art and Design +task: + - mmmu_val_art + - mmmu_val_art_theory + - mmmu_val_design + - mmmu_val_music +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_business.yaml b/lm_eval/tasks/mmmu/_business.yaml new file mode 100644 index 0000000000..497948e5e5 --- /dev/null +++ b/lm_eval/tasks/mmmu/_business.yaml @@ -0,0 +1,14 @@ +group: mmmu_val_business +group_alias: Business +task: + - mmmu_val_accounting + - mmmu_val_economics + - mmmu_val_finance + - mmmu_val_manage + - mmmu_val_marketing +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_health_and_medicine.yaml b/lm_eval/tasks/mmmu/_health_and_medicine.yaml new file mode 100644 index 0000000000..04709d3d46 --- /dev/null +++ b/lm_eval/tasks/mmmu/_health_and_medicine.yaml @@ -0,0 +1,14 @@ +group: mmmu_val_health_and_medicine +group_alias: Health and Medicine +task: + - mmmu_val_basic_medical_science + - mmmu_val_clinical_medicine + - mmmu_val_diagnostics_and_laboratory_medicine + - mmmu_val_pharmacy + - mmmu_val_public_health +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml b/lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml new file mode 100644 index 0000000000..b8d70a9ea3 --- /dev/null +++ b/lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml @@ -0,0 +1,13 @@ +group: mmmu_val_humanities_and_social_science +group_alias: Humanities and Social Science +task: + - mmmu_val_history + - mmmu_val_literature + - mmmu_val_sociology + - mmmu_val_psychology +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/mmmu/_mmmu.yaml b/lm_eval/tasks/mmmu/_mmmu.yaml new file mode 100644 index 0000000000..6bcd234bbc --- /dev/null +++ b/lm_eval/tasks/mmmu/_mmmu.yaml @@ -0,0 +1,14 @@ +group: mmmu_val +task: + - mmmu_val_art_and_design + - mmmu_val_business + - mmmu_val_health_and_medicine + - mmmu_val_humanities_and_social_science + - mmmu_val_science 
+  - mmmu_val_tech_and_engineering
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/mmmu/_science.yaml b/lm_eval/tasks/mmmu/_science.yaml
new file mode 100644
index 0000000000..9b41585a65
--- /dev/null
+++ b/lm_eval/tasks/mmmu/_science.yaml
@@ -0,0 +1,14 @@
+group: mmmu_val_science
+group_alias: Science
+task:
+  - mmmu_val_biology
+  - mmmu_val_chemistry
+  - mmmu_val_geography
+  - mmmu_val_math
+  - mmmu_val_physics
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/mmmu/_tech_and_engineering.yaml b/lm_eval/tasks/mmmu/_tech_and_engineering.yaml
new file mode 100644
index 0000000000..956f9906b3
--- /dev/null
+++ b/lm_eval/tasks/mmmu/_tech_and_engineering.yaml
@@ -0,0 +1,16 @@
+group: mmmu_val_tech_and_engineering
+group_alias: Tech and Engineering
+task:
+  - mmmu_val_agriculture
+  - mmmu_val_architecture_and_engineering
+  - mmmu_val_computer_science
+  - mmmu_val_electronics
+  - mmmu_val_energy_and_power
+  - mmmu_val_materials
+  - mmmu_val_mechanical_engineering
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/mmmu/_template_yaml b/lm_eval/tasks/mmmu/_template_yaml
new file mode 100644
index 0000000000..f92ce60d6b
--- /dev/null
+++ b/lm_eval/tasks/mmmu/_template_yaml
@@ -0,0 +1,19 @@
+dataset_path: MMMU/MMMU
+validation_split: validation
+output_type: generate_until
+doc_to_image: !function utils.doc_to_image
+doc_to_text: !function utils.doc_to_text
+doc_to_target: "answer"
+process_results: !function utils.process_results
+generation_kwargs:
+  until:
+    - "<|endoftext|>"
+  temperature: 0.0
+  do_sample: false
+  max_gen_toks: 512
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
diff --git a/lm_eval/tasks/mmmu/mmmu_accounting.yaml b/lm_eval/tasks/mmmu/mmmu_accounting.yaml
new file mode 100644
index 0000000000..3c4aa41d9c
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_accounting.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_accounting
+include: _template_yaml
+task_alias: Accounting
+dataset_name: Accounting
diff --git a/lm_eval/tasks/mmmu/mmmu_agriculture.yaml b/lm_eval/tasks/mmmu/mmmu_agriculture.yaml
new file mode 100644
index 0000000000..387f33f7a9
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_agriculture.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_agriculture
+include: _template_yaml
+task_alias: Agriculture
+dataset_name: Agriculture
diff --git a/lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml b/lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml
new file mode 100644
index 0000000000..829ee3a8f8
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_architecture_and_engineering
+include: _template_yaml
+task_alias: Architecture and Engineering
+dataset_name: Architecture_and_Engineering
diff --git a/lm_eval/tasks/mmmu/mmmu_art.yaml b/lm_eval/tasks/mmmu/mmmu_art.yaml
new file mode 100644
index 0000000000..ff812ab3c1
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_art.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_art
+include: _template_yaml
+task_alias: Art
+dataset_name: Art
diff --git a/lm_eval/tasks/mmmu/mmmu_art_theory.yaml b/lm_eval/tasks/mmmu/mmmu_art_theory.yaml
new file mode 100644
index 0000000000..d6331871a5
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_art_theory.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_art_theory
+include: _template_yaml
+task_alias: Art Theory
+dataset_name: Art_Theory
diff --git a/lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml b/lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml
new file mode 100644
index 0000000000..d7486f1ee3
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_basic_medical_science
+include: _template_yaml
+task_alias: Basic Medical Science
+dataset_name: Basic_Medical_Science
diff --git a/lm_eval/tasks/mmmu/mmmu_biology.yaml b/lm_eval/tasks/mmmu/mmmu_biology.yaml
new file mode 100644
index 0000000000..49d36f380f
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_biology.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_biology
+include: _template_yaml
+task_alias: Biology
+dataset_name: Biology
diff --git a/lm_eval/tasks/mmmu/mmmu_chemistry.yaml b/lm_eval/tasks/mmmu/mmmu_chemistry.yaml
new file mode 100644
index 0000000000..cb096531c2
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_chemistry.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_chemistry
+include: _template_yaml
+task_alias: Chemistry
+dataset_name: Chemistry
diff --git a/lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml b/lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml
new file mode 100644
index 0000000000..a0833d165f
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_clinical_medicine
+include: _template_yaml
+task_alias: Clinical Medicine
+dataset_name: Clinical_Medicine
diff --git a/lm_eval/tasks/mmmu/mmmu_computer_science.yaml b/lm_eval/tasks/mmmu/mmmu_computer_science.yaml
new file mode 100644
index 0000000000..53c91a61b3
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_computer_science.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_computer_science
+include: _template_yaml
+task_alias: Computer Science
+dataset_name: Computer_Science
diff --git a/lm_eval/tasks/mmmu/mmmu_design.yaml b/lm_eval/tasks/mmmu/mmmu_design.yaml
new file mode 100644
index 0000000000..0367dd65ee
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_design.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_design
+include: _template_yaml
+task_alias: Design
+dataset_name: Design
diff --git a/lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml b/lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml
new file mode 100644
index 0000000000..2ec1b7dca6
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_diagnostics_and_laboratory_medicine
+include: _template_yaml
+task_alias: Diagnostics and Laboratory Medicine
+dataset_name: Diagnostics_and_Laboratory_Medicine
diff --git a/lm_eval/tasks/mmmu/mmmu_economics.yaml b/lm_eval/tasks/mmmu/mmmu_economics.yaml
new file mode 100644
index 0000000000..3d6465da74
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_economics.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_economics
+include: _template_yaml
+task_alias: Economics
+dataset_name: Economics
diff --git a/lm_eval/tasks/mmmu/mmmu_electronics.yaml b/lm_eval/tasks/mmmu/mmmu_electronics.yaml
new file mode 100644
index 0000000000..02df22f8a3
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_electronics.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_electronics
+include: _template_yaml
+task_alias: Electronics
+dataset_name: Electronics
diff --git a/lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml b/lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml
new file mode 100644
index 0000000000..add24ebed7
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_energy_and_power
+include: _template_yaml
+task_alias: Energy and Power
+dataset_name: Energy_and_Power
diff --git a/lm_eval/tasks/mmmu/mmmu_finance.yaml b/lm_eval/tasks/mmmu/mmmu_finance.yaml
new file mode 100644
index 0000000000..c6b6f042c1
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_finance.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_finance
+include: _template_yaml
+task_alias: Finance
+dataset_name: Finance
diff --git a/lm_eval/tasks/mmmu/mmmu_geography.yaml b/lm_eval/tasks/mmmu/mmmu_geography.yaml
new file mode 100644
index 0000000000..b5f9022136
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_geography.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_geography
+include: _template_yaml
+task_alias: Geography
+dataset_name: Geography
diff --git a/lm_eval/tasks/mmmu/mmmu_history.yaml b/lm_eval/tasks/mmmu/mmmu_history.yaml
new file mode 100644
index 0000000000..435388fd74
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_history.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_history
+include: _template_yaml
+task_alias: History
+dataset_name: History
diff --git a/lm_eval/tasks/mmmu/mmmu_literature.yaml b/lm_eval/tasks/mmmu/mmmu_literature.yaml
new file mode 100644
index 0000000000..9e0a1ba503
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_literature.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_literature
+include: _template_yaml
+task_alias: Literature
+dataset_name: Literature
diff --git a/lm_eval/tasks/mmmu/mmmu_manage.yaml b/lm_eval/tasks/mmmu/mmmu_manage.yaml
new file mode 100644
index 0000000000..bd19924114
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_manage.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_manage
+include: _template_yaml
+task_alias: Manage
+dataset_name: Manage
diff --git a/lm_eval/tasks/mmmu/mmmu_marketing.yaml b/lm_eval/tasks/mmmu/mmmu_marketing.yaml
new file mode 100644
index 0000000000..53848171be
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_marketing.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_marketing
+include: _template_yaml
+task_alias: Marketing
+dataset_name: Marketing
diff --git a/lm_eval/tasks/mmmu/mmmu_materials.yaml b/lm_eval/tasks/mmmu/mmmu_materials.yaml
new file mode 100644
index 0000000000..ee46944316
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_materials.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_materials
+include: _template_yaml
+task_alias: Materials
+dataset_name: Materials
diff --git a/lm_eval/tasks/mmmu/mmmu_math.yaml b/lm_eval/tasks/mmmu/mmmu_math.yaml
new file mode 100644
index 0000000000..e6817689ab
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_math.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_math
+include: _template_yaml
+task_alias: Math
+dataset_name: Math
diff --git a/lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml b/lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml
new file mode 100644
index 0000000000..4b9d95be85
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_mechanical_engineering
+include: _template_yaml
+task_alias: Mechanical Engineering
+dataset_name: Mechanical_Engineering
diff --git a/lm_eval/tasks/mmmu/mmmu_music.yaml b/lm_eval/tasks/mmmu/mmmu_music.yaml
new file mode 100644
index 0000000000..cf4032dea3
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_music.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_music
+include: _template_yaml
+task_alias: Music
+dataset_name: Music
diff --git a/lm_eval/tasks/mmmu/mmmu_pharmacy.yaml b/lm_eval/tasks/mmmu/mmmu_pharmacy.yaml
new file mode 100644
index 0000000000..04a4f1ff67
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_pharmacy.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_pharmacy
+include: _template_yaml
+task_alias: Pharmacy
+dataset_name: Pharmacy
diff --git a/lm_eval/tasks/mmmu/mmmu_physics.yaml b/lm_eval/tasks/mmmu/mmmu_physics.yaml
new file mode 100644
index 0000000000..53371cad73
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_physics.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_physics
+include: _template_yaml
+task_alias: Physics
+dataset_name: Physics
diff --git a/lm_eval/tasks/mmmu/mmmu_psychology.yaml b/lm_eval/tasks/mmmu/mmmu_psychology.yaml
new file mode 100644
index 0000000000..8471d4a18f
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_psychology.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_psychology
+include: _template_yaml
+task_alias: Psychology
+dataset_name: Psychology
diff --git a/lm_eval/tasks/mmmu/mmmu_public_health.yaml b/lm_eval/tasks/mmmu/mmmu_public_health.yaml
new file mode 100644
index 0000000000..aefc0590a3
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_public_health.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_public_health
+include: _template_yaml
+task_alias: Public Health
+dataset_name: Public_Health
diff --git a/lm_eval/tasks/mmmu/mmmu_sociology.yaml b/lm_eval/tasks/mmmu/mmmu_sociology.yaml
new file mode 100644
index 0000000000..5da664a8e7
--- /dev/null
+++ b/lm_eval/tasks/mmmu/mmmu_sociology.yaml
@@ -0,0 +1,4 @@
+task: mmmu_val_sociology
+include: _template_yaml
+task_alias: Sociology
+dataset_name: Sociology
diff --git a/lm_eval/tasks/mmmu/utils.py b/lm_eval/tasks/mmmu/utils.py
new file mode 100644
index 0000000000..87a3022098
--- /dev/null
+++ b/lm_eval/tasks/mmmu/utils.py
@@ -0,0 +1,341 @@
+import ast
+import random
+import re
+
+import numpy as np
+
+
+random.seed(42)
+
+
+# source for prompt fstrings: https://github.com/MMMU-Benchmark/MMMU/blob/7787d60648c82a9d40acd656fa541a6c74f58995/eval/configs/llava1.5.yaml#L3
+MULTI_CHOICE_EXAMPLE_FORMAT = """{}
+
+{}
+
+Answer with the option's letter from the given choices directly."""
+
+
+SHORT_ANS_EXAMPLE_FORMAT = """{}
+
+Answer the question using a single word or phrase."""
+
+START_CHR = "A"
+
+
+def doc_to_image(doc):
+    # get formatted prompt (incl. multi-choice options) prior to <image> reformatting
+    input_text = _doc_to_text(doc)
+    # locate <image N> placeholder instances in the input
+    image_placeholders = [
+        img.replace(" ", "_").replace("<", "").replace(">", "")
+        for img in re.findall(r"<image \d+>", input_text)
+    ]
+
+    # collect visuals (can have dupes of a given image or be out of order)
+    # E.g. validation_Math_19 contains <image 1> and <image 2>, but placeholders may repeat or appear out of order
+    visuals = [doc[img] for img in image_placeholders]
+
+    return visuals
+
+
+def doc_to_text(doc):
+    """Get the prompt for a given document."""
+
+    prompt = _doc_to_text(doc)
+
+    for i in range(1, 8):
+        # replace <image {i}> with <image>. TODO: check this is always the right decision incl. for non-HF models
+        prompt = prompt.replace(f"<image {i}>", "<image>")
+
+    return prompt
+
+
+def _doc_to_text(doc):
+    """Helper--get the prompt for a given document but DO NOT yet replace <image N> with <image>."""
+
+    if doc["question_type"] == "multiple-choice":
+        choices_str = ""
+
+        for i, choice in enumerate(ast.literal_eval(doc["options"])):
+            # add (A) {choice1}\n , (B) {choice2}\n , and so on
+            # to create the list of formatted choices in the prompt
+            choices_str += f"\n({chr(ord(START_CHR) + i)}) {choice}"
+
+        choices_str = (
+            choices_str.lstrip()
+        )  # remove the extraneous prepended \n that we added
+
+        prompt = MULTI_CHOICE_EXAMPLE_FORMAT.format(doc["question"], choices_str)
+    else:
+        prompt = SHORT_ANS_EXAMPLE_FORMAT.format(doc["question"])
+
+    return prompt
+
+
+def process_results(doc, results):
+    if doc["question_type"] == "multiple-choice":
+        # multichoice logic
+        option_strs = ast.literal_eval(doc["options"])
+        option_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I"]
+
+        all_choices = option_letters[: len(option_strs)]
+        index2ans = {index: ans for index, ans in zip(option_letters, option_strs)}
+
+        pred = parse_multi_choice_response(results[0], all_choices, index2ans)
+        # print(pred, all_choices, index2ans)
+        is_correct = eval_multi_choice(doc["answer"], pred)
+    else:
+        # freeform response handling
+        pred = parse_open_response(results[0])
+        is_correct = eval_open(doc["answer"], pred)
+
+    return {"acc": float(is_correct)}
+
+    # TODO: it would be better if we could use a Filter for this logic.
+
+
+### Output parsing and answer selection taken from
+### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/data_utils.py
+### and
+### https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/utils/eval_utils.py
+
+
+# ----------- Process Multi-choice -------------
+def parse_multi_choice_response(response, all_choices, index2ans):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted index e.g., A, B, C, D.
+    """
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " "  # add space to avoid partial match
+
+    index_ans = True
+    ans_with_brack = False
+    candidates = []
+    for choice in all_choices:  # e.g., (A) (B) (C) (D)
+        if f"({choice})" in response:
+            candidates.append(choice)
+            ans_with_brack = True
+
+    if len(candidates) == 0:
+        for choice in all_choices:  # e.g., A B C D
+            if f" {choice} " in response:
+                candidates.append(choice)
+
+    # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False  # it's content ans.
+
+    if len(candidates) == 0:  # still not get answer, randomly choose one.
+        pred_index = random.choice(all_choices)
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)  # -1 will be ignored anyway
+                # start_indexes = [generated_response.index(f'({can})') for can in candidates]
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        # get the last one
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:  # if only one candidate, use it.
+        pred_index = candidates[0]
+
+    # print(response, all_choices, index2ans, pred_index)
+
+    return pred_index
+
+
+# ----------- Process Open -------------
+def check_is_number(string):
+    """
+    Check if the given string is a number.
+    """
+    try:
+        float(string.replace(",", ""))
+        return True
+    except ValueError:
+        # check if there's comma inside
+        return False
+
+
+def normalize_str(string):
+    """
+    Normalize the str to lower case and make it a float number if possible.
+    """
+    # check if characters in the string
+
+    # if number, numerize it.
+    string = string.strip()
+
+    is_number = check_is_number(string)
+
+    if is_number:
+        string = string.replace(",", "")
+        string = float(string)
+        # leave 2 decimal places
+        string = round(string, 2)
+        return [string]
+    else:  # it's likely to be a string
+        # lower it
+        string = string.lower()
+        if len(string) == 1:
+            return [" " + string, string + " "]  # avoid trivial matches
+        return [string]
+
+
+def extract_numbers(string):
+    """
+    Extract all forms of numbers from a string with regex.
+    """
+    # Pattern for numbers with commas
+    pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b"
+    # Pattern for scientific notation
+    pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
+    # Pattern for simple numbers without commas
+    pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])"
+
+    # Extract numbers with commas
+    numbers_with_commas = re.findall(pattern_commas, string)
+    # Extract numbers in scientific notation
+    numbers_scientific = re.findall(pattern_scientific, string)
+    # Extract simple numbers without commas
+    numbers_simple = re.findall(pattern_simple, string)
+
+    # Combine all extracted numbers
+    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
+    return all_numbers
+
+
+def parse_open_response(response):
+    """
+    Parse the prediction from the generated response.
+    Return a list of predicted strings or numbers.
+    """
+
+    # content = content.strip("\n").strip(".").strip(" ")
+    def get_key_subresponses(response):
+        key_responses = []
+        response = response.strip().strip(".").lower()
+        sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response)
+        indicators_of_keys = [
+            "could be ",
+            "so ",
+            "is ",
+            "thus ",
+            "therefore ",
+            "final ",
+            "answer ",
+            "result ",
+        ]
+        key_responses = []
+        for index, resp in enumerate(sub_responses):
+            # if it is the last one, also accept an equation (the entire response can be just one sentence with an equation)
+            if index == len(sub_responses) - 1:
+                indicators_of_keys.extend(["="])
+            shortest_key_response = None  # the shortest response that may contain the answer (tail part of the response)
+            for indicator in indicators_of_keys:
+                if indicator in resp:
+                    if not shortest_key_response:
+                        shortest_key_response = resp.split(indicator)[-1].strip()
+                    else:
+                        if len(resp.split(indicator)[-1].strip()) < len(
+                            shortest_key_response
+                        ):
+                            shortest_key_response = resp.split(indicator)[-1].strip()
+                    # key_responses.append(resp.split(indicator)[1].strip())
+
+            if shortest_key_response:
+                # and it's not trivial
+                if shortest_key_response.strip() not in [
+                    ":",
+                    ",",
+                    ".",
+                    "!",
+                    "?",
+                    ";",
+                    ":",
+                    "'",
+                ]:
+                    key_responses.append(shortest_key_response)
+        if len(key_responses) == 0:  # did not find any
+            return [response]
+        return key_responses
+
+    # pdb.set_trace()
+    key_responses = get_key_subresponses(response)
+
+    pred_list = key_responses.copy()  # keep the original string response
+    for resp in key_responses:
+        pred_list.extend(extract_numbers(resp))
+
+    tmp_pred_list = []
+    for i in range(len(pred_list)):
+        tmp_pred_list.extend(normalize_str(pred_list[i]))
+    pred_list = tmp_pred_list
+
+    # remove duplicates
+    pred_list = list(set(pred_list))
+
+    return pred_list
+
+
+# ----------- Evaluation -------------
+
+
+def eval_multi_choice(gold_i, pred_i):
+    """
+    Evaluate a multiple choice instance.
+    """
+    correct = False
+    # only if they are exactly the same do we consider it correct
+    if isinstance(gold_i, list):
+        for answer in gold_i:
+            if answer == pred_i:
+                correct = True
+                break
+    else:  # gold_i is a string
+        if gold_i == pred_i:
+            correct = True
+    return correct
+
+
+def eval_open(gold_i, pred_i):
+    """
+    Evaluate an open question instance.
+    """
+    correct = False
+    if isinstance(gold_i, list):
+        # use float to avoid trivial matches
+        norm_answers = []
+        for answer in gold_i:
+            norm_answers.extend(normalize_str(answer))
+    else:
+        norm_answers = normalize_str(gold_i)
+    for pred in pred_i:  # pred is already normalized in the parse response phase
+        if isinstance(pred, str):  # if it's a string, check whether an answer appears in the prediction
+            for norm_ans in norm_answers:
+                # only check whether the string answer is contained in the string pred
+                if isinstance(norm_ans, str) and norm_ans in pred:
+                    if not correct:
+                        correct = True
+                    break
+        else:  # it's a float number
+            if pred in norm_answers:
+                if not correct:
+                    correct = True
+                break
+    return correct
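
For reviewers, a minimal usage sketch (not part of the patch) of the answer-parsing helpers added in `lm_eval/tasks/mmmu/utils.py`. The sample responses, the `mmmu_utils` module alias, and the assumption that the snippet runs from the repository root are all illustrative assumptions, not part of the harness API.

```python
# Illustrative sketch only -- not part of the patch.
# Loads utils.py by file path (assumes the current working directory is the repo root)
# and exercises the multiple-choice and open-ended parsing helpers on made-up responses.
import importlib.util

spec = importlib.util.spec_from_file_location("mmmu_utils", "lm_eval/tasks/mmmu/utils.py")
mmmu_utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mmmu_utils)

# Multiple-choice: a verbose generation is reduced to a single option letter.
index2ans = {"A": "3.5 m/s", "B": "7.0 m/s", "C": "14.0 m/s"}
all_choices = list(index2ans.keys())
pred = mmmu_utils.parse_multi_choice_response(
    "The velocity works out to 7.0 m/s, so the answer is (B).", all_choices, index2ans
)
print(pred, mmmu_utils.eval_multi_choice("B", pred))  # expected: B True

# Open-ended: the generation is split into candidate strings/numbers,
# then compared against the normalized gold answer.
open_preds = mmmu_utils.parse_open_response("Therefore, the total cost is 1,250 dollars.")
print(mmmu_utils.eval_open("1250", open_preds))  # expected: True
```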