add Ovis1.6-Gemma2-27B

AIDC-AI · Nov 26, 2024 · d248e34 · d248e34
1 parent 831b595
commit d248e34
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -7,6 +7,7 @@ Ovis (Open VISion) is a novel Multimodal Large Language Model (MLLM) architectur
 </div>
 
 ## Release
+- [11/26] 🔥 Announcing [Ovis1.6-Gemma2-27B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B)!
 - [11/04] 🔥 Announcing quantized versions of Ovis1.6: [Ovis1.6-Gemma2-9B-GPTQ-Int4](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4) and [Ovis1.6-Llama3.2-3B-GPTQ-Int4](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B-GPTQ-Int4)!
 - [10/22] 🔥 Announcing Ovis1.6-Llama3.2-3B ([Model](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B), [Demo](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Llama3.2-3B))!
 - [09/19] 🔥 Announcing Ovis1.6-Gemma2-9B ([Model](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B), [Demo](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Gemma2-9B))! This latest release further enhances high-resolution image processing, is trained on a larger, more diverse, and higher-quality dataset, and refines the training process with DPO training following instruction-tuning.
@@ -25,7 +26,7 @@ Ovis (Open VISion) is a novel Multimodal Large Language Model (MLLM) architectur
 - [License](#license)
 
 ## Install
-Ovis has been tested with Python 3.10, Torch 2.2.0, Transformers 4.44.2, and DeepSpeed 0.14.4. For a comprehensive list of package dependencies, please consult the `requirements.txt` file. Before finetuning or inference, please install Ovis as follows.
+Ovis has been tested with Python 3.10, Torch 2.4.0, Transformers 4.46.2, and DeepSpeed 0.15.4. For a comprehensive list of package dependencies, please consult the `requirements.txt` file. Before finetuning or inference, please install Ovis as follows.
 ```bash
 git clone git@github.com:AIDC-AI/Ovis.git
 conda create -n ovis python=3.10 -y
@@ -40,10 +41,15 @@ Ovis can be instantiated with popular LLMs. We provide the following Ovis MLLMs:
 
 | Ovis MLLMs        | ViT         | LLM                |                          Model Weights                          | Demo                                                             |
 |:------------------|:-----------:|:------------------:|:---------------------------------------------------------------:|:----------------------------------------------------------------:|
+| Ovis1.6-Gemma2-27B | Siglip-400M | Gemma2-27B-It       | [Huggingface](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B) | - |
 | Ovis1.6-Gemma2-9B | Siglip-400M | Gemma2-9B-It       | [Huggingface](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B) | [Space](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Gemma2-9B) |
 | Ovis1.6-Llama3.2-3B | Siglip-400M | Llama-3.2-3B-Instruct       | [Huggingface](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B) | [Space](https://huggingface.co/spaces/AIDC-AI/Ovis1.6-Llama3.2-3B) |
 
 ## Performance
+With **29B** parameters, **Ovis1.6-Gemma2-27B** achieves exceptional performance in the [OpenCompass](https://github.com/open-compass/VLMEvalKit) benchmark, ranking among the top-tier open-source MLLMs.
+
+![performance-Ovis1_6-Gemma2-27B](docs/performance/Ovis1_6-Gemma2-27B.png)
+
 With just **10B** parameters, **Ovis1.6-Gemma2-9B** leads the [OpenCompass](https://github.com/open-compass/VLMEvalKit) benchmark among open-source MLLMs within **30B** parameters.
 
 ![performance-Ovis1_6-Gemma2-9B](docs/performance/Ovis1_6-Gemma2-9B.png)

diff --git a/docs/performance/Ovis1_6-Gemma2-27B.png b/docs/performance/Ovis1_6-Gemma2-27B.png
diff --git a/ovis/model/modeling_ovis.py b/ovis/model/modeling_ovis.py
@@ -1,12 +1,15 @@
 import logging
 import os
+
+from packaging import version
 from datetime import datetime
 from importlib import import_module
 from typing import List, Union, Callable, Optional, Dict
 
 import PIL.Image
 import deepspeed
 import torch
+import transformers
 from torch import Tensor
 from torch.nn import init
 from transformers import PreTrainedModel, AutoConfig, AutoModel, AutoTokenizer, AutoModelForCausalLM
@@ -361,29 +364,46 @@ def save_pretrained(
         #                                             safe_serialization=safe_serialization)
         # self.get_visual_tokenizer().get_image_processor().save_pretrained(visual_tokenizer_directory)
 
-    def _get_hybrid_cache_for_llm(self, max_batch_size: int, max_cache_len: int):
+    def _get_hybrid_cache_for_llm(self, batch_size: int, max_cache_len: int):
         cache_cls = HybridCache
         llm = self.get_llm()
 
-        need_new_cache = (
-            not hasattr(llm, "_cache")
-            or (not isinstance(llm._cache, cache_cls))
-            or llm._cache.max_batch_size != max_batch_size
-            or llm._cache.max_cache_len < max_cache_len
-        )
+        if version.parse(transformers.__version__) >= version.parse("4.46.0"):
+            need_new_cache = (
+                not hasattr(llm, "_cache")
+                or (not isinstance(llm._cache, cache_cls))
+                or llm._cache.batch_size != batch_size
+                or llm._cache.max_cache_len < max_cache_len
+            )
+        else:
+            need_new_cache = (
+                not hasattr(llm, "_cache")
+                or (not isinstance(llm._cache, cache_cls))
+                or llm._cache.max_batch_size != batch_size
+                or llm._cache.max_cache_len < max_cache_len
+            )
 
         if need_new_cache:
             if hasattr(llm.config, "_pre_quantization_dtype"):
                 cache_dtype = llm.config._pre_quantization_dtype
             else:
                 cache_dtype = llm.dtype
-            llm._cache = cache_cls(
-                config=llm.config,
-                max_batch_size=max_batch_size,
-                max_cache_len=max_cache_len,
-                device=llm.device,
-                dtype=cache_dtype,
-            )
+            if version.parse(transformers.__version__) >= version.parse("4.46.0"):
+                llm._cache = cache_cls(
+                    config=llm.config,
+                    batch_size=batch_size,
+                    max_cache_len=max_cache_len,
+                    device=llm.device,
+                    dtype=cache_dtype,
+                )
+            else:
+                llm._cache = cache_cls(
+                    config=llm.config,
+                    max_batch_size=batch_size,
+                    max_cache_len=max_cache_len,
+                    device=llm.device,
+                    dtype=cache_dtype,
+                )
         else:
             llm._cache.reset()
         return llm._cache

diff --git a/requirements.txt b/requirements.txt
@@ -1,9 +1,9 @@
-torch==2.2.0
-transformers==4.44.2
-tokenizers==0.19.1
+torch==2.4.0
+transformers==4.46.2
+tokenizers==0.20.3
 sentencepiece==0.1.99
 pyarrow==14.0.2
-accelerate==0.29.0
+accelerate==1.1.0
 pydantic==2.8.2
 markdown2[all]
 numpy==1.24.3
@@ -22,5 +22,5 @@ pandas
 torchaudio
 xformers
 pillow==10.3.0
-deepspeed==0.14.4
+deepspeed==0.15.4
 gradio