From 639327644e4b55c035cc32413ca4960cbb259389 Mon Sep 17 00:00:00 2001 From: Anush008 Date: Tue, 13 Feb 2024 23:39:35 +0530 Subject: [PATCH 1/2] feat: Added nomic-embed-text-v1 support --- README.md | 4 +-- docs/examples/Supported_Models.ipynb | 50 +++++++++++++++++---------- fastembed/__init__.py | 4 +-- fastembed/common/models.py | 2 +- fastembed/common/utils.py | 8 ++--- fastembed/text/e5_onnx_embedding.py | 10 +++--- fastembed/text/onnx_embedding.py | 17 ++++++--- fastembed/text/text_embedding.py | 14 ++++---- fastembed/text/text_embedding_base.py | 2 +- tests/test_onnx_embeddings.py | 1 + tests/test_text_onnx_embeddings.py | 1 + 11 files changed, 67 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index bbf1db9c..2290be08 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ⚡️ What is FastEmbed? -FastEmbed is a lightweight, fast, Python library built for embedding generation. We [support popular text models](https://qdrant.github.io/fastembed/examples/Supported_Models/). Please [open a Github issue](https://github.com/qdrant/fastembed/issues/new) if you want us to add a new model. +FastEmbed is a lightweight, fast, Python library built for embedding generation. We [support popular text models](https://qdrant.github.io/fastembed/examples/Supported_Models/). Please [open a GitHub issue](https://github.com/qdrant/fastembed/issues/new) if you want us to add a new model. The default text embedding (`TextEmbedding`) model is Flag Embedding, the top model in the [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard. It supports "query" and "passage" prefixes for the input text. Here is an example for [Retrieval Embedding Generation](https://qdrant.github.io/fastembed/examples/Retrieval_with_FastEmbed/) and how to use [FastEmbed with Qdrant](https://qdrant.github.io/fastembed/examples/Usage_With_Qdrant/). @@ -48,7 +48,7 @@ Installation with Qdrant Client in Python: pip install qdrant-client[fastembed] ``` -Might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh. +You might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh. ```python from qdrant_client import QdrantClient diff --git a/docs/examples/Supported_Models.ipynb b/docs/examples/Supported_Models.ipynb index 47cd9ab3..c180b9ea 100644 --- a/docs/examples/Supported_Models.ipynb +++ b/docs/examples/Supported_Models.ipynb @@ -110,6 +110,14 @@ " \n", " \n", " 8\n", + " nomic-ai/nomic-embed-text-v1\n", + " 768\n", + " 8192 context length english model\n", + " 0.54\n", + " {'hf': 'xenova/nomic-embed-text-v1'}\n", + " \n", + " \n", + " 9\n", " intfloat/multilingual-e5-large\n", " 1024\n", " Multilingual model, e5-large. Recommend using this model for non-English languages\n", @@ -117,7 +125,7 @@ " {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'}\n", " \n", " \n", - " 9\n", + " 10\n", " sentence-transformers/paraphrase-multilingual-mpnet-base-v2\n", " 768\n", " Sentence-transformers model for tasks like clustering or semantic search\n", @@ -125,7 +133,7 @@ " {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'}\n", " \n", " \n", - " 10\n", + " 11\n", " jinaai/jina-embeddings-v2-base-en\n", " 768\n", " English embedding model supporting 8192 sequence length\n", @@ -133,7 +141,7 @@ " {'hf': 'xenova/jina-embeddings-v2-base-en'}\n", " \n", " \n", - " 11\n", + " 12\n", " jinaai/jina-embeddings-v2-small-en\n", " 512\n", " English embedding model supporting 8192 sequence length\n", @@ -154,10 +162,11 @@ "5 BAAI/bge-small-en-v1.5 384 \n", "6 BAAI/bge-small-zh-v1.5 512 \n", "7 sentence-transformers/all-MiniLM-L6-v2 384 \n", - "8 intfloat/multilingual-e5-large 1024 \n", - "9 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n", - "10 jinaai/jina-embeddings-v2-base-en 768 \n", - "11 jinaai/jina-embeddings-v2-small-en 512 \n", + "8 nomic-ai/nomic-embed-text-v1 768 \n", + "9 intfloat/multilingual-e5-large 1024 \n", + "10 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n", + "11 jinaai/jina-embeddings-v2-base-en 768 \n", + "12 jinaai/jina-embeddings-v2-small-en 512 \n", "\n", " description \\\n", "0 Base English model \n", @@ -168,10 +177,11 @@ "5 Fast and Default English model \n", "6 Fast and recommended Chinese model \n", "7 Sentence Transformer model, MiniLM-L6-v2 \n", - "8 Multilingual model, e5-large. Recommend using this model for non-English languages \n", - "9 Sentence-transformers model for tasks like clustering or semantic search \n", - "10 English embedding model supporting 8192 sequence length \n", + "8 8192 context length english model \n", + "9 Multilingual model, e5-large. Recommend using this model for non-English languages \n", + "10 Sentence-transformers model for tasks like clustering or semantic search \n", "11 English embedding model supporting 8192 sequence length \n", + "12 English embedding model supporting 8192 sequence length \n", "\n", " size_in_GB \\\n", "0 0.50 \n", @@ -182,10 +192,11 @@ "5 0.13 \n", "6 0.10 \n", "7 0.09 \n", - "8 2.24 \n", - "9 1.11 \n", - "10 0.55 \n", - "11 0.13 \n", + "8 0.54 \n", + "9 2.24 \n", + "10 1.11 \n", + "11 0.55 \n", + "12 0.13 \n", "\n", " sources \n", "0 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz'} \n", @@ -196,10 +207,11 @@ "5 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en-v1.5.tar.gz', 'hf': 'qdrant/bge-small-en-v1.5-onnx-q'} \n", "6 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz'} \n", "7 {'url': 'https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz', 'hf': 'qdrant/all-MiniLM-L6-v2-onnx'} \n", - "8 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n", - "9 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n", - "10 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n", - "11 {'hf': 'xenova/jina-embeddings-v2-small-en'} " + "8 {'hf': 'xenova/nomic-embed-text-v1'} \n", + "9 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n", + "10 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n", + "11 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n", + "12 {'hf': 'xenova/jina-embeddings-v2-small-en'} " ] }, "execution_count": 4, @@ -232,7 +244,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.7" }, "orig_nbformat": 4 }, diff --git a/fastembed/__init__.py b/fastembed/__init__.py index f14800c6..493d0d43 100644 --- a/fastembed/__init__.py +++ b/fastembed/__init__.py @@ -1,3 +1,3 @@ -from .embedding import TextEmbedding +from fastembed.text.text_embedding import TextEmbedding -__all__ = [TextEmbedding] \ No newline at end of file +__all__ = ["TextEmbedding"] diff --git a/fastembed/common/models.py b/fastembed/common/models.py index 74cbdab8..ccfe21a1 100644 --- a/fastembed/common/models.py +++ b/fastembed/common/models.py @@ -33,7 +33,7 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer: tokenizer = Tokenizer.from_file(str(tokenizer_path)) tokenizer.enable_truncation(max_length=min(tokenizer_config["model_max_length"], max_length)) - tokenizer.enable_padding(pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"]) + tokenizer.enable_padding(pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"]) for token in tokens_map.values(): if isinstance(token, str): diff --git a/fastembed/common/utils.py b/fastembed/common/utils.py index 89f881b2..a7e46772 100644 --- a/fastembed/common/utils.py +++ b/fastembed/common/utils.py @@ -24,10 +24,10 @@ def define_cache_dir(cache_dir: Optional[str] = None) -> Path: """ if cache_dir is None: default_cache_dir = os.path.join(tempfile.gettempdir(), "fastembed_cache") - cache_dir = Path(os.getenv("FASTEMBED_CACHE_PATH", default_cache_dir)) + cache_path = Path(os.getenv("FASTEMBED_CACHE_PATH", default_cache_dir)) else: - cache_dir = Path(cache_dir) + cache_path = Path(cache_dir) - cache_dir.mkdir(parents=True, exist_ok=True) + cache_path.mkdir(parents=True, exist_ok=True) - return cache_dir + return cache_path diff --git a/fastembed/text/e5_onnx_embedding.py b/fastembed/text/e5_onnx_embedding.py index 6d3787aa..6ff53dd6 100644 --- a/fastembed/text/e5_onnx_embedding.py +++ b/fastembed/text/e5_onnx_embedding.py @@ -22,8 +22,8 @@ "size_in_GB": 1.11, "sources": { "hf": "xenova/paraphrase-multilingual-mpnet-base-v2", - } - } + }, + }, ] @@ -51,8 +51,8 @@ def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str, class E5OnnxEmbeddingWorker(OnnxTextEmbeddingWorker): def init_embedding( - self, - model_name: str, - cache_dir: str, + self, + model_name: str, + cache_dir: str, ) -> E5OnnxEmbedding: return E5OnnxEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1) diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index a60a2912..d92b3a4b 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -1,6 +1,6 @@ import os from multiprocessing import get_all_start_methods -from typing import List, Dict, Any, Tuple, Union, Iterable, Type +from typing import List, Dict, Any, Optional, Tuple, Union, Iterable, Type import numpy as np import onnxruntime as ort @@ -98,6 +98,15 @@ "hf": "qdrant/all-MiniLM-L6-v2-onnx", }, }, + { + "model": "nomic-ai/nomic-embed-text-v1", + "dim": 768, + "description": "8192 context length english model", + "size_in_GB": 0.54, + "sources": { + "hf": "xenova/nomic-embed-text-v1", + }, + }, # { # "model": "sentence-transformers/all-MiniLM-L6-v2", # "dim": 384, @@ -149,8 +158,8 @@ def _get_model_description(cls, model_name: str) -> Dict[str, Any]: def __init__( self, model_name: str = "BAAI/bge-small-en-v1.5", - cache_dir: str = None, - threads: int = None, + cache_dir: Optional[str] = None, + threads: Optional[int] = None, **kwargs, ): """ @@ -193,7 +202,7 @@ def embed( self, documents: Union[str, Iterable[str]], batch_size: int = 256, - parallel: int = None, + parallel: Optional[int] = None, **kwargs, ) -> Iterable[np.ndarray]: """ diff --git a/fastembed/text/text_embedding.py b/fastembed/text/text_embedding.py index 6107044b..d44d7c7d 100644 --- a/fastembed/text/text_embedding.py +++ b/fastembed/text/text_embedding.py @@ -53,24 +53,22 @@ def __init__( ): super().__init__(model_name, cache_dir, threads, **kwargs) - self.model = None for embedding in self.EMBEDDINGS_REGISTRY: supported_models = embedding.list_supported_models() if any(model_name == model["model"] for model in supported_models): self.model = embedding(model_name, cache_dir, threads, **kwargs) - break + return - if self.model is None: - raise ValueError( - f"Model {model_name} is not supported in TextEmbedding." - "Please check the supported models using `TextEmbedding.list_supported_models()`" - ) + raise ValueError( + f"Model {model_name} is not supported in TextEmbedding." + "Please check the supported models using `TextEmbedding.list_supported_models()`" + ) def embed( self, documents: Union[str, Iterable[str]], batch_size: int = 256, - parallel: int = None, + parallel: Optional[int] = None, **kwargs, ) -> Iterable[np.ndarray]: """ diff --git a/fastembed/text/text_embedding_base.py b/fastembed/text/text_embedding_base.py index 49ac9804..158a97c0 100644 --- a/fastembed/text/text_embedding_base.py +++ b/fastembed/text/text_embedding_base.py @@ -19,7 +19,7 @@ def embed( self, documents: Union[str, Iterable[str]], batch_size: int = 256, - parallel: int = None, + parallel: Optional[int] = None, **kwargs, ) -> Iterable[np.ndarray]: raise NotImplementedError() diff --git a/tests/test_onnx_embeddings.py b/tests/test_onnx_embeddings.py index 6fcc64ae..7f580e6b 100644 --- a/tests/test_onnx_embeddings.py +++ b/tests/test_onnx_embeddings.py @@ -20,6 +20,7 @@ ), "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), + "nomic-ai/nomic-embed-text-v1": np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170]), } diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index f610fd86..8f523adf 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -20,6 +20,7 @@ ), "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), + "nomic-ai/nomic-embed-text-v1": np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170]), } From 86cc4b17d70bb2c6b43e10b4bdde64c4cbc3837f Mon Sep 17 00:00:00 2001 From: Anush Date: Wed, 14 Feb 2024 22:50:17 +0530 Subject: [PATCH 2/2] chore: xenova/nomic-embed-text-v1 -> nomic-ai/nomic-embed-text-v1 --- fastembed/text/onnx_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index d92b3a4b..165f6eb4 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -104,7 +104,7 @@ "description": "8192 context length english model", "size_in_GB": 0.54, "sources": { - "hf": "xenova/nomic-embed-text-v1", + "hf": "nomic-ai/nomic-embed-text-v1", }, }, # {