diff --git a/README.md b/README.md
index bbf1db9c..2290be08 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# ⚡️ What is FastEmbed?

-FastEmbed is a lightweight, fast, Python library built for embedding generation. We [support popular text models](https://qdrant.github.io/fastembed/examples/Supported_Models/). Please [open a Github issue](https://github.com/qdrant/fastembed/issues/new) if you want us to add a new model.
+FastEmbed is a lightweight, fast Python library built for embedding generation. We [support popular text models](https://qdrant.github.io/fastembed/examples/Supported_Models/). Please [open a GitHub issue](https://github.com/qdrant/fastembed/issues/new) if you want us to add a new model.

The default text embedding (`TextEmbedding`) model is Flag Embedding, the top model in the [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard. It supports "query" and "passage" prefixes for the input text. Here is an example for [Retrieval Embedding Generation](https://qdrant.github.io/fastembed/examples/Retrieval_with_FastEmbed/) and how to use [FastEmbed with Qdrant](https://qdrant.github.io/fastembed/examples/Usage_With_Qdrant/).
@@ -48,7 +48,7 @@ Installation with Qdrant Client in Python:
pip install qdrant-client[fastembed]
```

-Might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh.
+You might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh, since zsh treats square brackets as glob characters.

```python
from qdrant_client import QdrantClient
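The README hunk above sits just before the client usage snippet; here is a minimal sketch of that integration, with a placeholder collection name (`QdrantClient.add` is the FastEmbed-backed convenience method, and `:memory:` keeps the example self-contained):

```python
from qdrant_client import QdrantClient

# In-memory Qdrant instance; pass a URL instead for a real deployment.
client = QdrantClient(":memory:")

# add() embeds the documents with FastEmbed under the hood and upserts them.
client.add(
    collection_name="demo_collection",  # placeholder name
    documents=["FastEmbed is lightweight and fast."],
)
```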
diff --git a/docs/examples/Supported_Models.ipynb b/docs/examples/Supported_Models.ipynb
index 47cd9ab3..c180b9ea 100644
--- a/docs/examples/Supported_Models.ipynb
+++ b/docs/examples/Supported_Models.ipynb
@@ -110,6 +110,14 @@
"    </tr>\n",
"    <tr>\n",
"      <th>8</th>\n",
+ "      <td>nomic-ai/nomic-embed-text-v1</td>\n",
+ "      <td>768</td>\n",
+ "      <td>8192 context length English model</td>\n",
+ "      <td>0.54</td>\n",
+ "      <td>{'hf': 'xenova/nomic-embed-text-v1'}</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>9</th>\n",
"      <td>intfloat/multilingual-e5-large</td>\n",
"      <td>1024</td>\n",
"      <td>Multilingual model, e5-large. Recommend using this model for non-English languages</td>\n",
@@ -117,7 +125,7 @@
"      <td>{'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'}</td>\n",
"    </tr>\n",
"    <tr>\n",
- "      <th>9</th>\n",
+ "      <th>10</th>\n",
"      <td>sentence-transformers/paraphrase-multilingual-mpnet-base-v2</td>\n",
"      <td>768</td>\n",
"      <td>Sentence-transformers model for tasks like clustering or semantic search</td>\n",
@@ -125,7 +133,7 @@
"      <td>{'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'}</td>\n",
"    </tr>\n",
"    <tr>\n",
- "      <th>10</th>\n",
+ "      <th>11</th>\n",
"      <td>jinaai/jina-embeddings-v2-base-en</td>\n",
"      <td>768</td>\n",
"      <td>English embedding model supporting 8192 sequence length</td>\n",
@@ -133,7 +141,7 @@
"      <td>{'hf': 'xenova/jina-embeddings-v2-base-en'}</td>\n",
"    </tr>\n",
"    <tr>\n",
- "      <th>11</th>\n",
+ "      <th>12</th>\n",
"      <td>jinaai/jina-embeddings-v2-small-en</td>\n",
"      <td>512</td>\n",
"      <td>English embedding model supporting 8192 sequence length</td>\n",
@@ -154,10 +162,11 @@
"5 BAAI/bge-small-en-v1.5 384 \n",
"6 BAAI/bge-small-zh-v1.5 512 \n",
"7 sentence-transformers/all-MiniLM-L6-v2 384 \n",
- "8 intfloat/multilingual-e5-large 1024 \n",
- "9 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n",
- "10 jinaai/jina-embeddings-v2-base-en 768 \n",
- "11 jinaai/jina-embeddings-v2-small-en 512 \n",
+ "8 nomic-ai/nomic-embed-text-v1 768 \n",
+ "9 intfloat/multilingual-e5-large 1024 \n",
+ "10 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n",
+ "11 jinaai/jina-embeddings-v2-base-en 768 \n",
+ "12 jinaai/jina-embeddings-v2-small-en 512 \n",
"\n",
" description \\\n",
"0 Base English model \n",
@@ -168,10 +177,11 @@
"5 Fast and Default English model \n",
"6 Fast and recommended Chinese model \n",
"7 Sentence Transformer model, MiniLM-L6-v2 \n",
- "8 Multilingual model, e5-large. Recommend using this model for non-English languages \n",
- "9 Sentence-transformers model for tasks like clustering or semantic search \n",
- "10 English embedding model supporting 8192 sequence length \n",
+ "8 8192 context length English model \n",
+ "9 Multilingual model, e5-large. Recommend using this model for non-English languages \n",
+ "10 Sentence-transformers model for tasks like clustering or semantic search \n",
"11 English embedding model supporting 8192 sequence length \n",
+ "12 English embedding model supporting 8192 sequence length \n",
"\n",
" size_in_GB \\\n",
"0 0.50 \n",
@@ -182,10 +192,11 @@
"5 0.13 \n",
"6 0.10 \n",
"7 0.09 \n",
- "8 2.24 \n",
- "9 1.11 \n",
- "10 0.55 \n",
- "11 0.13 \n",
+ "8 0.54 \n",
+ "9 2.24 \n",
+ "10 1.11 \n",
+ "11 0.55 \n",
+ "12 0.13 \n",
"\n",
" sources \n",
"0 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz'} \n",
@@ -196,10 +207,11 @@
"5 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en-v1.5.tar.gz', 'hf': 'qdrant/bge-small-en-v1.5-onnx-q'} \n",
"6 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz'} \n",
"7 {'url': 'https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz', 'hf': 'qdrant/all-MiniLM-L6-v2-onnx'} \n",
- "8 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n",
- "9 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n",
- "10 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n",
- "11 {'hf': 'xenova/jina-embeddings-v2-small-en'} "
+ "8 {'hf': 'xenova/nomic-embed-text-v1'} \n",
+ "9 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n",
+ "10 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n",
+ "11 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n",
+ "12 {'hf': 'xenova/jina-embeddings-v2-small-en'} "
]
},
"execution_count": 4,
@@ -232,7 +244,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.5"
+ "version": "3.11.7"
},
"orig_nbformat": 4
},
diff --git a/fastembed/__init__.py b/fastembed/__init__.py
index f14800c6..493d0d43 100644
--- a/fastembed/__init__.py
+++ b/fastembed/__init__.py
@@ -1,3 +1,3 @@
-from .embedding import TextEmbedding
+from fastembed.text.text_embedding import TextEmbedding

-__all__ = [TextEmbedding]
\ No newline at end of file
+__all__ = ["TextEmbedding"]
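The `__init__.py` change moves `TextEmbedding` into the `fastembed.text` package while re-exporting it at the top level, and fixes `__all__` to hold names rather than class objects, so existing imports keep working; a quick check:

```python
# Both paths resolve to the same class after this change.
from fastembed import TextEmbedding
from fastembed.text.text_embedding import TextEmbedding as MovedTextEmbedding

assert TextEmbedding is MovedTextEmbedding
```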
diff --git a/fastembed/common/models.py b/fastembed/common/models.py
index 74cbdab8..ccfe21a1 100644
--- a/fastembed/common/models.py
+++ b/fastembed/common/models.py
@@ -33,7 +33,7 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
tokenizer = Tokenizer.from_file(str(tokenizer_path))
tokenizer.enable_truncation(max_length=min(tokenizer_config["model_max_length"], max_length))
- tokenizer.enable_padding(pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"])
+ tokenizer.enable_padding(pad_id=config.get("pad_token_id", 0), pad_token=tokenizer_config["pad_token"])  # tolerate configs that omit pad_token_id
for token in tokens_map.values():
if isinstance(token, str):
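The switch to `config.get("pad_token_id", 0)` tolerates model configs that omit the key; presumably the newly added nomic model ships such a `config.json` (an assumption, the diff does not say). A minimal illustration of the difference:

```python
# Hypothetical config.json payload for a model without "pad_token_id".
config = {"model_type": "bert"}

# Old code: config["pad_token_id"] would raise KeyError here.
pad_id = config.get("pad_token_id", 0)  # new code falls back to pad id 0
assert pad_id == 0
```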
diff --git a/fastembed/common/utils.py b/fastembed/common/utils.py
index 89f881b2..a7e46772 100644
--- a/fastembed/common/utils.py
+++ b/fastembed/common/utils.py
@@ -24,10 +24,10 @@ def define_cache_dir(cache_dir: Optional[str] = None) -> Path:
"""
if cache_dir is None:
default_cache_dir = os.path.join(tempfile.gettempdir(), "fastembed_cache")
- cache_dir = Path(os.getenv("FASTEMBED_CACHE_PATH", default_cache_dir))
+ cache_path = Path(os.getenv("FASTEMBED_CACHE_PATH", default_cache_dir))
else:
- cache_dir = Path(cache_dir)
+ cache_path = Path(cache_dir)
- cache_dir.mkdir(parents=True, exist_ok=True)
+ cache_path.mkdir(parents=True, exist_ok=True)
- return cache_dir
+ return cache_path
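The rename in `define_cache_dir` stops reassigning the `cache_dir: Optional[str]` parameter with a `Path`, keeping the annotation truthful; behavior is unchanged. Expected usage, assuming `FASTEMBED_CACHE_PATH` is unset:

```python
from fastembed.common.utils import define_cache_dir

# With no argument this resolves to <tempdir>/fastembed_cache
# and creates the directory if it does not exist yet.
cache_path = define_cache_dir()
assert cache_path.is_dir()
```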
diff --git a/fastembed/text/e5_onnx_embedding.py b/fastembed/text/e5_onnx_embedding.py
index 6d3787aa..6ff53dd6 100644
--- a/fastembed/text/e5_onnx_embedding.py
+++ b/fastembed/text/e5_onnx_embedding.py
@@ -22,8 +22,8 @@
"size_in_GB": 1.11,
"sources": {
"hf": "xenova/paraphrase-multilingual-mpnet-base-v2",
- }
- }
+ },
+ },
]
@@ -51,8 +51,8 @@ def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str,
class E5OnnxEmbeddingWorker(OnnxTextEmbeddingWorker):
def init_embedding(
- self,
- model_name: str,
- cache_dir: str,
+ self,
+ model_name: str,
+ cache_dir: str,
) -> E5OnnxEmbedding:
return E5OnnxEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1)
diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py
index a60a2912..165f6eb4 100644
--- a/fastembed/text/onnx_embedding.py
+++ b/fastembed/text/onnx_embedding.py
@@ -1,6 +1,6 @@
import os
from multiprocessing import get_all_start_methods
-from typing import List, Dict, Any, Tuple, Union, Iterable, Type
+from typing import List, Dict, Any, Optional, Tuple, Union, Iterable, Type
import numpy as np
import onnxruntime as ort
@@ -98,6 +98,15 @@
"hf": "qdrant/all-MiniLM-L6-v2-onnx",
},
},
+ {
+ "model": "nomic-ai/nomic-embed-text-v1",
+ "dim": 768,
+ "description": "8192 context length English model",
+ "size_in_GB": 0.54,
+ "sources": {
+ "hf": "nomic-ai/nomic-embed-text-v1",
+ },
+ },
# {
# "model": "sentence-transformers/all-MiniLM-L6-v2",
# "dim": 384,
@@ -149,8 +158,8 @@ def _get_model_description(cls, model_name: str) -> Dict[str, Any]:
def __init__(
self,
model_name: str = "BAAI/bge-small-en-v1.5",
- cache_dir: str = None,
- threads: int = None,
+ cache_dir: Optional[str] = None,
+ threads: Optional[int] = None,
**kwargs,
):
"""
@@ -193,7 +202,7 @@ def embed(
self,
documents: Union[str, Iterable[str]],
batch_size: int = 256,
- parallel: int = None,
+ parallel: Optional[int] = None,
**kwargs,
) -> Iterable[np.ndarray]:
"""
diff --git a/fastembed/text/text_embedding.py b/fastembed/text/text_embedding.py
index 6107044b..d44d7c7d 100644
--- a/fastembed/text/text_embedding.py
+++ b/fastembed/text/text_embedding.py
@@ -53,24 +53,22 @@ def __init__(
):
super().__init__(model_name, cache_dir, threads, **kwargs)
- self.model = None
for embedding in self.EMBEDDINGS_REGISTRY:
supported_models = embedding.list_supported_models()
if any(model_name == model["model"] for model in supported_models):
self.model = embedding(model_name, cache_dir, threads, **kwargs)
- break
+ return
- if self.model is None:
- raise ValueError(
- f"Model {model_name} is not supported in TextEmbedding."
- "Please check the supported models using `TextEmbedding.list_supported_models()`"
- )
+ raise ValueError(
+ f"Model {model_name} is not supported in TextEmbedding. "
+ "Please check the supported models using `TextEmbedding.list_supported_models()`"
+ )
def embed(
self,
documents: Union[str, Iterable[str]],
batch_size: int = 256,
- parallel: int = None,
+ parallel: Optional[int] = None,
**kwargs,
) -> Iterable[np.ndarray]:
"""
diff --git a/fastembed/text/text_embedding_base.py b/fastembed/text/text_embedding_base.py
index 49ac9804..158a97c0 100644
--- a/fastembed/text/text_embedding_base.py
+++ b/fastembed/text/text_embedding_base.py
@@ -19,7 +19,7 @@ def embed(
self,
documents: Union[str, Iterable[str]],
batch_size: int = 256,
- parallel: int = None,
+ parallel: Optional[int] = None,
**kwargs,
) -> Iterable[np.ndarray]:
raise NotImplementedError()
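The base-class signature above fixes `parallel: Optional[int]` across the hierarchy, and the `embed` contract (documents in, an iterable of `np.ndarray` out) stays the same; a hedged end-to-end sketch with the newly registered model (the 768-dim shape comes from the registry entry, the sample sentence is arbitrary):

```python
import numpy as np

from fastembed import TextEmbedding

model = TextEmbedding(model_name="nomic-ai/nomic-embed-text-v1")

# embed() yields one vector per document; batch_size and parallel keep their defaults.
vectors = list(model.embed(["FastEmbed now ships an 8192-context model."]))
assert isinstance(vectors[0], np.ndarray)
assert vectors[0].shape == (768,)
```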
diff --git a/tests/test_onnx_embeddings.py b/tests/test_onnx_embeddings.py
index 6fcc64ae..7f580e6b 100644
--- a/tests/test_onnx_embeddings.py
+++ b/tests/test_onnx_embeddings.py
@@ -20,6 +20,7 @@
),
"jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
"jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
+ "nomic-ai/nomic-embed-text-v1": np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170]),
}
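The canonical values pin the first five components of an embedding so that regressions in the ONNX export or preprocessing fail loudly; a sketch of the kind of assertion they feed (the probe text and tolerance are assumptions, not the repo's exact test code):

```python
import numpy as np

from fastembed import TextEmbedding

CANONICAL = np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170])

model = TextEmbedding("nomic-ai/nomic-embed-text-v1")
embedding = next(iter(model.embed(["hello world"])))  # probe text is an assumption

# Loose tolerance: the canonical values are rounded to four decimals.
assert np.allclose(embedding[:5], CANONICAL, atol=1e-3)
```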
diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py
index f610fd86..8f523adf 100644
--- a/tests/test_text_onnx_embeddings.py
+++ b/tests/test_text_onnx_embeddings.py
@@ -20,6 +20,7 @@
),
"jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
"jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
+ "nomic-ai/nomic-embed-text-v1": np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170]),
}
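With both test tables updated, the new model should also be discoverable through the public listing API that the constructor's error message points to; a final smoke check:

```python
from fastembed import TextEmbedding

# list_supported_models() returns dicts with a "model" key, as the
# dispatch loop in text_embedding.py relies on.
names = [entry["model"] for entry in TextEmbedding.list_supported_models()]
assert "nomic-ai/nomic-embed-text-v1" in names
```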