diff --git a/docs/examples/Supported_Models.ipynb b/docs/examples/Supported_Models.ipynb index 18b64899..44f51db3 100644 --- a/docs/examples/Supported_Models.ipynb +++ b/docs/examples/Supported_Models.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -45,47 +45,47 @@ " \n", " \n", " 0\n", + " BAAI/bge-base-en\n", + " 768\n", + " Base English model\n", + " 0.50\n", + " \n", + " \n", + " 1\n", + " BAAI/bge-base-en-v1.5\n", + " 768\n", + " Base English model, v1.5\n", + " 0.44\n", + " \n", + " \n", + " 2\n", + " BAAI/bge-large-en-v1.5\n", + " 1024\n", + " Large English model, v1.5\n", + " 1.34\n", + " \n", + " \n", + " 3\n", " BAAI/bge-small-en\n", " 384\n", " Fast English model\n", " 0.20\n", " \n", " \n", - " 1\n", + " 4\n", " BAAI/bge-small-en-v1.5\n", " 384\n", " Fast and Default English model\n", " 0.13\n", " \n", " \n", - " 2\n", + " 5\n", " BAAI/bge-small-zh-v1.5\n", " 512\n", " Fast and recommended Chinese model\n", " 0.10\n", " \n", " \n", - " 3\n", - " BAAI/bge-base-en\n", - " 768\n", - " Base English model\n", - " 0.50\n", - " \n", - " \n", - " 4\n", - " BAAI/bge-base-en-v1.5\n", - " 768\n", - " Base English model, v1.5\n", - " 0.44\n", - " \n", - " \n", - " 5\n", - " sentence-transformers/all-MiniLM-L6-v2\n", - " 384\n", - " Sentence Transformer model, MiniLM-L6-v2\n", - " 0.09\n", - " \n", - " \n", " 6\n", " intfloat/multilingual-e5-large\n", " 1024\n", @@ -106,46 +106,76 @@ " English embedding model supporting 8192 sequence length\n", " 0.13\n", " \n", + " \n", + " 9\n", + " sentence-transformers/all-MiniLM-L6-v2\n", + " 384\n", + " Sentence Transformer model, MiniLM-L6-v2\n", + " 0.09\n", + " \n", + " \n", + " 10\n", + " xenova/multilingual-e5-large\n", + " 1024\n", + " Multilingual model. Recommended for non-English languages\n", + " 2.24\n", + " \n", + " \n", + " 11\n", + " xenova/paraphrase-multilingual-mpnet-base-v2\n", + " 768\n", + " Sentence-transformers model for tasks like clustering or semantic search\n", + " 1.11\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " model dim \\\n", - "0 BAAI/bge-small-en 384 \n", - "1 BAAI/bge-small-en-v1.5 384 \n", - "2 BAAI/bge-small-zh-v1.5 512 \n", - "3 BAAI/bge-base-en 768 \n", - "4 BAAI/bge-base-en-v1.5 768 \n", - "5 sentence-transformers/all-MiniLM-L6-v2 384 \n", - "6 intfloat/multilingual-e5-large 1024 \n", - "7 jinaai/jina-embeddings-v2-base-en 768 \n", - "8 jinaai/jina-embeddings-v2-small-en 512 \n", + " model dim \\\n", + "0 BAAI/bge-base-en 768 \n", + "1 BAAI/bge-base-en-v1.5 768 \n", + "2 BAAI/bge-large-en-v1.5 1024 \n", + "3 BAAI/bge-small-en 384 \n", + "4 BAAI/bge-small-en-v1.5 384 \n", + "5 BAAI/bge-small-zh-v1.5 512 \n", + "6 intfloat/multilingual-e5-large 1024 \n", + "7 jinaai/jina-embeddings-v2-base-en 768 \n", + "8 jinaai/jina-embeddings-v2-small-en 512 \n", + "9 sentence-transformers/all-MiniLM-L6-v2 384 \n", + "10 xenova/multilingual-e5-large 1024 \n", + "11 xenova/paraphrase-multilingual-mpnet-base-v2 768 \n", "\n", - " description \\\n", - "0 Fast English model \n", - "1 Fast and Default English model \n", - "2 Fast and recommended Chinese model \n", - "3 Base English model \n", - "4 Base English model, v1.5 \n", - "5 Sentence Transformer model, MiniLM-L6-v2 \n", - "6 Multilingual model, e5-large. 
Recommend using this model for non-English languages \n", - "7 English embedding model supporting 8192 sequence length \n", - "8 English embedding model supporting 8192 sequence length \n", + " description \\\n", + "0 Base English model \n", + "1 Base English model, v1.5 \n", + "2 Large English model, v1.5 \n", + "3 Fast English model \n", + "4 Fast and Default English model \n", + "5 Fast and recommended Chinese model \n", + "6 Multilingual model, e5-large. Recommend using this model for non-English languages \n", + "7 English embedding model supporting 8192 sequence length \n", + "8 English embedding model supporting 8192 sequence length \n", + "9 Sentence Transformer model, MiniLM-L6-v2 \n", + "10 Multilingual model. Recommended for non-English languages \n", + "11 Sentence-transformers model for tasks like clustering or semantic search \n", "\n", - " size_in_GB \n", - "0 0.20 \n", - "1 0.13 \n", - "2 0.10 \n", - "3 0.50 \n", - "4 0.44 \n", - "5 0.09 \n", - "6 2.24 \n", - "7 0.55 \n", - "8 0.13 " + " size_in_GB \n", + "0 0.50 \n", + "1 0.44 \n", + "2 1.34 \n", + "3 0.20 \n", + "4 0.13 \n", + "5 0.10 \n", + "6 2.24 \n", + "7 0.55 \n", + "8 0.13 \n", + "9 0.09 \n", + "10 2.24 \n", + "11 1.11 " ] }, - "execution_count": 1, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -175,7 +205,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.11.7" }, "orig_nbformat": 4 }, diff --git a/fastembed/embedding.py b/fastembed/embedding.py index 620b5d5e..1330eb27 100644 --- a/fastembed/embedding.py +++ b/fastembed/embedding.py @@ -35,17 +35,18 @@ def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable: yield b -def locate_model_file(model_dir: Path, file_names: List[str]): +def locate_model_file(model_dir: Path, file_names: List[str]) -> Path: """ - Find model path for both TransformerJS style `onnx` subdirectory structure and direct model weights structure used by Optimum and Qdrant + Find model path recursively for both TransformerJS style `onnx` subdirectory structure and direct model weights structure used by Optimum and Qdrant """ if not model_dir.is_dir(): raise ValueError(f"Provided model path '{model_dir}' is not a directory.") - for path in model_dir.rglob("*.onnx"): - for file_name in file_names: - if path.is_file() and path.name == file_name: - return path + for file_name in file_names: + file_paths = [path for path in model_dir.rglob(file_name) if path.is_file()] + + if file_paths: + return file_paths[0] raise ValueError(f"Could not find either of {', '.join(file_names)} in {model_dir}") @@ -114,7 +115,7 @@ def __init__( # Hacky support for multilingual model self.exclude_token_type_ids = False - if model_name == "intfloat/multilingual-e5-large": + if "multilingual" in model_name: self.exclude_token_type_ids = True so = ort.SessionOptions() @@ -212,7 +213,9 @@ def embed(self, texts: Iterable[str], batch_size: int = 256, parallel: int = Non raise NotImplementedError @classmethod - def list_supported_models(cls, exclude: List[str] = []) -> List[Dict[str, Any]]: + def list_supported_models( + cls, exclude: List[str] = ["compressed_url_sources", "hf_sources"] + ) -> List[Dict[str, Any]]: """Lists the supported models. 
Args: diff --git a/fastembed/models.json b/fastembed/models.json index 7aea39ad..e370c74a 100644 --- a/fastembed/models.json +++ b/fastembed/models.json @@ -80,7 +80,7 @@ { "model": "jinaai/jina-embeddings-v2-base-en", "dim": 768, - "description": " English embedding model supporting 8192 sequence length", + "description": "English embedding model supporting 8192 sequence length", "size_in_GB": 0.55, "hf_sources": [ "xenova/jina-embeddings-v2-base-en" @@ -109,5 +109,25 @@ "https://storage.googleapis.com/qdrant-fastembed/fast-all-MiniLM-L6-v2.tar.gz", "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz" ] + }, + { + "model": "xenova/multilingual-e5-large", + "dim": 1024, + "description": "Multilingual model. Recommended for non-English languages", + "size_in_GB": 2.24, + "hf_sources": [ + "xenova/multilingual-e5-large" + ], + "compressed_url_sources": [] + }, + { + "model": "xenova/paraphrase-multilingual-mpnet-base-v2", + "dim": 768, + "description": "Sentence-transformers model for tasks like clustering or semantic search", + "size_in_GB": 1.11, + "hf_sources": [ + "xenova/paraphrase-multilingual-mpnet-base-v2" + ], + "compressed_url_sources": [] } ] \ No newline at end of file diff --git a/tests/test_onnx_embeddings.py b/tests/test_onnx_embeddings.py index 743dd3e5..4e07fee2 100644 --- a/tests/test_onnx_embeddings.py +++ b/tests/test_onnx_embeddings.py @@ -14,6 +14,10 @@ "BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]), "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]), + "xenova/multilingual-e5-large": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]), + "xenova/paraphrase-multilingual-mpnet-base-v2": np.array( + [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299] + ), "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), }
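
For reviewers who want to exercise these changes end to end, the sketch below lists the supported models and embeds two non-English sentences with one of the newly added xenova models. This is a minimal sketch, not part of the patch: it assumes FlagEmbedding is the concrete Embedding subclass in fastembed/embedding.py backed by models.json (that class name does not appear in the hunks above), and that list_supported_models() drops the keys named in exclude, which would explain why the notebook output above shows only model, dim, description and size_in_GB.

# Minimal usage sketch (not part of the patch). Assumes fastembed from this
# branch is installed; FlagEmbedding as the concrete class is an assumption,
# only the methods used below appear in the diff.
from fastembed.embedding import FlagEmbedding

# With the new default exclude=["compressed_url_sources", "hf_sources"],
# the internal source fields are dropped and only user-facing keys remain.
for entry in FlagEmbedding.list_supported_models():
    print(entry["model"], entry["dim"], entry["size_in_GB"])

# Any model whose name contains "multilingual" now sets exclude_token_type_ids,
# so token_type_ids are not passed to the ONNX session for the xenova models.
model = FlagEmbedding(model_name="xenova/paraphrase-multilingual-mpnet-base-v2")

# embed() yields one vector per input text; materialize the generator.
embeddings = list(model.embed(["Hola, mundo", "Bonjour le monde"]))
print(len(embeddings), embeddings[0].shape)  # expected: 2 (768,)

The first few components of these vectors can be checked against the expected values added for the new models in tests/test_onnx_embeddings.py.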