From 7883fa3c414b2f929677f6d1f37ebd8b664dfc10 Mon Sep 17 00:00:00 2001 From: generall Date: Fri, 2 Feb 2024 15:32:42 +0100 Subject: [PATCH] new multilingual models --- fastembed/common/model_management.py | 11 ++++++----- fastembed/text/e5_onnx_embedding.py | 24 +++++++++++++++++++++--- tests/test_text_onnx_embeddings.py | 4 ++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/fastembed/common/model_management.py b/fastembed/common/model_management.py index e614a44f..732d5e70 100644 --- a/fastembed/common/model_management.py +++ b/fastembed/common/model_management.py @@ -11,7 +11,7 @@ from loguru import logger -def locate_model_file(model_dir: Path, file_names: List[str]): +def locate_model_file(model_dir: Path, file_names: List[str]) -> Path: """ Find model path for both TransformerJS style `onnx` subdirectory structure and direct model weights structure used by Optimum and Qdrant @@ -19,10 +19,11 @@ def locate_model_file(model_dir: Path, file_names: List[str]): if not model_dir.is_dir(): raise ValueError(f"Provided model path '{model_dir}' is not a directory.") - for path in model_dir.rglob("*.onnx"): - for file_name in file_names: - if path.is_file() and path.name == file_name: - return path + for file_name in file_names: + file_paths = [path for path in model_dir.rglob(file_name) if path.is_file()] + + if file_paths: + return file_paths[0] raise ValueError(f"Could not find either of {', '.join(file_names)} in {model_dir}") diff --git a/fastembed/text/e5_onnx_embedding.py b/fastembed/text/e5_onnx_embedding.py index 0d4b8809..97d780bf 100644 --- a/fastembed/text/e5_onnx_embedding.py +++ b/fastembed/text/e5_onnx_embedding.py @@ -14,6 +14,24 @@ "url": "https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz", "hf": "qdrant/multilingual-e5-large-onnx", }, + }, + { + "model": "xenova/multilingual-e5-large-quantized", + "dim": 1024, + "description": "Multilingual model. Recommended for non-English languages", + "size_in_GB": 2.24, + "sources": { + "hf": "xenova/multilingual-e5-large", + } + }, + { + "model": "xenova/paraphrase-multilingual-mpnet-base-v2", + "dim": 768, + "description": "Sentence-transformers model for tasks like clustering or semantic search", + "size_in_GB": 1.11, + "sources": { + "hf": "xenova/paraphrase-multilingual-mpnet-base-v2", + } } ] @@ -42,8 +60,8 @@ def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str, class E5OnnxEmbeddingWorker(OnnxTextEmbeddingWorker): def init_embedding( - self, - model_name: str, - cache_dir: str, + self, + model_name: str, + cache_dir: str, ) -> E5OnnxEmbedding: return E5OnnxEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1) diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index 4dc74f62..575b13d6 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -15,6 +15,10 @@ "BAAI/bge-large-en-v1.5-quantized": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]), "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]), + "xenova/multilingual-e5-large-quantized": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]), + "xenova/paraphrase-multilingual-mpnet-base-v2": np.array( + [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299] + ), "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), }