qdrant · NirantK · Feb 2, 2024 · Jan 30, 2024 · Jan 30, 2024 · Jan 31, 2024
diff --git a/fastembed/embedding.py b/fastembed/embedding.py
@@ -35,17 +35,18 @@ def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
         yield b
 
 
-def locate_model_file(model_dir: Path, file_names: List[str]):
+def locate_model_file(model_dir: Path, file_names: List[str]) -> Path:
     """
-    Find model path for both TransformerJS style `onnx`  subdirectory structure and direct model weights structure used by Optimum and Qdrant
+    Find model path recursively for both TransformerJS-style `onnx` subdirectory structure and direct model weights structure used by Optimum and Qdrant.
     """
     if not model_dir.is_dir():
         raise ValueError(f"Provided model path '{model_dir}' is not a directory.")
 
-    for path in model_dir.rglob("*.onnx"):
-        for file_name in file_names:
-            if path.is_file() and path.name == file_name:
-                return path
+    for file_name in file_names:
+        file_paths = [path for path in model_dir.rglob(file_name) if path.is_file()]
+
+        if file_paths:
+            return file_paths[0]
 
     raise ValueError(f"Could not find either of {', '.join(file_names)} in {model_dir}")
 
@@ -114,7 +115,7 @@ def __init__(
 
         # Hacky support for multilingual model
         self.exclude_token_type_ids = False
-        if model_name == "intfloat/multilingual-e5-large":
+        if model_name in ["intfloat/multilingual-e5-large", "xenova/multilingual-e5-large", "xenova/paraphrase-multilingual-mpnet-base-v2"]:
             self.exclude_token_type_ids = True
 
         so = ort.SessionOptions()

diff --git a/fastembed/models.json b/fastembed/models.json
@@ -80,7 +80,7 @@
     {
         "model": "jinaai/jina-embeddings-v2-base-en",
         "dim": 768,
-        "description": " English embedding model supporting 8192 sequence length",
+        "description": "English embedding model supporting 8192 sequence length",
         "size_in_GB": 0.55,
         "hf_sources": [
             "xenova/jina-embeddings-v2-base-en"
@@ -109,5 +109,25 @@
             "https://storage.googleapis.com/qdrant-fastembed/fast-all-MiniLM-L6-v2.tar.gz",
             "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz"
         ]
+    },
+    {
+        "model": "xenova/multilingual-e5-large",
+        "dim": 1024,
+        "description": "Multilingual model. Recommended for non-English languages",
+        "size_in_GB": 2.24,
+        "hf_sources": [
+            "xenova/multilingual-e5-large"
+        ],
+        "compressed_url_sources": []
+    },
+    {
+        "model": "xenova/paraphrase-multilingual-mpnet-base-v2",
+        "dim": 768,
+        "description": "Sentence-transformers model for tasks like clustering or semantic search",
+        "size_in_GB": 1.11,
+        "hf_sources": [
+            "xenova/paraphrase-multilingual-mpnet-base-v2"
+        ],
+        "compressed_url_sources": []
     }
 ]
diff --git a/tests/test_onnx_embeddings.py b/tests/test_onnx_embeddings.py
@@ -14,6 +14,10 @@
     "BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]),
     "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
     "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
+    "xenova/multilingual-e5-large": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]),
+    "xenova/paraphrase-multilingual-mpnet-base-v2": np.array(
+        [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299]
+    ),
     "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
     "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
 }