Skip to content

Commit

Permalink
feat: Support xenova/multilingual-e5-large, xenova/paraphrase-multilingual-mpnet-base-v2
Browse files Browse the repository at this point in the history
  • Loading branch information
Anush008 committed Jan 30, 2024
1 parent 2e3e550 commit 4265993
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 7 deletions.
13 changes: 7 additions & 6 deletions fastembed/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,18 @@ def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
yield b


def locate_model_file(model_dir: Path, file_names: List[str]):
def locate_model_file(model_dir: Path, file_names: List[str]) -> Path:
"""
Find model path for both TransformerJS style `onnx` subdirectory structure and direct model weights structure used by Optimum and Qdrant
Find model path recursively for both TransformerJS style `onnx` subdirectory structure and direct model weights structure used by Optimum and Qdrant
"""
if not model_dir.is_dir():
raise ValueError(f"Provided model path '{model_dir}' is not a directory.")

for path in model_dir.rglob("*.onnx"):
for file_name in file_names:
if path.is_file() and path.name == file_name:
return path
for file_name in file_names:
file_paths = [path for path in model_dir.rglob(file_name) if path.is_file()]

if file_paths:
return file_paths[0]

raise ValueError(f"Could not find either of {', '.join(file_names)} in {model_dir}")

Expand Down
22 changes: 21 additions & 1 deletion fastembed/models.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
{
"model": "jinaai/jina-embeddings-v2-base-en",
"dim": 768,
"description": " English embedding model supporting 8192 sequence length",
"description": "English embedding model supporting 8192 sequence length",
"size_in_GB": 0.55,
"hf_sources": [
"xenova/jina-embeddings-v2-base-en"
Expand Down Expand Up @@ -109,5 +109,25 @@
"https://storage.googleapis.com/qdrant-fastembed/fast-all-MiniLM-L6-v2.tar.gz",
"https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz"
]
},
{
"model": "xenova/multilingual-e5-large",
"dim": 1024,
"description": "Multilingual model. Recommended for non-English languages",
"size_in_GB": 2.24,
"hf_sources": [
"xenova/multilingual-e5-large"
],
"compressed_url_sources": []
},
{
"model": "xenova/paraphrase-multilingual-mpnet-base-v2",
"dim": 768,
"description": "Sentence-transformers model for tasks like clustering or semantic search",
"size_in_GB": 1.11,
"hf_sources": [
"xenova/paraphrase-multilingual-mpnet-base-v2"
],
"compressed_url_sources": []
}
]
2 changes: 2 additions & 0 deletions tests/test_onnx_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
"BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]),
"sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
"intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
"xenova/multilingual-e5-large": np.array([0.00, 0.00, 0.00, 0.00, 0.00]),
"xenova/paraphrase-multilingual-mpnet-base-v2": np.array([0.00, 0.00, 0.00, 0.00, 0.00]),
"jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
"jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
}
Expand Down

0 comments on commit 4265993

Please sign in to comment.