Skip to content

Commit

Permalink
new multilingual models
Browse files Browse the repository at this point in the history
  • Loading branch information
generall committed Feb 2, 2024
1 parent 5dbd007 commit 7883fa3
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
11 changes: 6 additions & 5 deletions fastembed/common/model_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,19 @@
from loguru import logger


def locate_model_file(model_dir: Path, file_names: List[str]) -> Path:
    """
    Find model path for both TransformerJS style `onnx` subdirectory structure and direct model weights structure used
    by Optimum and Qdrant

    Candidate names are tried in the order given by `file_names`, so earlier
    entries take priority. For each name, `model_dir` is searched recursively
    and the first regular file with that name is returned.

    Args:
        model_dir: Directory to search (recursively).
        file_names: Candidate file names, in priority order.

    Returns:
        Path to the first matching regular file.

    Raises:
        ValueError: If `model_dir` is not a directory, or if none of the
            candidate names is found under it.
    """
    if not model_dir.is_dir():
        raise ValueError(f"Provided model path '{model_dir}' is not a directory.")

    for file_name in file_names:
        # Stop at the first hit instead of collecting every recursive match;
        # `is_file()` filters out directories that happen to share the name.
        match = next((path for path in model_dir.rglob(file_name) if path.is_file()), None)
        if match is not None:
            return match

    raise ValueError(f"Could not find either of {', '.join(file_names)} in {model_dir}")

Expand Down
24 changes: 21 additions & 3 deletions fastembed/text/e5_onnx_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,24 @@
"url": "https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz",
"hf": "qdrant/multilingual-e5-large-onnx",
},
},
{
"model": "xenova/multilingual-e5-large-quantized",
"dim": 1024,
"description": "Multilingual model. Recommended for non-English languages",
"size_in_GB": 2.24,
"sources": {
"hf": "xenova/multilingual-e5-large",
}
},
{
"model": "xenova/paraphrase-multilingual-mpnet-base-v2",
"dim": 768,
"description": "Sentence-transformers model for tasks like clustering or semantic search",
"size_in_GB": 1.11,
"sources": {
"hf": "xenova/paraphrase-multilingual-mpnet-base-v2",
}
}
]

Expand Down Expand Up @@ -42,8 +60,8 @@ def _preprocess_onnx_input(self, onnx_input: Dict[str, np.ndarray]) -> Dict[str,

class E5OnnxEmbeddingWorker(OnnxTextEmbeddingWorker):
    """Worker variant whose `init_embedding` builds an `E5OnnxEmbedding`."""

    def init_embedding(
        self,
        model_name: str,
        cache_dir: str,
    ) -> E5OnnxEmbedding:
        """
        Construct the embedding model used by this worker.

        Args:
            model_name: Model identifier forwarded to `E5OnnxEmbedding`.
            cache_dir: Cache directory forwarded to `E5OnnxEmbedding`.

        Returns:
            A new `E5OnnxEmbedding` created with `threads=1`.
        """
        # NOTE(review): threads is pinned to 1 here; presumably because each
        # worker is expected to run in its own process - confirm against the
        # base worker implementation.
        return E5OnnxEmbedding(model_name=model_name, cache_dir=cache_dir, threads=1)
4 changes: 4 additions & 0 deletions tests/test_text_onnx_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
"BAAI/bge-large-en-v1.5-quantized": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]),
"sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
"intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
"xenova/multilingual-e5-large-quantized": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]),
"xenova/paraphrase-multilingual-mpnet-base-v2": np.array(
[-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299]
),
"jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
"jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
}
Expand Down

0 comments on commit 7883fa3

Please sign in to comment.