feat: Support xenova/multilingual-e5-large, xenova/paraphrase-multilingual-mpnet-base-v2 (#103)

* feat: Support xenova/multilingual-e5-large, xenova/paraphrase-multilingual-mpnet-base-v2
* chore: updated exclude_token_type_ids check
* docs: supported models update
qdrant · Feb 2, 2024 · 96f7d83 · 96f7d83
1 parent 2e3e550
commit 96f7d83
Show file tree

Hide file tree

Showing 4 changed files with 122 additions and 65 deletions.
diff --git a/docs/examples/Supported_Models.ipynb b/docs/examples/Supported_Models.ipynb
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -45,47 +45,47 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
+       "      <td>BAAI/bge-base-en</td>\n",
+       "      <td>768</td>\n",
+       "      <td>Base English model</td>\n",
+       "      <td>0.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>BAAI/bge-base-en-v1.5</td>\n",
+       "      <td>768</td>\n",
+       "      <td>Base English model, v1.5</td>\n",
+       "      <td>0.44</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>BAAI/bge-large-en-v1.5</td>\n",
+       "      <td>1024</td>\n",
+       "      <td>Large English model, v1.5</td>\n",
+       "      <td>1.34</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
        "      <td>BAAI/bge-small-en</td>\n",
        "      <td>384</td>\n",
        "      <td>Fast English model</td>\n",
        "      <td>0.20</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1</th>\n",
+       "      <th>4</th>\n",
        "      <td>BAAI/bge-small-en-v1.5</td>\n",
        "      <td>384</td>\n",
        "      <td>Fast and Default English model</td>\n",
        "      <td>0.13</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2</th>\n",
+       "      <th>5</th>\n",
        "      <td>BAAI/bge-small-zh-v1.5</td>\n",
        "      <td>512</td>\n",
        "      <td>Fast and recommended Chinese model</td>\n",
        "      <td>0.10</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>BAAI/bge-base-en</td>\n",
-       "      <td>768</td>\n",
-       "      <td>Base English model</td>\n",
-       "      <td>0.50</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>BAAI/bge-base-en-v1.5</td>\n",
-       "      <td>768</td>\n",
-       "      <td>Base English model, v1.5</td>\n",
-       "      <td>0.44</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>sentence-transformers/all-MiniLM-L6-v2</td>\n",
-       "      <td>384</td>\n",
-       "      <td>Sentence Transformer model, MiniLM-L6-v2</td>\n",
-       "      <td>0.09</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
        "      <th>6</th>\n",
        "      <td>intfloat/multilingual-e5-large</td>\n",
        "      <td>1024</td>\n",
@@ -106,46 +106,76 @@
        "      <td>English embedding model supporting 8192 sequence length</td>\n",
        "      <td>0.13</td>\n",
        "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>sentence-transformers/all-MiniLM-L6-v2</td>\n",
+       "      <td>384</td>\n",
+       "      <td>Sentence Transformer model, MiniLM-L6-v2</td>\n",
+       "      <td>0.09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>xenova/multilingual-e5-large</td>\n",
+       "      <td>1024</td>\n",
+       "      <td>Multilingual model. Recommended for non-English languages</td>\n",
+       "      <td>2.24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>xenova/paraphrase-multilingual-mpnet-base-v2</td>\n",
+       "      <td>768</td>\n",
+       "      <td>Sentence-transformers model for tasks like clustering or semantic search</td>\n",
+       "      <td>1.11</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                    model   dim  \\\n",
-       "0                       BAAI/bge-small-en   384   \n",
-       "1                  BAAI/bge-small-en-v1.5   384   \n",
-       "2                  BAAI/bge-small-zh-v1.5   512   \n",
-       "3                        BAAI/bge-base-en   768   \n",
-       "4                   BAAI/bge-base-en-v1.5   768   \n",
-       "5  sentence-transformers/all-MiniLM-L6-v2   384   \n",
-       "6          intfloat/multilingual-e5-large  1024   \n",
-       "7       jinaai/jina-embeddings-v2-base-en   768   \n",
-       "8      jinaai/jina-embeddings-v2-small-en   512   \n",
+       "                                           model   dim  \\\n",
+       "0                               BAAI/bge-base-en   768   \n",
+       "1                          BAAI/bge-base-en-v1.5   768   \n",
+       "2                         BAAI/bge-large-en-v1.5  1024   \n",
+       "3                              BAAI/bge-small-en   384   \n",
+       "4                         BAAI/bge-small-en-v1.5   384   \n",
+       "5                         BAAI/bge-small-zh-v1.5   512   \n",
+       "6                 intfloat/multilingual-e5-large  1024   \n",
+       "7              jinaai/jina-embeddings-v2-base-en   768   \n",
+       "8             jinaai/jina-embeddings-v2-small-en   512   \n",
+       "9         sentence-transformers/all-MiniLM-L6-v2   384   \n",
+       "10                  xenova/multilingual-e5-large  1024   \n",
+       "11  xenova/paraphrase-multilingual-mpnet-base-v2   768   \n",
        "\n",
-       "                                                                          description  \\\n",
-       "0                                                                  Fast English model   \n",
-       "1                                                      Fast and Default English model   \n",
-       "2                                                  Fast and recommended Chinese model   \n",
-       "3                                                                  Base English model   \n",
-       "4                                                            Base English model, v1.5   \n",
-       "5                                            Sentence Transformer model, MiniLM-L6-v2   \n",
-       "6  Multilingual model, e5-large. Recommend using this model for non-English languages   \n",
-       "7                             English embedding model supporting 8192 sequence length   \n",
-       "8                             English embedding model supporting 8192 sequence length   \n",
+       "                                                                           description  \\\n",
+       "0                                                                   Base English model   \n",
+       "1                                                             Base English model, v1.5   \n",
+       "2                                                            Large English model, v1.5   \n",
+       "3                                                                   Fast English model   \n",
+       "4                                                       Fast and Default English model   \n",
+       "5                                                   Fast and recommended Chinese model   \n",
+       "6   Multilingual model, e5-large. Recommend using this model for non-English languages   \n",
+       "7                              English embedding model supporting 8192 sequence length   \n",
+       "8                              English embedding model supporting 8192 sequence length   \n",
+       "9                                             Sentence Transformer model, MiniLM-L6-v2   \n",
+       "10                           Multilingual model. Recommended for non-English languages   \n",
+       "11            Sentence-transformers model for tasks like clustering or semantic search   \n",
        "\n",
-       "   size_in_GB  \n",
-       "0        0.20  \n",
-       "1        0.13  \n",
-       "2        0.10  \n",
-       "3        0.50  \n",
-       "4        0.44  \n",
-       "5        0.09  \n",
-       "6        2.24  \n",
-       "7        0.55  \n",
-       "8        0.13  "
+       "    size_in_GB  \n",
+       "0         0.50  \n",
+       "1         0.44  \n",
+       "2         1.34  \n",
+       "3         0.20  \n",
+       "4         0.13  \n",
+       "5         0.10  \n",
+       "6         2.24  \n",
+       "7         0.55  \n",
+       "8         0.13  \n",
+       "9         0.09  \n",
+       "10        2.24  \n",
+       "11        1.11  "
       ]
      },
-     "execution_count": 1,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -175,7 +205,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.17"
+   "version": "3.11.7"
   },
   "orig_nbformat": 4
  },

diff --git a/fastembed/embedding.py b/fastembed/embedding.py
@@ -35,17 +35,18 @@ def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
         yield b
 
 
-def locate_model_file(model_dir: Path, file_names: List[str]):
+def locate_model_file(model_dir: Path, file_names: List[str]) -> Path:
     """
-    Find model path for both TransformerJS style `onnx`  subdirectory structure and direct model weights structure used by Optimum and Qdrant
+    Find model path recursively for both TransformerJS style `onnx`  subdirectory structure and direct model weights structure used by Optimum and Qdrant
     """
     if not model_dir.is_dir():
         raise ValueError(f"Provided model path '{model_dir}' is not a directory.")
 
-    for path in model_dir.rglob("*.onnx"):
-        for file_name in file_names:
-            if path.is_file() and path.name == file_name:
-                return path
+    for file_name in file_names:
+        file_paths = [path for path in model_dir.rglob(file_name) if path.is_file()]
+
+        if file_paths:
+            return file_paths[0]
 
     raise ValueError(f"Could not find either of {', '.join(file_names)} in {model_dir}")
 
@@ -114,7 +115,7 @@ def __init__(
 
         # Hacky support for multilingual model
         self.exclude_token_type_ids = False
-        if model_name == "intfloat/multilingual-e5-large":
+        if "multilingual" in model_name:
             self.exclude_token_type_ids = True
 
         so = ort.SessionOptions()
@@ -212,7 +213,9 @@ def embed(self, texts: Iterable[str], batch_size: int = 256, parallel: int = Non
         raise NotImplementedError
 
     @classmethod
-    def list_supported_models(cls, exclude: List[str] = []) -> List[Dict[str, Any]]:
+    def list_supported_models(
+        cls, exclude: List[str] = ["compressed_url_sources", "hf_sources"]
+    ) -> List[Dict[str, Any]]:
         """Lists the supported models.
 
         Args:

diff --git a/fastembed/models.json b/fastembed/models.json
@@ -80,7 +80,7 @@
     {
         "model": "jinaai/jina-embeddings-v2-base-en",
         "dim": 768,
-        "description": " English embedding model supporting 8192 sequence length",
+        "description": "English embedding model supporting 8192 sequence length",
         "size_in_GB": 0.55,
         "hf_sources": [
             "xenova/jina-embeddings-v2-base-en"
@@ -109,5 +109,25 @@
             "https://storage.googleapis.com/qdrant-fastembed/fast-all-MiniLM-L6-v2.tar.gz",
             "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz"
         ]
+    },
+    {
+        "model": "xenova/multilingual-e5-large",
+        "dim": 1024,
+        "description": "Multilingual model. Recommended for non-English languages",
+        "size_in_GB": 2.24,
+        "hf_sources": [
+            "xenova/multilingual-e5-large"
+        ],
+        "compressed_url_sources": []
+    },
+    {
+        "model": "xenova/paraphrase-multilingual-mpnet-base-v2",
+        "dim": 768,
+        "description": "Sentence-transformers model for tasks like clustering or semantic search",
+        "size_in_GB": 1.11,
+        "hf_sources": [
+            "xenova/paraphrase-multilingual-mpnet-base-v2"
+        ],
+        "compressed_url_sources": []
     }
 ]
diff --git a/tests/test_onnx_embeddings.py b/tests/test_onnx_embeddings.py
@@ -14,6 +14,10 @@
     "BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]),
     "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
     "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
+    "xenova/multilingual-e5-large": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]),
+    "xenova/paraphrase-multilingual-mpnet-base-v2": np.array(
+        [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299]
+    ),
     "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
     "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
 }