From e8a7677d18a4ae645ab24be56ad904931a908df8 Mon Sep 17 00:00:00 2001 From: Anush008 Date: Wed, 21 Feb 2024 17:29:50 +0530 Subject: [PATCH] feat: Added gte-large, nomic-text 1.5, cleanup --- docs/examples/Supported_Models.ipynb | 94 +++++++++++------- fastembed/models.json | 143 --------------------------- fastembed/text/onnx_embedding.py | 18 ++++ tests/test_onnx_embeddings.py | 78 --------------- tests/test_text_onnx_embeddings.py | 6 +- 5 files changed, 82 insertions(+), 257 deletions(-) delete mode 100644 fastembed/models.json delete mode 100644 tests/test_onnx_embeddings.py diff --git a/docs/examples/Supported_Models.ipynb b/docs/examples/Supported_Models.ipynb index 8c100d12..2b21800d 100644 --- a/docs/examples/Supported_Models.ipynb +++ b/docs/examples/Supported_Models.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -110,22 +110,30 @@ " \n", " \n", " 8\n", - " sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\n", - " 384\n", - " Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2\n", - " 0.46\n", - " {'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'}\n", + " nomic-ai/nomic-embed-text-v1\n", + " 768\n", + " 8192 context length english model\n", + " 0.54\n", + " {'hf': 'nomic-ai/nomic-embed-text-v1'}\n", " \n", " \n", " 9\n", - " nomic-ai/nomic-embed-text-v1\n", + " nomic-ai/nomic-embed-text-v1.5\n", " 768\n", " 8192 context length english model\n", " 0.54\n", - " {'hf': 'nomic-ai/nomic-embed-text-v1'}\n", + " {'hf': 'nomic-ai/nomic-embed-text-v1.5'}\n", " \n", " \n", " 10\n", + " thenlper/gte-large\n", + " 1024\n", + " Large general text embeddings model\n", + " 1.34\n", + " {'hf': 'qdrant/gte-large-onnx'}\n", + " \n", + " \n", + " 11\n", " intfloat/multilingual-e5-large\n", " 1024\n", " Multilingual model, e5-large. Recommend using this model for non-English languages\n", @@ -133,7 +141,7 @@ " {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'}\n", " \n", " \n", - " 11\n", + " 12\n", " sentence-transformers/paraphrase-multilingual-mpnet-base-v2\n", " 768\n", " Sentence-transformers model for tasks like clustering or semantic search\n", @@ -141,7 +149,15 @@ " {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'}\n", " \n", " \n", - " 12\n", + " 13\n", + " sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\n", + " 384\n", + " Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2\n", + " 0.46\n", + " {'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'}\n", + " \n", + " \n", + " 14\n", " jinaai/jina-embeddings-v2-base-en\n", " 768\n", " English embedding model supporting 8192 sequence length\n", @@ -149,7 +165,7 @@ " {'hf': 'xenova/jina-embeddings-v2-base-en'}\n", " \n", " \n", - " 13\n", + " 15\n", " jinaai/jina-embeddings-v2-small-en\n", " 512\n", " English embedding model supporting 8192 sequence length\n", @@ -170,12 +186,14 @@ "5 BAAI/bge-small-en-v1.5 384 \n", "6 BAAI/bge-small-zh-v1.5 512 \n", "7 sentence-transformers/all-MiniLM-L6-v2 384 \n", - "8 sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 384 \n", - "9 nomic-ai/nomic-embed-text-v1 768 \n", - "10 intfloat/multilingual-e5-large 1024 \n", - "11 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n", - "12 jinaai/jina-embeddings-v2-base-en 768 \n", - "13 jinaai/jina-embeddings-v2-small-en 512 \n", + "8 nomic-ai/nomic-embed-text-v1 768 \n", + "9 nomic-ai/nomic-embed-text-v1.5 768 \n", + "10 thenlper/gte-large 1024 \n", + "11 intfloat/multilingual-e5-large 1024 \n", + "12 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n", + "13 sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 384 \n", + "14 jinaai/jina-embeddings-v2-base-en 768 \n", + "15 jinaai/jina-embeddings-v2-small-en 512 \n", "\n", " description \\\n", "0 Base English model \n", @@ -186,12 +204,14 @@ "5 Fast and Default English model \n", "6 Fast and recommended Chinese model \n", "7 Sentence Transformer model, MiniLM-L6-v2 \n", - "8 Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2 \n", + "8 8192 context length english model \n", "9 8192 context length english model \n", - "10 Multilingual model, e5-large. Recommend using this model for non-English languages \n", - "11 Sentence-transformers model for tasks like clustering or semantic search \n", - "12 English embedding model supporting 8192 sequence length \n", - "13 English embedding model supporting 8192 sequence length \n", + "10 Large general text embeddings model \n", + "11 Multilingual model, e5-large. Recommend using this model for non-English languages \n", + "12 Sentence-transformers model for tasks like clustering or semantic search \n", + "13 Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2 \n", + "14 English embedding model supporting 8192 sequence length \n", + "15 English embedding model supporting 8192 sequence length \n", "\n", " size_in_GB \\\n", "0 0.50 \n", @@ -202,12 +222,14 @@ "5 0.13 \n", "6 0.10 \n", "7 0.09 \n", - "8 0.46 \n", + "8 0.54 \n", "9 0.54 \n", - "10 2.24 \n", - "11 1.11 \n", - "12 0.55 \n", - "13 0.13 \n", + "10 1.34 \n", + "11 2.24 \n", + "12 1.11 \n", + "13 0.46 \n", + "14 0.55 \n", + "15 0.13 \n", "\n", " sources \n", "0 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz'} \n", @@ -218,15 +240,17 @@ "5 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en-v1.5.tar.gz', 'hf': 'qdrant/bge-small-en-v1.5-onnx-q'} \n", "6 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz'} \n", "7 {'url': 'https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz', 'hf': 'qdrant/all-MiniLM-L6-v2-onnx'} \n", - "8 {'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'} \n", - "9 {'hf': 'nomic-ai/nomic-embed-text-v1'} \n", - "10 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n", - "11 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n", - "12 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n", - "13 {'hf': 'xenova/jina-embeddings-v2-small-en'} " + "8 {'hf': 'nomic-ai/nomic-embed-text-v1'} \n", + "9 {'hf': 'nomic-ai/nomic-embed-text-v1.5'} \n", + "10 {'hf': 'qdrant/gte-large-onnx'} \n", + "11 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n", + "12 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n", + "13 {'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'} \n", + "14 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n", + "15 {'hf': 'xenova/jina-embeddings-v2-small-en'} " ] }, - "execution_count": 2, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -256,7 +280,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.13" }, "orig_nbformat": 4 }, diff --git a/fastembed/models.json b/fastembed/models.json deleted file mode 100644 index f10bd3af..00000000 --- a/fastembed/models.json +++ /dev/null @@ -1,143 +0,0 @@ -[ - { - "model": "BAAI/bge-base-en", - "dim": 768, - "description": "Base English model", - "size_in_GB": 0.5, - "hf_sources": [], - "compressed_url_sources": [ - "https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz" - ] - }, - { - "model": "BAAI/bge-base-en-v1.5", - "dim": 768, - "description": "Base English model, v1.5", - "size_in_GB": 0.44, - "hf_sources": [ - "qdrant/bge-base-en-v1.5-onnx-q" - ], - "compressed_url_sources": [ - "https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz" - ] - }, - { - "model": "BAAI/bge-large-en-v1.5", - "dim": 1024, - "description": "Large English model, v1.5", - "size_in_GB": 1.34, - "hf_sources": [ - "qdrant/bge-large-en-v1.5-onnx", - "qdrant/bge-large-en-v1.5-onnx-q" - ], - "compressed_url_sources": [] - }, - { - "model": "BAAI/bge-small-en", - "dim": 384, - "description": "Fast English model", - "size_in_GB": 0.2, - "hf_sources": [], - "compressed_url_sources": [ - "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en.tar.gz", - "https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz" - ] - }, - { - "model": "BAAI/bge-small-en-v1.5", - "dim": 384, - "description": "Fast and Default English model", - "size_in_GB": 0.13, - "hf_sources": [ - "qdrant/bge-small-en-v1.5-onnx-q" - ], - "compressed_url_sources": [ - "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en-v1.5.tar.gz" - ] - }, - { - "model": "BAAI/bge-small-zh-v1.5", - "dim": 512, - "description": "Fast and recommended Chinese model", - "size_in_GB": 0.1, - "hf_sources": [], - "compressed_url_sources": [ - "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz" - ] - }, - { - "model": "intfloat/multilingual-e5-large", - "dim": 1024, - "description": "Multilingual model, e5-large. Recommend using this model for non-English languages", - "size_in_GB": 2.24, - "hf_sources": [ - "qdrant/multilingual-e5-large-onnx" - ], - "compressed_url_sources": [ - "https://storage.googleapis.com/qdrant-fastembed/intfloat-multilingual-e5-large.tar.gz" - ] - }, - { - "model": "jinaai/jina-embeddings-v2-base-en", - "dim": 768, - "description": "English embedding model supporting 8192 sequence length", - "size_in_GB": 0.55, - "hf_sources": [ - "xenova/jina-embeddings-v2-base-en" - ], - "compressed_url_sources": [] - }, - { - "model": "jinaai/jina-embeddings-v2-small-en", - "dim": 512, - "description": " English embedding model supporting 8192 sequence length", - "size_in_GB": 0.13, - "hf_sources": [ - "xenova/jina-embeddings-v2-small-en" - ], - "compressed_url_sources": [] - }, - { - "model": "sentence-transformers/all-MiniLM-L6-v2", - "dim": 384, - "description": "Sentence Transformer model, MiniLM-L6-v2", - "size_in_GB": 0.09, - "hf_sources": [ - "qdrant/all-MiniLM-L6-v2-onnx" - ], - "compressed_url_sources": [ - "https://storage.googleapis.com/qdrant-fastembed/fast-all-MiniLM-L6-v2.tar.gz", - "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz" - ] - }, - { - "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - "dim": 384, - "description": "Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2", - "size_in_GB": 0.46, - "hf_sources": [ - "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q" - ], - "compressed_url_sources": [] - }, - { - "model": "xenova/multilingual-e5-large", - "dim": 1024, - "description": "Multilingual model. Recommended for non-English languages", - "size_in_GB": 2.24, - "hf_sources": [ - "xenova/multilingual-e5-large" - ], - "compressed_url_sources": [] - }, - { - "model": "xenova/paraphrase-multilingual-mpnet-base-v2", - "dim": 768, - "description": "Sentence-transformers model for tasks like clustering or semantic search", - "size_in_GB": 1.11, - "hf_sources": [ - "xenova/paraphrase-multilingual-mpnet-base-v2" - ], - "compressed_url_sources": [] - } -] \ No newline at end of file diff --git a/fastembed/text/onnx_embedding.py b/fastembed/text/onnx_embedding.py index 1d194450..8ff6cd65 100644 --- a/fastembed/text/onnx_embedding.py +++ b/fastembed/text/onnx_embedding.py @@ -116,6 +116,24 @@ "hf": "nomic-ai/nomic-embed-text-v1", }, }, + { + "model": "nomic-ai/nomic-embed-text-v1.5", + "dim": 768, + "description": "8192 context length english model", + "size_in_GB": 0.54, + "sources": { + "hf": "nomic-ai/nomic-embed-text-v1.5", + }, + }, + { + "model": "thenlper/gte-large", + "dim": 1024, + "description": "Large general text embeddings model", + "size_in_GB": 1.34, + "sources": { + "hf": "qdrant/gte-large-onnx", + }, + }, # { # "model": "sentence-transformers/all-MiniLM-L6-v2", # "dim": 384, diff --git a/tests/test_onnx_embeddings.py b/tests/test_onnx_embeddings.py deleted file mode 100644 index ea255af2..00000000 --- a/tests/test_onnx_embeddings.py +++ /dev/null @@ -1,78 +0,0 @@ -import os - -import numpy as np -import pytest - -from fastembed.embedding import DefaultEmbedding, JinaEmbedding - -CANONICAL_VECTOR_VALUES = { - "BAAI/bge-small-en": np.array([-0.0232, -0.0255, 0.0174, -0.0639, -0.0006]), - "BAAI/bge-small-en-v1.5": np.array([0.01522374, -0.02271799, 0.00860278, -0.07424029, 0.00386434]), - "BAAI/bge-small-zh-v1.5": np.array([-0.01023294, 0.07634465, 0.0691722, -0.04458365, -0.03160762]), - "BAAI/bge-base-en": np.array([0.0115, 0.0372, 0.0295, 0.0121, 0.0346]), - "BAAI/bge-base-en-v1.5": np.array([0.01129394, 0.05493144, 0.02615099, 0.00328772, 0.02996045]), - "BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), - "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]), - "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array([0.0094, 0.0184, 0.0328, 0.0072, -0.0351]), - "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]), - "xenova/multilingual-e5-large": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]), - "xenova/paraphrase-multilingual-mpnet-base-v2": np.array( - [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299] - ), - "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), - "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), - "nomic-ai/nomic-embed-text-v1": np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170]), -} - - -@pytest.mark.parametrize("embedding_class", [DefaultEmbedding, JinaEmbedding]) -def test_embedding(embedding_class): - is_ubuntu_ci = os.getenv("IS_UBUNTU_CI") - - for model_desc in embedding_class.list_supported_models(): - if is_ubuntu_ci == "false" and model_desc["size_in_GB"] > 1: - continue - - if model_desc["model"] not in CANONICAL_VECTOR_VALUES: - continue - - dim = model_desc["dim"] - model = embedding_class(model_name=model_desc["model"]) - - docs = ["hello world", "flag embedding"] - embeddings = list(model.embed(docs)) - embeddings = np.stack(embeddings, axis=0) - assert embeddings.shape == (2, dim) - - canonical_vector = CANONICAL_VECTOR_VALUES[model_desc["model"]] - assert np.allclose(embeddings[0, : canonical_vector.shape[0]], canonical_vector, atol=1e-3), model_desc["model"] - - -@pytest.mark.parametrize("n_dims,embedding_class", [(384, DefaultEmbedding), (768, JinaEmbedding)]) -def test_batch_embedding(n_dims, embedding_class): - model = embedding_class() - - docs = ["hello world", "flag embedding"] * 100 - embeddings = list(model.embed(docs, batch_size=10)) - embeddings = np.stack(embeddings, axis=0) - - assert embeddings.shape == (200, n_dims) - - -@pytest.mark.parametrize("n_dims,embedding_class", [(384, DefaultEmbedding), (768, JinaEmbedding)]) -def test_parallel_processing(n_dims, embedding_class): - model = embedding_class() - - docs = ["hello world", "flag embedding"] * 100 - embeddings = list(model.embed(docs, batch_size=10, parallel=2)) - embeddings = np.stack(embeddings, axis=0) - - embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None)) - embeddings_2 = np.stack(embeddings_2, axis=0) - - embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0)) - embeddings_3 = np.stack(embeddings_3, axis=0) - - assert embeddings.shape == (200, n_dims) - assert np.allclose(embeddings, embeddings_2, atol=1e-3) - assert np.allclose(embeddings, embeddings_3, atol=1e-3) diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index 81bcbbd3..9343434e 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -14,7 +14,7 @@ "BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), "BAAI/bge-large-en-v1.5-quantized": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]), "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]), - "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array([0.0094, 0.0184, 0.0328, 0.0072, -0.0351]), + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array([0.0094, 0.0184, 0.0328, 0.0072, -0.0351]), "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]), "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": np.array( [-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299] @@ -22,6 +22,10 @@ "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), "nomic-ai/nomic-embed-text-v1": np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170]), + "nomic-ai/nomic-embed-text-v1.5": np.array( + [-1.6531514e-02, 8.5380634e-05, -1.8171231e-01, -3.9333291e-03, 1.2763254e-02] + ), + "thenlper/gte-large": np.array([-0.01920587, 0.00113156, -0.00708992, -0.00632304, -0.04025577]), }