Here you will learn how to use the fastembed package to embed your data into a vector space. The package is designed to be easy to use and fast. It is built on top of the ONNX standard, which allows for fast inference on a variety of hardware backends (called runtimes in ONNX).
The fastembed package is designed to be easy to use. We'll be using the `TextEmbedding` class. It takes a list of strings as input and returns a generator of vectors. If this is the first time you're seeing generators, don't worry: you can convert a generator to a list using `list()`.
> 💡 You can learn more about generators from the [Python Wiki](https://wiki.python.org/moin/Generators)
```python
!pip install -Uqq fastembed  # Install fastembed
```
```python
import numpy as np
from fastembed import TextEmbedding
from typing import List

# Example list of documents
documents: List[str] = [
    "This is built to be faster and lighter than other embedding libraries, e.g. Transformers, Sentence-Transformers, etc.",
    "fastembed is supported by and maintained by Qdrant.",
]

# This will trigger the model download and initialization
embedding_model = TextEmbedding()
print("The model BAAI/bge-small-en-v1.5 is ready to use.")

embeddings_generator = embedding_model.embed(documents)  # reminder: this is a generator
embeddings_list = list(embeddings_generator)  # materialize the generator into a list of numpy arrays
len(embeddings_list[0])  # Vector of 384 dimensions
```
> 💡 Why do we use generators?
>
> We use them mostly to save memory. Instead of loading all the vectors into memory at once, we can load them one by one. This is useful when you have a large dataset and you don't want to hold all the vectors in memory at the same time.
```python
embeddings_generator = embedding_model.embed(documents)  # reminder: this is a generator

for doc, vector in zip(documents, embeddings_generator):
    print("Document:", doc)
    print(f"Vector of type: {type(vector)} with shape: {vector.shape}")
```

```python
embeddings_list = np.array(
    list(embedding_model.embed(documents))
)  # you can also convert the generator to a list, and that to a numpy array
embeddings_list.shape
```
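As a quick sanity check, you can compare the two document vectors directly. The snippet below is a minimal sketch using plain numpy; the `cosine_similarity` helper is our own illustration, not part of fastembed.

```python
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Cosine similarity: dot product divided by the product of the norms.
    # Illustrative helper, not a fastembed API.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity(embeddings_list[0], embeddings_list[1]))
```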
We're using BAAI/bge-small-en-v1.5, a state-of-the-art Flag Embedding model. The model does better than OpenAI's text-embedding-ada-002. We've made it even faster by converting it to the ONNX format and quantizing the model for you.
The default model is built for speed and efficiency. If you need a more accurate model, you can use the `TextEmbedding` class to load any model from our list of available models. You can find the list of available models using `TextEmbedding.list_supported_models()`.
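For example, you can print the name and embedding dimension of each supported model. This is a minimal sketch; the dictionary keys used here (`model` and `dim`) follow the fastembed documentation at the time of writing and may differ between versions.

```python
from fastembed import TextEmbedding

# Each entry describes one supported model, including its
# name and the dimensionality of the vectors it produces.
for model_info in TextEmbedding.list_supported_models():
    print(model_info["model"], "->", model_info["dim"])
```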
```python
multilingual_large_model = TextEmbedding("intfloat/multilingual-e5-large")  # This can take a few minutes to download
```

```python
np.array(
    list(multilingual_large_model.embed(["Hello, world!", "你好世界", "¡Hola Mundo!", "नमस्ते!"]))
).shape  # Vector of 1024 dimensions
```
Next: Check out how to use FastEmbed with Qdrant for similarity search: FastEmbed with Qdrant
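As a preview, here is a minimal sketch of what that looks like, assuming qdrant-client is installed with its fastembed integration (`pip install "qdrant-client[fastembed]"`); the collection name and query text are illustrative:

```python
from qdrant_client import QdrantClient

# In-memory instance for quick experiments; point at a Qdrant server in production.
client = QdrantClient(":memory:")

# `add` embeds the documents with fastembed and upserts them into the collection.
client.add(collection_name="demo", documents=documents)

# `query` embeds the query text and returns the closest documents with scores.
for hit in client.query(collection_name="demo", query_text="Who maintains fastembed?"):
    print(hit.score, hit.document)
```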
+{"use strict";/*!
+ * escape-html
+ * Copyright(c) 2012-2013 TJ Holowaychuk
+ * Copyright(c) 2015 Andreas Lubbe
+ * Copyright(c) 2015 Tiancheng "Timothy" Gu
+ * MIT Licensed
+ */var Va=/["'&<>]/;qn.exports=za;function za(e){var t=""+e,r=Va.exec(t);if(!r)return t;var o,n="",i=0,a=0;for(i=r.index;i