diff --git a/docs/examples/FastEmbed_Multi_GPU.ipynb b/docs/examples/FastEmbed_Multi_GPU.ipynb index 563ae698..77c1d16a 100644 --- a/docs/examples/FastEmbed_Multi_GPU.ipynb +++ b/docs/examples/FastEmbed_Multi_GPU.ipynb @@ -14,9 +14,9 @@ "source": [ "#### Prerequisites\n", "To get started, ensure you have the following installed:\n", - "- Python 3.8 or later\n", + "- Python 3.9 or later\n", "- Fastembed (`pip install fastembed-gpu`)\n", - "- Refer to [this](https://github.com/qdrant/fastembed/blob/main/docs/examples/FastEmbed_GPU.ipynb) tutorial if you have issues in GPU dependencies\n", + "- Refer to [this](https://github.com/qdrant/fastembed/blob/main/docs/examples/FastEmbed_GPU.ipynb) tutorial if you have issues with GPU dependencies\n", "- Access to a multi-GPU server" ] }, @@ -47,12 +47,11 @@ " model_name=\"sentence-transformers/all-MiniLM-L6-v2\",\n", " cuda=True,\n", " device_ids=device_ids,\n", - " cache_dir=\"models\",\n", " lazy_load=True,\n", " )\n", "\n", " # generate embeddings\n", - " text_embeddings = list(text_model.embed(docs, batch_size=1, parallel=len(device_ids)))\n", + " text_embeddings = list(text_model.embed(docs, batch_size=2, parallel=len(device_ids)))\n", " print(text_embeddings)" ] }, @@ -62,51 +61,14 @@ "source": [ "In this snippet:\n", "- `cuda=True` enables GPU acceleration.\n", - "- `device_ids=[0, 1]` specifies GPUs to use. Replace `[0, 1]` with your available GPU IDs.\n", + "- `device_ids=[0, 1]` specifies GPUs to use. Replace `[0, 1]` with available GPU IDs.\n", "- `lazy_load=True`\n", "\n", - "**NOTE**: When using multi-GPU settings, it is recommended to enable lazy_load. Without lazy_load, the model is initially loaded into the memory of the first GPU by the main process. Subsequently, child processes are spawned for each GPU specified in device_ids, causing the model to be loaded a second time on the first GPU. This results in redundant memory usage and potential inefficiencies." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multi-GPU using cuda argument with ImageEmbedding" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from io import BytesIO\n", - "\n", - "import requests\n", - "from PIL import Image\n", - "from fastembed import ImageEmbedding\n", - "\n", - "\n", - "# load sample image\n", - "images = [Image.open(BytesIO(requests.get(\"https://qdrant.tech/img/logo.png\").content))] * 10\n", - "\n", - "# define gpu ids\n", - "device_ids = [0, 1]\n", + "**NOTE**: When using multi-GPU settings, it is important to configure `parallel` and `lazy_load` properly to avoid inefficiencies:\n", "\n", - "if __name__ == \"__main__\":\n", - " # initialize ImageEmbedding model\n", - " image_model = ImageEmbedding(\n", - " model_name=\"Qdrant/clip-ViT-B-32-vision\",\n", - " cuda=True,\n", - " device_ids=device_ids,\n", - " cache_dir=\"models\",\n", - " lazy_load=True,\n", - " )\n", + "`parallel`: This parameter enables multi-GPU support by spawning child processes for each GPU specified in `device_ids`. To ensure proper utilization, the value of `parallel` must match the number of GPUs in `device_ids`. If using a single GPU, this parameter is not necessary.\n", "\n", - " # generate image embeddings\n", - " image_embeddings = list(image_model.embed(images, batch_size=1, parallel=len(device_ids)))\n", - " print(image_embeddings)" + "`lazy_load`: Enabling `lazy_load` prevents redundant memory usage. Without `lazy_load`, the model is initially loaded into the memory of the first GPU by the main process. When child processes are spawned for each GPU, the model is reloaded on the first GPU, causing redundant memory consumption and inefficiencies." ] } ],