From 367178b904092707f562a823381a82f3c22a7ad5 Mon Sep 17 00:00:00 2001
From: Dmitrii Rudenko
Date: Fri, 29 Nov 2024 11:44:34 +0100
Subject: [PATCH] Tests draft

---
 experiments/colpali_convert_lang_model.py  |  31 ++
 experiments/colpali_image_test.ipynb       | 381 ++++++++++++++++++
 experiments/colpali_text_test.ipynb        | 444 +++++++++++++++++++++
 experiments/late_interaction_colpali.ipynb | 202 ++++++++++
 fastembed/common/model_management.py       |   2 -
 fastembed/late_interaction/colpali.py      |   6 +-
 6 files changed, 1062 insertions(+), 4 deletions(-)
 create mode 100644 experiments/colpali_convert_lang_model.py
 create mode 100644 experiments/colpali_image_test.ipynb
 create mode 100644 experiments/colpali_text_test.ipynb
 create mode 100644 experiments/late_interaction_colpali.ipynb

diff --git a/experiments/colpali_convert_lang_model.py b/experiments/colpali_convert_lang_model.py
new file mode 100644
index 00000000..691f0cba
--- /dev/null
+++ b/experiments/colpali_convert_lang_model.py
@@ -0,0 +1,31 @@
+import torch
+from colpali_engine.models import ColPali, ColPaliProcessor
+import onnxruntime as ort
+
+model_name = "vidore/colpali-v1.2"
+original_model = ColPali.from_pretrained(model_name).eval()
+processor = ColPaliProcessor.from_pretrained(model_name)
+
+dummy_query = ["Is attention really all you need?"]
+
+# Process the input query
+processed_query = processor.process_queries(dummy_query).to(original_model.device)
+
+# Prepare input tensors
+input_query_tensor = processed_query["input_ids"].type(torch.long)
+attention_mask_tensor = processed_query["attention_mask"].type(torch.long)
+
+# Export the model to ONNX with the required inputs and dynamic shapes
+torch.onnx.export(
+    original_model.model.language_model,
+    (input_query_tensor, attention_mask_tensor),
+    "experiments/colpali_text_encoder_dir/model.onnx",
+    input_names=["input_ids", "attention_mask"],
+    output_names=["logits"],
+    dynamo=True,
+    opset_version=14,
+)
+
+# Smoke-test the export: sessions are invoked via run(), with numpy inputs
+text_session = ort.InferenceSession("experiments/colpali_text_encoder_dir/model.onnx")
+print("Session output", text_session.run(None, {"input_ids": input_query_tensor.cpu().numpy(), "attention_mask": attention_mask_tensor.cpu().numpy()}))
diff --git a/experiments/colpali_image_test.ipynb b/experiments/colpali_image_test.ipynb
new file mode 100644
index 00000000..354bc596
--- /dev/null
+++ b/experiments/colpali_image_test.ipynb
@@ -0,0 +1,381 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "initial_id",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-28T10:02:39.315496Z",
+     "start_time": "2024-11-28T10:02:39.290846Z"
+    },
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from PIL import Image\n",
+    "\n",
+    "images = [\n",
+    "    Image.open(\"/Users/d.rudenko/PycharmProjects/opensource/fastembed/tests/misc/image.jpeg\"),\n",
+    "    Image.open(\n",
+    "        \"/Users/d.rudenko/PycharmProjects/opensource/fastembed/tests/misc/small_image.jpeg\"\n",
+    "    ),\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e46189ce4b8b0677",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-28T09:58:37.254586Z",
+     "start_time": "2024-11-28T09:58:22.754066Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ba9856c5109643049718592a236b2206",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You have passed both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. 
For this call, we will infer how many images each text has and add special tokens.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from colpali_engine.models import ColPaliProcessor\n",
+    "\n",
+    "model_name = \"vidore/colpali-v1.2-merged\"\n",
+    "\n",
+    "processor = ColPaliProcessor.from_pretrained(model_name)\n",
+    "# Process the inputs\n",
+    "batch_images_onnx = processor.process_images(images)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "89c2fbe3d64964fc",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-28T10:03:56.766986Z",
+     "start_time": "2024-11-28T10:02:43.893495Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import onnxruntime as ort\n",
+    "\n",
+    "sess = ort.InferenceSession(\"/Users/d.rudenko/dev/qdrant/colpali-v1.2-merged-onnx/model.onnx\")\n",
+    "image_embeddings_onnx = sess.run(\n",
+    "    [sess.get_outputs()[0].name],\n",
+    "    {\n",
+    "        \"input_ids\": batch_images_onnx[\"input_ids\"].numpy(),\n",
+    "        \"pixel_values\": batch_images_onnx[\"pixel_values\"].numpy(),\n",
+    "        \"attention_mask\": batch_images_onnx[\"attention_mask\"].numpy(),\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "61b43dd6caaa0909",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-28T10:06:23.238770Z",
+     "start_time": "2024-11-28T10:06:23.235457Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1, 2, 1030, 128)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "np.array(image_embeddings_onnx).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "5be8ebb15c6dfaa6",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-28T10:59:48.765049Z",
+     "start_time": "2024-11-28T10:59:48.761122Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[[ 0.015 0.051 0.059 0.026 -0.061 -0.027 -0.014]\n",
+      " [-0.22 -0.111 0.046 0.081 -0.048 -0.052 -0.086]\n",
+      " [-0.184 -0.131 0.004 0.062 -0.038 -0.059 -0.127]\n",
+      " [-0.209 -0.113 0.015 0.059 -0.035 -0.035 -0.072]\n",
+      " [-0.031 -0.044 0.092 -0.005 0.006 -0.057 -0.061]\n",
+      " [-0.18 -0.039 0.031 0.003 0.083 -0.041 0.088]\n",
+      " [-0.091 0.023 0.116 -0.02 0.039 -0.064 -0.026]]\n",
+      "\n",
+      " [[-0.25 -0.112 -0.065 -0.014 0.005 -0.092 0.024]\n",
+      " [-0.22 -0.096 -0.014 0.039 -0.02 -0.12 -0.004]\n",
+      " [-0.228 -0.114 0.031 0.019 0.034 -0.052 -0.031]\n",
+      " [-0.274 -0.186 0.095 -0.019 0.017 0.021 -0.016]\n",
+      " [-0.186 -0.061 -0.01 0.065 -0.058 -0.05 0.019]\n",
+      " [-0.183 -0.11 -0.034 -0.042 0.026 -0.071 0.02 ]\n",
+      " [-0.153 -0.072 -0.015 0.088 -0.081 -0.043 0.04 ]]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(np.round(image_embeddings_onnx[0][:, :7, :7], decimals=3))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "bc9f7ffda971d3ba",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-28T10:59:02.286294Z",
+     "start_time": "2024-11-28T10:59:02.264997Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 0.01533 , 0.05118 , 0.05948 , 0.02583 , -0.06128 , -0.02682 ,\n",
+       " -0.013565, 0.10254 , -0.0983 , 0.1109 , -0.00342 , -0.0344 ,\n",
+       " -0.00887 , -0.1616 , 0.09814 , 0.2257 , 0.03976 , 0.03687 ,\n",
+       " 0.1648 , 0.06866 , 0.0396 , 0.1672 , 0.1455 , -0.1387 ,\n",
+       " 0.1203 , 0.04907 , -0.07965 , -0.0885 , 0.01982 , 0.0404 ,\n",
+       " -0.07513 , -0.02844 , 0.04337 , 0.03857 , -0.1065 , 0.0288 ,\n",
+       " -0.1279 , -0.1126 , 0.03363 , -0.0507 , 0.11584 , 0.0483 ,\n",
+       " 0.035 , -0.08417 , 
-0.0907 , 0.0279 , 0.1394 , -0.10364 ,\n",
+       " -0.1471 , -0.07135 , -0.136 , 0.1289 , 0.082 , 0.02232 ,\n",
+       " -0.00571 , -0.02547 , 0.1053 , 0.0377 , 0.0148 , 0.02795 ,\n",
+       " -0.01859 , -0.11066 , -0.12195 , 0.0583 , 0.0995 , 0.01086 ,\n",
+       " 0.0859 , 0.1302 , -0.10126 , 0.005417, 0.05423 , -0.1808 ,\n",
+       " 0.1444 , 0.1885 , 0.09247 , -0.04718 , 0.1018 , -0.02997 ,\n",
+       " -0.0598 , -0.011284, 0.1203 , -0.1313 , -0.04584 , -0.02725 ,\n",
+       " -0.1277 , -0.04236 , -0.08466 , -0.0861 , 0.1131 , 0.02806 ,\n",
+       " -0.0947 , 0.04388 , 0.04263 , 0.03598 , -0.06866 , -0.06018 ,\n",
+       " -0.02763 , -0.0972 , 0.11505 , -0.1097 , -0.04166 , 0.0742 ,\n",
+       " -0.06683 , -0.02188 , -0.1663 , -0.0902 , 0.02594 , -0.03802 ,\n",
+       " -0.034 , -0.04828 , -0.05765 , 0.0633 , -0.02515 , -0.08826 ,\n",
+       " -0.09753 , -0.10974 , -0.074 , -0.02083 , -0.1301 , 0.1383 ,\n",
+       " 0.1428 , 0.0935 , 0.0949 , 0.03876 , 0.08514 , -0.12256 ,\n",
+       " -0.0451 , -0.002306], dtype=float16)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.array(image_embeddings_onnx)[0][0][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "34a238b20e5fcab2",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-20T15:39:41.314176Z",
+     "start_time": "2024-11-20T15:39:41.308579Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "np.allclose(image_embeddings_onnx[0][0], fastembed_i_embeddings[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5ca3b11eb3813a87",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-20T15:39:42.081408Z",
+     "start_time": "2024-11-20T15:39:42.078582Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 0.01533 , 0.05118 , 0.05948 , 0.02583 , -0.06128 , -0.02682 ,\n",
+       " -0.013565, 0.10254 , -0.0983 , 0.1109 , -0.00342 , -0.0344 ,\n",
+       " -0.00887 , -0.1616 , 0.09814 , 0.2257 , 0.03976 , 0.03687 ,\n",
+       " 0.1648 , 0.06866 , 0.0396 , 0.1672 , 0.1455 , -0.1387 ,\n",
+       " 0.1203 , 0.04907 , -0.07965 , -0.0885 , 0.01982 , 0.0404 ,\n",
+       " -0.07513 , -0.02844 , 0.04337 , 0.03857 , -0.1065 , 0.0288 ,\n",
+       " -0.1279 , -0.1126 , 0.03363 , -0.0507 , 0.11584 , 0.0483 ,\n",
+       " 0.035 , -0.08417 , -0.0907 , 0.0279 , 0.1394 , -0.10364 ,\n",
+       " -0.1471 , -0.07135 , -0.136 , 0.1289 , 0.082 , 0.02232 ,\n",
+       " -0.00571 , -0.02547 , 0.1053 , 0.0377 , 0.0148 , 0.02795 ,\n",
+       " -0.01859 , -0.11066 , -0.12195 , 0.0583 , 0.0995 , 0.01086 ,\n",
+       " 0.0859 , 0.1302 , -0.10126 , 0.005417, 0.05423 , -0.1808 ,\n",
+       " 0.1444 , 0.1885 , 0.09247 , -0.04718 , 0.1018 , -0.02997 ,\n",
+       " -0.0598 , -0.011284, 0.1203 , -0.1313 , -0.04584 , -0.02725 ,\n",
+       " -0.1277 , -0.04236 , -0.08466 , -0.0861 , 0.1131 , 0.02806 ,\n",
+       " -0.0947 , 0.04388 , 0.04263 , 0.03598 , -0.06866 , -0.06018 ,\n",
+       " -0.02763 , -0.0972 , 0.11505 , -0.1097 , -0.04166 , 0.0742 ,\n",
+       " -0.06683 , -0.02188 , -0.1663 , -0.0902 , 0.02594 , -0.03802 ,\n",
+       " -0.034 , -0.04828 , -0.05765 , 0.0633 , -0.02515 , -0.08826 ,\n",
+       " -0.09753 , -0.10974 , -0.074 , -0.02083 , -0.1301 , 0.1383 ,\n",
+       " 0.1428 , 0.0935 , 0.0949 , 0.03876 , 0.08514 , -0.12256 ,\n",
+       " -0.0451 , -0.002306], dtype=float16)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "image_embeddings_onnx[0][0][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
"execution_count": 7, + "id": "2c52a4d7d83aeda7", + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-20T16:05:32.115768Z", + "start_time": "2024-11-20T16:05:32.090218Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[array([[ 0.01532745, 0.05117798, 0.05947876, ..., -0.12255859,\n", + " -0.04510498, -0.00230598],\n", + " [-0.22009277, -0.11071777, 0.04562378, ..., 0.00257111,\n", + " -0.06988525, 0.12384033],\n", + " [-0.18371582, -0.13085938, 0.00393677, ..., -0.02949524,\n", + " -0.05444336, 0.1295166 ],\n", + " ...,\n", + " [-0.1418457 , 0.01023102, 0.1239624 , ..., -0.00460434,\n", + " 0.17321777, 0.09454346],\n", + " [-0.24572754, -0.06878662, 0.11834717, ..., -0.02763367,\n", + " -0.03022766, 0.08917236],\n", + " [-0.2211914 , -0.04171753, 0.19519043, ..., -0.01535797,\n", + " -0.02432251, -0.03561401]], dtype=float32),\n", + " array([[-2.49877930e-01, -1.11511230e-01, -6.51855469e-02, ...,\n", + " 3.19519043e-02, 3.44543457e-02, -1.33666992e-02],\n", + " [-2.20336914e-01, -9.56420898e-02, -1.39694214e-02, ...,\n", + " -8.88705254e-05, -1.57318115e-02, -1.00555420e-02],\n", + " [-2.28271484e-01, -1.14501953e-01, 3.10058594e-02, ...,\n", + " 7.59277344e-02, -4.28466797e-02, 1.19262695e-01],\n", + " ...,\n", + " [-2.04589844e-01, -4.86755371e-02, 8.46557617e-02, ...,\n", + " -3.98254395e-02, 1.66625977e-01, 9.71679688e-02],\n", + " [-2.88085938e-01, -4.50439453e-02, 7.69653320e-02, ...,\n", + " -4.36096191e-02, -1.28784180e-02, 6.26220703e-02],\n", + " [-2.67578125e-01, -3.25317383e-02, 1.66625977e-01, ...,\n", + " -2.90679932e-03, -1.52282715e-02, -3.62243652e-02]], dtype=float32)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fastembed_i_embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "786bfac25eb7704a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/colpali_text_test.ipynb b/experiments/colpali_text_test.ipynb new file mode 100644 index 00000000..b3392089 --- /dev/null +++ b/experiments/colpali_text_test.ipynb @@ -0,0 +1,444 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "54b3bfd4ad5b9ee6", + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-28T10:44:32.841758Z", + "start_time": "2024-11-28T10:44:32.830025Z" + } + }, + "outputs": [], + "source": [ + "# Your inputs\n", + "queries = [\n", + " # \"Is attention really all you need?\",\n", + " # \"Are Benjamin, Antoine, Merve, and Jo best friends?\",\n", + " # \"Long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long long\"\n", + " \"hello world\",\n", + " \"flag 
embedding\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "86ee1b68fb88b11d", + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-27T22:54:23.016952Z", + "start_time": "2024-11-27T22:33:14.872976Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f1b463d5ae47404f951fecc6629e8008", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 7 files: 0%| | 0/7 [00:00 None: """ @@ -223,6 +224,7 @@ def load_onnx_model(self) -> None: cuda=self.cuda, device_id=self.device_id, ) + self.tokenizer.enable_truncation(max_length=maxsize) class ColPaliEmbeddingWorker(TextEmbeddingWorker):