Skip to content

Commit

Permalink
add analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
mdouze committed Aug 14, 2024
1 parent ea076f1 commit f7cc101
Show file tree
Hide file tree
Showing 2 changed files with 479 additions and 0 deletions.
230 changes: 230 additions & 0 deletions notebooks/check_eval_jitter_due_to_duplicates.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ffc775a5-92b1-4df8-8f3d-6dfbd2050bae",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# data/yfcc100M/GT.private.2727415019.ibin\n",
"# /checkpoint/matthijs/billion-scale-ann-benchmarks/yfcc100M/GT.private.2727415019.ibin\n",
"# /checkpoint/matthijs/billion-scale-ann-benchmarks/yfcc100M/base.10M.u8bin"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9f690b2c-43cd-4ad8-b516-3dfe07658da1",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"/private/home/matthijs/src/big-ann-benchmarks/benchmark\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6bb4dbbe-0024-4d86-8aa3-0b8572360b83",
"metadata": {},
"outputs": [],
"source": [
"import dataset_io"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "67f3565b-2a39-4454-8919-7e7b524804a0",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import faiss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6aba1c8e-df2c-46a7-b7d7-8aa30991cb40",
"metadata": {},
"outputs": [],
"source": [
"basedir = \"/checkpoint/matthijs/billion-scale-ann-benchmarks/\"\n",
"\n",
"gt = dataset_io.read_ibin(basedir + \"yfcc100M/GT.private.2727415019.ibin\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b785c765-dede-4269-ba9c-292eea547ae6",
"metadata": {},
"outputs": [],
"source": [
"xb = dataset_io.xbin_mmap(basedir + \"yfcc100M/base.10M.u8bin\", dtype='uint8')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "240772a9-6750-43e9-8ec3-1d3444a13ad8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10000000, 192)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xb.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "22834b44-d65f-4b0a-91dd-32042fbeb8d7",
"metadata": {},
"outputs": [],
"source": [
"checksums = faiss.checksum(xb)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a7448a21-4e8d-48d7-b5ef-8d358bd7018f",
"metadata": {},
"outputs": [],
"source": [
"u, inv, counts = np.unique(checksums, return_inverse=True, return_counts=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "54160284-fab5-45d1-af73-80b778f4c320",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"134"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts.max()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4a131a0b-3b43-4029-bfba-4e6665735c6d",
"metadata": {},
"outputs": [],
"source": [
"gt_i = inv[gt]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "58dfa2de-a7f0-445b-a899-8c99f31bd059",
"metadata": {},
"outputs": [],
"source": [
"# Estimate how much the measured recall can vary (\"jitter\") because duplicate base\n",
"# vectors have interchangeable ids: a correct result may list a different id than the\n",
"# ground truth (GT) for the very same vector.\n",
"#\n",
"# gt_i[q] holds, for query q, the equivalence-class index of each GT result\n",
"# (gt_i = inv[gt]); counts[j] is the number of base vectors in class j.\n",
"#\n",
"# 1. if all equivalent vectors are included in the GT result list ==> no problem\n",
"# 2. if all equivalent vectors are outside the GT result list ==> no problem either\n",
"# 3. if equivalent vectors are split half-in half-out of the result list ==> ambiguous\n",
"#\n",
"# So look at the n_eq ids of the GT result list that are equivalent to its last entry.\n",
"# The size of their equivalence class is c >= n_eq.\n",
"# if c == n_eq ==> back to 1.\n",
"# if n_eq < c ==> the n_eq slots can be filled in two extreme ways:\n",
"#   * optimistic case: the user returns exactly the ids selected in GT\n",
"#     ==> n_eq true positives\n",
"#   * pessimistic case: the user returns as many ids not selected in GT as possible;\n",
"#     only c - n_eq such ids exist, so min(n_eq, c - n_eq) slots are misses\n",
"#     ==> n_eq - min(n_eq, c - n_eq) true positives\n",
"#\n",
"# NOTE(review): the pessimistic counter previously accumulated the number of misses\n",
"# instead of the number of true positives; re-run the notebook to refresh any stored\n",
"# output derived from these counters.\n",
"\n",
"TP_count_optimistic = 0\n",
"TP_count_pessimistic = 0\n",
"\n",
"nq = 100_000\n",
"k = 10\n",
"\n",
"for q in range(nq):\n",
"    gt_q = gt_i[q]\n",
"    # size of the equivalence class of the last GT result\n",
"    c = counts[gt_q[-1]]\n",
"    if c > 1:\n",
"        # there is a risk!\n",
"        n_eq = (gt_q == gt_q[-1]).sum()\n",
"        assert n_eq <= c\n",
"        if n_eq < c:\n",
"            # some duplicates fall outside the GT list: the ids are ambiguous\n",
"            n_miss = min(n_eq, c - n_eq)\n",
"            TP_count_optimistic += n_eq\n",
"            TP_count_pessimistic += n_eq - n_miss"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "5ce54aae-8c35-4c6e-938d-e3cd25ce2744",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.000146"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(TP_count_optimistic - TP_count_pessimistic) / (nq * k)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c826722-b37e-49a0-a97f-348e0acdf6f8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
249 changes: 249 additions & 0 deletions notebooks/visualize_OOD.ipynb

Large diffs are not rendered by default.

0 comments on commit f7cc101

Please sign in to comment.