Skip to content

Commit

Permalink
pinecone's filter solution (#258)
Browse files Browse the repository at this point in the history
* pinecone filter solution

* support random-filter-s

* add test

* fix algo name in CI

* make algo name same as folder

* algo name again
  • Loading branch information
ingberam authored Jan 15, 2024
1 parent bee146d commit e1bbf5c
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/neurips23.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ jobs:
- algorithm: puck
dataset: random-xs
track: ood
- algorithm: pinecone
dataset: random-filter-s
track: filter
fail-fast: false

steps:
Expand Down
11 changes: 11 additions & 0 deletions neurips23/filter/pinecone/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
FROM neurips23

# Install MKL support. The package lists are refreshed in the same layer so
# the install does not depend on whatever apt lists the base image carries.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y libmkl-full-dev

# Clone the "filter" branch of the pinecone bigann repository and install the
# wheel(s) found at its top level (these provide the pys2 python package).
RUN git clone --branch filter https://github.com/pinecone-io/bigann.git
RUN pip install ./bigann/*.whl
# Verify that the install worked: the image build fails if pys2 cannot be imported.
RUN python3 -c 'import pys2;'

40 changes: 40 additions & 0 deletions neurips23/filter/pinecone/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Benchmark configuration for the small random filtered dataset.
random-filter-s:
  pinecone:
    docker-tag: neurips23-filter-pinecone
    # Python module and class implementing the algorithm wrapper.
    module: neurips23.filter.pinecone.pinecone_index
    constructor: PineconeIndex
    # Presumably expanded by the benchmark framework into the dataset metric and
    # passed as the constructor's first argument -- TODO confirm against the runner.
    base-args: ["@metric"]
    run-groups:
      base:
        # Build parameters (JSON). PineconeIndex reads "indexkey" itself and
        # forwards the whole dict to pys2.FilterIndexWrapper.
        args: |
          [{"indexkey": "FilterIVFFlatU8", "num_clusters": "128", "precompute_intersection_threshold": "5000"}]
        query-args: |
          [
          {"fraction_coefficient": "0.3", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "0.7", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "1.0", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "2.0", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000}
          ]
# Benchmark configuration for the yfcc-10M filtered dataset.
yfcc-10M:
  pinecone:
    docker-tag: neurips23-filter-pinecone
    # Python module and class implementing the algorithm wrapper.
    module: neurips23.filter.pinecone.pinecone_index
    constructor: PineconeIndex
    # Presumably expanded by the benchmark framework into the dataset metric and
    # passed as the constructor's first argument -- TODO confirm against the runner.
    base-args: ["@metric"]
    run-groups:
      base:
        # Build parameters (JSON). PineconeIndex reads "indexkey" itself and
        # forwards the whole dict to pys2.FilterIndexWrapper.
        args: |
          [{"indexkey": "FilterIVFFlatU8", "num_clusters": "1024", "precompute_intersection_threshold": "2500"}]
        query-args: |
          [
          {"fraction_coefficient": "19.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "18.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "17.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "16.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "15.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "14.7", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "14.3", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "14.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "13.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000},
          {"fraction_coefficient": "11.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}
          ]
117 changes: 117 additions & 0 deletions neurips23/filter/pinecone/pinecone_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import os
import numpy as np

from neurips23.filter.base import BaseFilterANN
from benchmark.datasets import DATASETS

import pys2

class PineconeIndex(BaseFilterANN):
    """Filtered-ANN benchmark adapter around the ``pys2`` filter index.

    The index is built in :meth:`fit` (or loaded in :meth:`load_index`),
    tuned per run with :meth:`set_query_arguments`, queried with
    :meth:`filtered_query`, and the last result is read back through
    :meth:`get_results`.
    """

    # Query-time parameters understood by set_query_arguments, paired with the
    # value recorded on the instance when a parameter is not supplied.
    _QUERY_ARG_DEFAULTS = (
        ("skip_clustering_threshold", 0),
        ("fraction_coefficient", 18.0),
        ("fraction_exponent", 0.65),
    )

    def __init__(self, metric, index_params):
        """Store the build configuration; no index is created here.

        Args:
            metric: Metric supplied by the caller. It is stored on the
                instance but not otherwise used by this class.
            index_params: Dict of build parameters. ``indexkey`` and
                ``threads`` are read here; the whole dict is forwarded to
                ``pys2.FilterIndexWrapper`` by :meth:`fit`.
        """
        self._index_params = index_params
        self._metric = metric
        print(index_params)
        self.indexkey = index_params.get("indexkey", "FilterIVFFlatU8")
        self.nt = index_params.get("threads", 1)
        self.qas = {}

    def fit(self, dataset):
        """Build the index for ``dataset`` and keep it in ``self.index``.

        Args:
            dataset: Key into ``DATASETS``.

        Raises:
            NotImplementedError: If the dataset's search type is not
                ``"knn_filtered"``.
        """
        ds = DATASETS[dataset]()

        if ds.search_type() != "knn_filtered":
            raise NotImplementedError()

        print("Building index")
        self.index = pys2.FilterIndexWrapper(
            ds.d,
            self.indexkey,
            self._index_params,
            ds.get_dataset_fn(),
            os.path.join(ds.basedir, ds.ds_metadata_fn),
        )

    def load_index(self, dataset):
        """Load a previously stored index for ``dataset`` if one exists.

        The index is looked up at ``dataset + '.index'``. No download is
        attempted when the file is missing.

        Returns:
            False if the file does not exist, True after a successful load.
        """
        filename = dataset + '.index'

        if not os.path.exists(filename):
            return False

        print("Loading index from " + filename)
        self.index = pys2.load_filter_ivf_index(filename)
        return True

    def index_files_to_store(self, dataset):
        """
        Specify a triplet with the local directory path of index files,
        the common prefix name of index component(s) and a list of
        index components that need to be uploaded to (after build)
        or downloaded from (for search) cloud storage.
        For local directory path under docker environment, please use
        a directory under
        data/indices/track(T1 or T2)/algo.__str__()/DATASETS[dataset]().short_name()

        Not implemented for this algorithm.
        """
        raise NotImplementedError()

    def query(self, X, k):
        """Unfiltered search is not implemented; use :meth:`filtered_query`."""
        raise NotImplementedError()

    def filtered_query(self, X, filter, k):
        """Run a filtered k-NN search and keep the result for get_results().

        Args:
            X: 2-D numpy array of query vectors, one per row.
            filter: Object exposing ``indptr`` and ``indices`` attributes
                (presumably a scipy CSR matrix -- TODO confirm with caller).
            k: Number of neighbours requested per query.

        Floating-point queries are mapped with ``X*10 + 128``, cast to
        ``uint8`` and zero-padded on the right to 192 columns before the
        search. ``np.pad`` raises ``ValueError`` if such an input already has
        more than 192 columns.
        """
        if X.dtype.kind == 'f':
            print('data type of X is ' + str(X.dtype))
            X = X * 10 + 128
            X = X.astype(np.uint8)
            padding_size = 192 - X.shape[1]
            X = np.pad(X, ((0, 0), (0, padding_size)), mode='constant')

        # search_parallel returns a tuple:
        # (results_array, query_time, post_processing_time)
        results_tuple = self.index.search_parallel(X, filter.indptr, filter.indices, k)
        self.I = results_tuple[0]
        print("query and postprocessing times: ", results_tuple[1:])

    def get_results(self):
        """Return the result array stored by the last filtered_query() call."""
        return self.I

    def set_query_arguments(self, query_args):
        """Record the query-time parameters and push them to the index.

        Args:
            query_args: Dict that may contain ``skip_clustering_threshold``,
                ``fraction_coefficient`` and ``fraction_exponent``.

        Each supplied parameter is stored as an attribute of the same name
        and sent to the index as a string. A parameter that is absent only
        gets its default recorded on the instance (0, 18.0 and 0.65
        respectively); nothing is sent to the index for it, so the index
        keeps whatever value it already had.
        """
        self.qas = query_args
        print("setting query args:" + str(self.qas))

        for name, default in self._QUERY_ARG_DEFAULTS:
            if name in query_args:
                setattr(self, name, query_args[name])
                self.index.set_search_param(name, str(query_args[name]))
            else:
                # Each default lands on its own attribute; the exponent
                # default must not overwrite fraction_coefficient.
                setattr(self, name, default)

    def __str__(self):
        """Return a label built from the index key, build and query params."""
        return f'pinecone_filter({self.indexkey, self._index_params, self.qas})'


0 comments on commit e1bbf5c

Please sign in to comment.