From e1bbf5cdbbcbce8724fd41d4ab6e6a06684dda81 Mon Sep 17 00:00:00 2001 From: Amir Ingber <93526572+ingberam@users.noreply.github.com> Date: Mon, 15 Jan 2024 07:14:48 +0200 Subject: [PATCH] pinecone's filter solution (#258) * pinecone filter solution * support random-filter-s * add test * fix algo name in CI * make algo name same as folder * algo name again --- .github/workflows/neurips23.yml | 3 + neurips23/filter/pinecone/Dockerfile | 11 ++ neurips23/filter/pinecone/config.yaml | 40 +++++++ neurips23/filter/pinecone/pinecone_index.py | 117 ++++++++++++++++++++ 4 files changed, 171 insertions(+) create mode 100644 neurips23/filter/pinecone/Dockerfile create mode 100644 neurips23/filter/pinecone/config.yaml create mode 100644 neurips23/filter/pinecone/pinecone_index.py diff --git a/.github/workflows/neurips23.yml b/.github/workflows/neurips23.yml index a7595cd79..c69ccede1 100644 --- a/.github/workflows/neurips23.yml +++ b/.github/workflows/neurips23.yml @@ -102,6 +102,9 @@ jobs: - algorithm: puck dataset: random-xs track: ood + - algorithm: pinecone + dataset: random-filter-s + track: filter fail-fast: false steps: diff --git a/neurips23/filter/pinecone/Dockerfile b/neurips23/filter/pinecone/Dockerfile new file mode 100644 index 000000000..542e2736e --- /dev/null +++ b/neurips23/filter/pinecone/Dockerfile @@ -0,0 +1,11 @@ +FROM neurips23 + +# install MKL support +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libmkl-full-dev + +# copy and install the pys2 python package +RUN git clone --branch filter https://github.com/pinecone-io/bigann.git +RUN pip install ./bigann/*.whl +# verify that the build worked +RUN python3 -c 'import pys2;' + diff --git a/neurips23/filter/pinecone/config.yaml b/neurips23/filter/pinecone/config.yaml new file mode 100644 index 000000000..4f0a6085e --- /dev/null +++ b/neurips23/filter/pinecone/config.yaml @@ -0,0 +1,40 @@ +random-filter-s: + pinecone: + docker-tag: neurips23-filter-pinecone + module: 
neurips23.filter.pinecone.pinecone_index + constructor: PineconeIndex + base-args: ["@metric"] + run-groups: + base: + args: | + [{"indexkey": "FilterIVFFlatU8", "num_clusters": "128", "precompute_intersection_threshold": "5000"}] + query-args: | + [ + {"fraction_coefficient": "0.3", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "0.7", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "1.0", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "2.0", "fraction_exponent": "0.1", "skip_clustering_threshold": 2000} + ] +yfcc-10M: + pinecone: + docker-tag: neurips23-filter-pinecone + module: neurips23.filter.pinecone.pinecone_index + constructor: PineconeIndex + base-args: ["@metric"] + run-groups: + base: + args: | + [{"indexkey": "FilterIVFFlatU8", "num_clusters": "1024", "precompute_intersection_threshold": "2500"}] + query-args: | + [ + {"fraction_coefficient": "19.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "18.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "17.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "16.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "15.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "14.7", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "14.3", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "14.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "13.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000}, + {"fraction_coefficient": "11.0", "fraction_exponent": "0.65", "skip_clustering_threshold": 2000} + ] diff --git 
a/neurips23/filter/pinecone/pinecone_index.py b/neurips23/filter/pinecone/pinecone_index.py
new file mode 100644
index 000000000..bea518c24
--- /dev/null
+++ b/neurips23/filter/pinecone/pinecone_index.py
@@ -0,0 +1,144 @@
+import os
+import numpy as np
+
+from neurips23.filter.base import BaseFilterANN
+from benchmark.datasets import DATASETS
+
+import pys2
+
+
+class PineconeIndex(BaseFilterANN):
+    """Adapter exposing the pys2 filter index through BaseFilterANN."""
+
+    # Query-time parameters handled by set_query_arguments(), paired with
+    # the value recorded on the instance when query_args omits the key.
+    _QUERY_PARAM_DEFAULTS = (
+        ("skip_clustering_threshold", 0),
+        ("fraction_coefficient", 18.0),
+        ("fraction_exponent", 0.65),
+    )
+
+    def __init__(self, metric, index_params):
+        """Record the metric and build parameters; no index is built here.
+
+        Args:
+            metric: metric identifier supplied by the caller (stored only).
+            index_params: dict of build parameters. "indexkey" and "threads"
+                are read here; fit() forwards the whole dict to pys2.
+        """
+        self._index_params = index_params
+        self._metric = metric
+        print(index_params)
+        self.indexkey = index_params.get("indexkey", "FilterIVFFlatU8")
+        self.nt = index_params.get("threads", 1)
+        self.qas = {}
+
+    def fit(self, dataset):
+        """Build the index for the dataset registered under `dataset`.
+
+        Raises:
+            NotImplementedError: if the dataset's search type is not
+                "knn_filtered".
+        """
+        ds = DATASETS[dataset]()
+
+        if ds.search_type() != "knn_filtered":
+            raise NotImplementedError()
+
+        print(f"Building index")
+        self.index = pys2.FilterIndexWrapper(
+            ds.d,
+            self.indexkey,
+            self._index_params,
+            ds.get_dataset_fn(),
+            os.path.join(ds.basedir, ds.ds_metadata_fn),
+        )
+
+    def load_index(self, dataset):
+        """Load a previously saved index for `dataset`, if one exists.
+
+        The file is looked up as "<dataset>.index" relative to the current
+        working directory; build parameters are not part of the name.
+
+        Returns:
+            True if the file existed and was loaded, False otherwise.
+        """
+        filename = dataset + '.index'
+
+        if not os.path.exists(filename):
+            return False
+
+        print("Loading index from " + filename)
+        self.index = pys2.load_filter_ivf_index(filename)
+        return True
+
+    def index_files_to_store(self, dataset):
+        """
+        Specify a triplet with the local directory path of index files,
+        the common prefix name of index component(s) and a list of
+        index components that need to be uploaded to (after build)
+        or downloaded from (for search) cloud storage.
+
+        For local directory path under docker environment, please use
+        a directory under
+        data/indices/track(T1 or T2)/algo.__str__()/DATASETS[dataset]().short_name()
+
+        Not implemented for this algorithm.
+        """
+        raise NotImplementedError()
+
+    def query(self, X, k):
+        """Unfiltered search is not implemented for this algorithm."""
+        raise NotImplementedError()
+
+    def filtered_query(self, X, filter, k):
+        """Run a filtered k-NN search; results are read via get_results().
+
+        Args:
+            X: 2-D numpy array of query vectors, one query per row.
+            filter: object exposing `indptr` and `indices` (presumably a
+                scipy CSR matrix of per-query tags -- confirm with caller).
+            k: number of neighbours passed to the index search.
+        """
+        if X.dtype.kind == 'f':
+            print('data type of X is ' + str(X.dtype))
+            # Float queries are mapped to uint8 as x*10 + 128 and then
+            # zero-padded on the right to 192 columns before searching.
+            X = (X * 10 + 128).astype(np.uint8)
+            padding_size = 192 - X.shape[1]
+            X = np.pad(X, ((0, 0), (0, padding_size)), mode='constant')
+
+        # search_parallel returns (results_array, query_time, post_processing_time)
+        results_tuple = self.index.search_parallel(X, filter.indptr, filter.indices, k)
+        self.I = results_tuple[0]
+        print("query and postprocessing times: ", results_tuple[1:])
+
+    def get_results(self):
+        """Return the result array stored by the last filtered_query()."""
+        return self.I
+
+    def set_query_arguments(self, query_args):
+        """Apply one query-args configuration to the index.
+
+        Each recognised key present in `query_args` is forwarded to the
+        index as a string via set_search_param() and mirrored on the
+        instance. A missing key only resets the instance attribute to its
+        default; nothing is sent to the index for it.
+        """
+        self.qas = query_args
+        print("setting query args:" + str(self.qas))
+
+        # One loop for all parameters, so every default is stored under its
+        # own name (a missing "fraction_exponent" used to overwrite
+        # fraction_coefficient with 0.65 instead).
+        for name, default in self._QUERY_PARAM_DEFAULTS:
+            if name in query_args:
+                value = query_args[name]
+                self.index.set_search_param(name, str(value))
+            else:
+                value = default
+            setattr(self, name, value)
+
+    def __str__(self):
+        """Return the label identifying this algorithm and its parameters."""
+        return f'pinecone_filter({self.indexkey, self._index_params, self.qas})'