Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Harshasi/final runbook #177

Merged
merged 19 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions benchmark/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,13 @@ def get_dataset_iterator(self, bs=512, split=(1,0)):
j1 = min(j0 + bs, i1)
yield sanitize(x[j0:j1])

def get_data_in_range(self, start, end):
    """Return the slice ``[start:end]`` of the base dataset file.

    The file named by ``self.get_dataset_fn()`` is opened through
    ``xbin_mmap`` with this dataset's ``dtype`` and at most ``self.nb``
    entries, and the requested slice of the result is returned.

    ``start`` must be non-negative and ``end`` must not exceed
    ``self.nb``; both bounds are enforced with ``assert`` before the
    file is touched.
    """
    # Bounds are validated first so an invalid request never opens the file.
    assert start >= 0
    assert end <= self.nb
    mapped = xbin_mmap(self.get_dataset_fn(), dtype=self.dtype, maxn=self.nb)
    return mapped[start:end]

def search_type(self):
return "knn"

Expand Down Expand Up @@ -434,6 +441,28 @@ def distance(self):

def prepare(self, skip_data=False, original_size=10 ** 9):
return super().prepare(skip_data, original_size = self.nb)

class MSTuringClustered30M(DatasetCompetitionFormat):
    """Dataset descriptor for the clustered 30M MS Turing slice.

    Records the entry counts, element type, file names, download URL and
    local directory used for this dataset. No private query or ground-truth
    URLs are configured.
    """

    def __init__(self):
        # Sizes and element type.
        self.nb = 29998994
        self.nq = 10000
        self.d = 100  # presumably the vector dimensionality; confirm in the base class
        self.dtype = "float32"

        # Base, query and ground-truth file names.
        self.ds_fn = "30M-clustered64.fbin"
        self.qs_fn = "testQuery10K.fbin"
        self.gt_fn = "clu_msturing30M_gt100"

        # Local storage directory and remote download location.
        self.basedir = os.path.join(BASEDIR, "MSTuring-30M-clustered")
        self.base_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp23/clustered_data/msturing-30M-clustered/"

        # This dataset has no private query set or private ground truth.
        self.private_qs_url = None
        self.private_gt_url = None

    def distance(self):
        """Return the name of the distance metric used with this dataset."""
        return "euclidean"

    def prepare(self, skip_data=False, original_size=10 ** 9):
        """Delegate to the parent ``prepare``, passing ``self.nb`` as the size.

        The ``original_size`` argument is accepted for signature
        compatibility but is not used: the parent is always called with
        ``original_size=self.nb``.
        """
        return super().prepare(skip_data, original_size=self.nb)

class MSSPACEV1B(DatasetCompetitionFormat):
def __init__(self, nb_M=1000):
Expand Down Expand Up @@ -984,6 +1013,7 @@ def __str__(self):
'msturing-1M': lambda : MSTuringANNS(1),

'msturing-10M-clustered': lambda: MSTuringClustered10M(),
'msturing-30M-clustered': lambda: MSTuringClustered30M(),

'msspacev-1B': lambda : MSSPACEV1B(1000),
'msspacev-100M': lambda : MSSPACEV1B(100),
Expand Down
9 changes: 3 additions & 6 deletions benchmark/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import re
import traceback


def get_result_filename(dataset=None, count=None, definition=None,
query_arguments=None, neurips23track=None, runbook_path=None):
d = ['results']
Expand Down Expand Up @@ -41,9 +40,7 @@ def get_result_filename(dataset=None, count=None, definition=None,

def add_results_to_h5py(f, search_type, results, count, suffix = ''):
if search_type == "knn" or search_type == "knn_filtered":
neighbors = f.create_dataset('neighbors' + suffix, (len(results), count), 'i')
for i, idxs in enumerate(results):
neighbors[i] = idxs
neighbors = f.create_dataset('neighbors' + suffix, (len(results), count), 'i', data = results)
elif search_type == "range":
lims, D, I= results
f.create_dataset('neighbors' + suffix, data=I)
Expand All @@ -59,7 +56,7 @@ def store_results(dataset, count, definition, query_arguments,
head, tail = os.path.split(fn)
if not os.path.isdir(head):
os.makedirs(head)
f = h5py.File(fn, 'w')
f = h5py.File(name=fn, mode='w', libver='latest')
for k, v in attrs.items():
f.attrs[k] = v

Expand All @@ -83,7 +80,7 @@ def load_all_results(dataset=None, count=None, neurips23track=None, runbook_path
if os.path.splitext(fn)[-1] != '.hdf5':
continue
try:
f = h5py.File(os.path.join(root, fn), 'r+')
f = h5py.File(name=os.path.join(root, fn), mode='r+', libver='latest')
properties = dict(f.attrs)
yield properties, f
f.close()
Expand Down
6 changes: 4 additions & 2 deletions benchmark/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def run(definition, dataset, count, run_count, rebuild,
algo.set_query_arguments(*query_arguments)
if neurips23track == 'streaming':
descriptor, results = custom_runner.run_task(
algo, ds, distance, 1, run_count, search_type, private_query, runbook)
algo, ds, distance, count, 1, search_type, private_query, runbook)
else:
descriptor, results = custom_runner.run_task(
algo, ds, distance, count, run_count, search_type, private_query)
Expand All @@ -116,9 +116,11 @@ def run(definition, dataset, count, run_count, rebuild,
X = ds.get_private_queries()
power_stats = power_capture.run(algo, X, distance, count,
run_count, search_type, descriptor)
print('start store results')
store_results(dataset, count, definition,
query_arguments, descriptor,
results, search_type, neurips23track, runbook_path)
print('end store results')
finally:
algo.done()

Expand Down Expand Up @@ -263,7 +265,7 @@ def run_docker(definition, dataset, count, runs, timeout, rebuild,

client = docker.from_env()
if mem_limit is None:
mem_limit = psutil.virtual_memory().available
mem_limit = psutil.virtual_memory().available if neurips23track != 'streaming' else (8*1024*1024*1024)

# ready the container object invoked later in this function
container = None
Expand Down
5 changes: 4 additions & 1 deletion benchmark/streaming/compute_gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def gt_dir(ds, runbook_path):
return os.path.join(ds.basedir, str(ds.nb), runbook_filename)

def output_gt(ds, ids, step, gt_cmdline, runbook_path):
data = ds.get_dataset()
data = ds.get_data_in_range(0, ds.nb)
data_slice = data[ids]

dir = gt_dir(ds, runbook_path)
Expand All @@ -52,6 +52,9 @@ def output_gt(ds, ids, step, gt_cmdline, runbook_path):
gt_cmdline += ' --tags_file ' + tags_file
print("Executing cmdline: ", gt_cmdline)
os.system(gt_cmdline)
print("Removing data file")
rm_cmdline = "rm " + data_file
os.system(rm_cmdline)


def main():
Expand Down
5 changes: 4 additions & 1 deletion data_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,10 @@ def cleaned_run_metric(run_metrics):
print(f"Looking at track:{track}, dataset:{dataset_name}")
dataset = DATASETS[dataset_name]()
if track == 'streaming':
for runbook_path in ['neurips23/streaming/simple_runbook.yaml', 'neurips23/streaming/clustered_runbook.yaml', 'neurips23/streaming/delete_runbook.yaml']:
for runbook_path in ['neurips23/streaming/simple_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/delete_runbook.yaml',
'neurips23/streaming/final_runbook.yaml']:
results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
run_metrics = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
args.sensors, args.search_times, args.private_query, \
Expand Down
11 changes: 8 additions & 3 deletions neurips23/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ The Practical Vector Search challenge at NeurIPS 2023 has four different tasks:
The tags are from a vocabulary of 200386 possible tags.
Each of the 100,000 queries consists of one image embedding and one or two tags that must appear in the database elements to be considered.

**Task Streaming:** This task uses 10M slice of the MS Turing data set released in the previous challenge. The index starts with zero points and must implement the "runbook" provided - a sequence of insertion operations, deletion operations, and search commands (roughly 4:4:1 ratio) - within a time bound of 1 hour. In the final run, we will use a different runbook, and possibly a different data set, to avoid participants over-fitting to this dataset. Entries will be ranked by average recall over queries at all check points. The intention is for the algorithm to process the operations and maintain a compact index over the active points rather than index the entire anticipated set of points and use tombstones or flags to mark active elements.
**Task Streaming:** This task uses a 10M slice of the MS Turing data set released in the previous challenge. The index starts with zero points and must implement the "runbook" provided - a sequence of insertion operations, deletion operations, and search commands (roughly 4:4:1 ratio) - within a time bound of 1 hour and a DRAM limit of 8GB. Entries will be ranked by average recall over queries at all check points. The intention is for the algorithm to process the operations and maintain a compact index over the active points rather than index the entire anticipated set of points and use tombstones or flags to mark active elements. ~~In the final run, we will use a different runbook, and possibly a different data set, to avoid participants over-fitting to this dataset.~~ The final run will use `msturing-30M-clustered`, a 30M slice of the MS Turing data set, and the `final_runbook.yaml` runbook.

**Task Out-Of-Distribution:** Yandex Text-to-Image 10M represents a cross-modal dataset where the database and query vectors have different distributions in the shared vector space.
The base set is a 10M subset of the Yandex visual search database of 200-dimensional image embeddings which are produced with the Se-ResNext-101 model.
Expand All @@ -46,6 +46,7 @@ The baselines were run on an Azure Standard D8lds v5 (8 vcpus, 16 GiB memory) ma
|Sparse | Linear Scan | 101 | `python3 run.py --dataset sparse-full --algorithm linscan --neurips23track sparse` |
|Filter | faiss | 3200 | `python3 run.py --dataset yfcc-10M --algorithm faiss --neurips23track filter` |
|Streaming| DiskANN | 0.924 (recall@10), 23 mins | `python3 run.py --dataset msturing-10M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/delete_runbook.yaml` |
|Streaming| DiskANN | 0.883 (recall@10), 45 mins | `python3 run.py --dataset msturing-30M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/final_runbook.yaml` |
|OOD | DiskANN | 4882 | `python3 run.py --dataset text2image-10M --algorithm diskann --neurips23track ood` |


Expand Down Expand Up @@ -110,13 +111,17 @@ For the competition dataset, run commands mentioned in the table above, for exam
python run.py --neurips23track filter --algorithm faiss --dataset yfcc-10M
python run.py --neurips23track sparse --algorithm linscan --dataset sparse-full
python run.py --neurips23track ood --algorithm diskann --dataset text2image-10M
# preliminary runbook for testing
python run.py --neurips23track streaming --algorithm diskann --dataset msturing-10M-clustered --runbook_path neurips23/streaming/delete_runbook.yaml
# final runbook for evaluation
python run.py --neurips23track streaming --algorithm diskann --dataset msturing-30M-clustered --runbook_path neurips23/streaming/final_runbook.yaml
```

For the streaming track, the runbook specifies the order of operations to be executed by the algorithms. To download the ground truth for every search operation (requires the `azcopy` tool on your PATH):
```
python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/final_runbook.yaml --dataset msturing-30M-clustered
```
Alternatively, to compute ground truth for an arbitrary runbook, [clone and build the DiskANN repo](https://github.com/Microsoft/DiskANN) and use the command-line tool to compute ground truth at various search checkpoints. The `--gt_cmdline_tool` option points to the directory containing the DiskANN command-line tools.
```
Expand Down
16 changes: 15 additions & 1 deletion neurips23/streaming/diskann/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,18 @@ msturing-10M-clustered:
args: |
[{"R":64, "L":50, "insert_threads":16, "consolidate_threads":16}]
query-args: |
[{"Ls":100, "T":16}]
[{"Ls":100, "T":16}]
msturing-30M-clustered:
diskann:
docker-tag: neurips23-streaming-diskann
module: neurips23.streaming.diskann.diskann-str
constructor: diskann
base-args: ["@metric"]
run-groups:
base:
args: |
[{"R":32, "L":50, "insert_threads":16, "consolidate_threads":16},
{"R":32, "L":70, "insert_threads":16, "consolidate_threads":16},
{"R":50, "L":50, "insert_threads":16, "consolidate_threads":16}]
query-args: |
[{"Ls":70, "T":16}]
Loading
Loading