Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add "replace" to streaming challenge with new runbooks #301

Merged
merged 8 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 40 additions & 16 deletions benchmark/streaming/compute_gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,55 @@
import os
import numpy as np

import sys
[sys.path.append(i) for i in ['.', '..']]

from benchmark.datasets import DATASETS
from benchmark.streaming.load_runbook import load_runbook

def get_range_start_end(entry):
return np.arange(entry['start'], entry['end'], dtype=np.uint32)
def get_range_start_end(entry, tag_to_id):
    """Insert identity tag->id mappings for the range [start, end) of *entry*.

    Mutates *tag_to_id* in place (tag == vector id for fresh insertions) and
    returns the same mapping for caller convenience.
    """
    start, end = entry['start'], entry['end']
    tag_to_id.update({tag: tag for tag in range(start, end)})
    return tag_to_id
harsha-simhadri marked this conversation as resolved.
Show resolved Hide resolved

def get_next_set(ids: np.ndarray, entry):
def get_next_set(tag_to_id: np.ndarray, entry):
match entry['operation']:
case 'insert':
range = get_range_start_end(entry)
return np.union1d(ids, range)
for i in range(entry['end'] - entry['start']):
tag_to_id[i+entry['start']] = i+entry['start']
return tag_to_id
case 'delete':
range = get_range_start_end(entry)
return np.setdiff1d(ids, range, assume_unique=True)
# delete is by key
for i in range(entry['end'] - entry['start']):
tag_to_id.pop(i + entry['start'])
return tag_to_id
case 'replace':
# replace key with value
for i in range(entry['tags_end'] - entry['tags_start']):
tag_to_id[i + entry['tags_start']] = entry['ids_start'] + i
return tag_to_id
case 'search':
return ids
return tag_to_id
case _:
raise ValueError('Undefined entry in runbook')

def gt_dir(ds, runbook_path):
    """Return the groundtruth directory for *ds* under this runbook's name."""
    runbook_filename = os.path.basename(runbook_path)
    return os.path.join(ds.basedir, str(ds.nb), runbook_filename)

def output_gt(ds, ids, step, gt_cmdline, runbook_path):
def output_gt(ds, tag_to_id, step, gt_cmdline, runbook_path):
ids_list = []
tags_list = []
for tag, id in tag_to_id.items():
ids_list.append(id)
tags_list.append(tag)

ids = np.array(ids_list, dtype = np.uint32)
tags = np.array(tags_list, dtype = np.uint32)


data = ds.get_data_in_range(0, ds.nb)
data_slice = data[ids]
data_slice = data[np.array(ids)]

dir = gt_dir(ds, runbook_path)
prefix = os.path.join(dir, 'step') + str(step)
Expand All @@ -39,9 +62,9 @@ def output_gt(ds, ids, step, gt_cmdline, runbook_path):

with open(tags_file, 'wb') as tf:
one = 1
tf.write(ids.size.to_bytes(4, byteorder='little'))
tf.write(tags.size.to_bytes(4, byteorder='little'))
tf.write(one.to_bytes(4, byteorder='little'))
ids.tofile(tf)
tags.tofile(tf)
with open(data_file, 'wb') as f:
f.write(ids.size.to_bytes(4, byteorder='little')) #npts
f.write(ds.d.to_bytes(4, byteorder='little'))
Expand Down Expand Up @@ -111,14 +134,15 @@ def main():

step = 1
ids = np.empty(0, dtype=np.uint32)

for entry in runbook:
# the first step must be an insertion
if step == 1:
ids = get_range_start_end(entry)
tag_to_id = get_range_start_end(entry, {})
else:
ids = get_next_set(ids, entry)
print(ids)
tag_to_id = get_next_set(tag_to_id, entry)
if (entry['operation'] == 'search'):
output_gt(ds, ids, step, common_cmd, args.runbook_file)
output_gt(ds, tag_to_id, step, common_cmd, args.runbook_file)
step += 1

if __name__ == '__main__':
Expand Down
19 changes: 18 additions & 1 deletion benchmark/streaming/load_runbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def load_runbook(dataset_name, max_pts, runbook_file):
run_list = []
while i in runbook:
entry = runbook.get(i)
if entry['operation'] not in {'insert', 'delete', 'search'}:
if entry['operation'] not in {'insert', 'delete', 'search', 'replace'}:
raise Exception('Undefined runbook operation')
if entry['operation'] in {'insert', 'delete'}:
if 'start' not in entry:
Expand All @@ -18,6 +18,23 @@ def load_runbook(dataset_name, max_pts, runbook_file):
raise Exception('Start out of range in runbook')
if entry['end'] < 0 or entry['end'] > max_pts:
raise Exception('End out of range in runbook')
harsha-simhadri marked this conversation as resolved.
Show resolved Hide resolved
if entry['operation'] in {'replace'}:
if 'tags_start' not in entry:
raise Exception('Start of indices to be replaced not specified in runbook')
if 'tags_end' not in entry:
raise Exception('End of indices to be replaced not specified in runbook')
if 'ids_start' not in entry:
raise Exception('Start of indices to replace not specified in runbook')
if 'ids_end' not in entry:
raise Exception('End of indices to replace not specified in runbook')
if entry['tags_start'] < 0 or entry ['tags_start'] >= max_pts:
raise Exception('Start of indices to be replaced out of range in runbook')
if entry['tags_end'] < 0 or entry ['tags_end'] > max_pts:
raise Exception('End of indices to be replaced out of range in runbook')
if entry['ids_start'] < 0 or entry ['ids_start'] >= max_pts:
raise Exception('Start of indices to replace out of range in runbook')
if entry['ids_end'] < 0 or entry ['ids_end'] > max_pts:
raise Exception('End of indices to replace out of range in runbook')
i += 1
run_list.append(entry)

Expand Down
5 changes: 5 additions & 0 deletions data_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,18 @@ def cleaned_run_metric(run_metrics):
runbook_paths = [None]
if track == 'streaming':
runbook_paths = ['neurips23/streaming/simple_runbook.yaml',
'neurips23/streaming/simple_replace_runbook.yaml',
'neurips23/streaming/random_replace_runbook.yaml',
'neurips23/streaming/clustered_replace_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/delete_runbook.yaml',
'neurips23/streaming/final_runbook.yaml',
'neurips23/streaming/msturing-10M_slidingwindow_runbook.yaml',
'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml',
'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml']
for runbook_path in runbook_paths:
print("Looking for runbook ", runbook_path)
results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
results = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
args.sensors, args.search_times, args.private_query, \
Expand Down
29 changes: 29 additions & 0 deletions neurips23/streaming/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# NeurIPS 2023 Streaming Challenge and Beyond

This README will discuss ongoing changes to the streaming benchmark challenge. See the NeurIPS23 README for instructions on how to execute runbooks and compute groundtruth for them. All changes here are backwards-compatible with those instructions.

## Semantics

The streaming runbooks support four operations: `search`, `insert`, `delete`, and a recent addition `replace`. The addition of replace, where a vector's data is modified in-place, prompts us to define the semantics of vector *tags* versus vector *ids*.

Each vector is assumed to have a unique *id* which never changes throughout the course of a runbook. In the case of replaces, each vector is also assigned a numeric *tag*. The underlying vector id corresponding to a tag may change throughout the runbook when a vector is replaced. In the runbooks here, the tag of a vector is assumed to correspond to the vector id when a vector is first inserted, and then remains constant when the vector is replaced. For example, a vector with id #245 is first inserted with tag #245. If the vector is later replaced with vector id #1067, tag #245 now corresponds to vector id #1067. Upon another replace, tag #245 might next correspond to vector id #2428. This distinction leads us to define the semantics of each operation in terms of ids and tags:

1. `search` provides a set of query vectors, and returns an array of tags corresponding to the nearest index vectors to each query vector. In this repository, each call to `search` in one runbook refers to the same set of query vectors.
2. `insert` provides a range of vector ids, whose tags are identical to their vector ids, to insert into the index.
3. `delete` provides a range of existing tags whose underlying data is to be deleted from the index and no longer returned as answers to queries.
4. `replace` provides a range of existing tags and a range of vector ids, such that each tag should henceforth correspond to the new vector id.

## Available Runbooks

Now that the number of runbooks has started to increase significantly, here we list the available runbooks with a brief description of each.

1. `simple_runbook.yaml`: A runbook executing a short sequence of insertions, searches, and deletions to aid with debugging and testing.
2. `simple_replace_runbook.yaml`: A runbook executing a short sequence of inserts, searches, and replaces to aid with debugging and testing.
3. `clustered_runbook.yaml`: A runbook taking a clustered dataset (options are `random-xs-clustered` and `msturing-10M-clustered`) and inserting points in clustered order.
4. `delete_runbook.yaml`: A runbook executing all steps in the clustered runbook, but which then deletes a fraction of each cluster.
5. `final_runbook.yaml`: The NeurIPS 2023 streaming challenge final runbook. It takes the `msturing-30M-clustered` dataset and performs several rounds of insertion and deletion in clustered order.
6. `msmarco-100M_expirationtime_runbook.yaml`: A runbook using the `msmarco-100M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps.
7. `wikipedia-35M_expirationtime_runbook.yaml`: A runbook using the `wikipedia-35M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps.
8. `msturing-10M_slidingwindow_runbook.yaml`: A runbook using the `msturing-10M` dataset which inserts half the points, then maintains the index at a consistent size using a sliding window.
9. `clustered_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from the same cluster.
10. `random_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from a different randomly selected cluster.
Loading
Loading