Skip to content

Commit

Permalink
Add "replace" to streaming challenge with new runbooks (#301)
Browse files Browse the repository at this point in the history
* added replace to load and compute gt

* commit to switch

* added support for replace

* added README to streaming section

* Update README.md

Reworded README in a few places

* got rid of accidental change to file
  • Loading branch information
magdalendobson authored Aug 26, 2024
1 parent 3666346 commit 4aea829
Show file tree
Hide file tree
Showing 9 changed files with 1,176 additions and 17 deletions.
56 changes: 40 additions & 16 deletions benchmark/streaming/compute_gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,55 @@
import os
import numpy as np

import sys
[sys.path.append(i) for i in ['.', '..']]

from benchmark.datasets import DATASETS
from benchmark.streaming.load_runbook import load_runbook

def get_range_start_end(entry, tag_to_id):
    """Record the initial insertion of a contiguous id range into the mapping.

    For every tag in [entry['start'], entry['end']) the underlying vector id
    is the tag itself — on first insertion a vector's tag equals its id.

    Args:
        entry: runbook step with integer 'start' and 'end' keys (end exclusive).
        tag_to_id: dict mapping tag -> vector id; mutated in place.

    Returns:
        The same (mutated) tag_to_id dict, for call-chaining convenience.
    """
    for tag in range(entry['start'], entry['end']):
        tag_to_id[tag] = tag
    return tag_to_id

def get_next_set(ids: np.ndarray, entry):
def get_next_set(tag_to_id: np.ndarray, entry):
match entry['operation']:
case 'insert':
range = get_range_start_end(entry)
return np.union1d(ids, range)
for i in range(entry['end'] - entry['start']):
tag_to_id[i+entry['start']] = i+entry['start']
return tag_to_id
case 'delete':
range = get_range_start_end(entry)
return np.setdiff1d(ids, range, assume_unique=True)
# delete is by key
for i in range(entry['end'] - entry['start']):
tag_to_id.pop(i + entry['start'])
return tag_to_id
case 'replace':
# replace key with value
for i in range(entry['tags_end'] - entry['tags_start']):
tag_to_id[i + entry['tags_start']] = entry['ids_start'] + i
return tag_to_id
case 'search':
return ids
return tag_to_id
case _:
raise ValueError('Undefined entry in runbook')

def gt_dir(ds, runbook_path):
    """Return the directory holding ground-truth files for this dataset/runbook pair.

    Layout: <ds.basedir>/<ds.nb>/<runbook filename> — the runbook's own
    filename (directories stripped) names the leaf directory.
    """
    _, runbook_filename = os.path.split(runbook_path)
    return os.path.join(ds.basedir, str(ds.nb), runbook_filename)

def output_gt(ds, ids, step, gt_cmdline, runbook_path):
def output_gt(ds, tag_to_id, step, gt_cmdline, runbook_path):
ids_list = []
tags_list = []
for tag, id in tag_to_id.items():
ids_list.append(id)
tags_list.append(tag)

ids = np.array(ids_list, dtype = np.uint32)
tags = np.array(tags_list, dtype = np.uint32)


data = ds.get_data_in_range(0, ds.nb)
data_slice = data[ids]
data_slice = data[np.array(ids)]

dir = gt_dir(ds, runbook_path)
prefix = os.path.join(dir, 'step') + str(step)
Expand All @@ -39,9 +62,9 @@ def output_gt(ds, ids, step, gt_cmdline, runbook_path):

with open(tags_file, 'wb') as tf:
one = 1
tf.write(ids.size.to_bytes(4, byteorder='little'))
tf.write(tags.size.to_bytes(4, byteorder='little'))
tf.write(one.to_bytes(4, byteorder='little'))
ids.tofile(tf)
tags.tofile(tf)
with open(data_file, 'wb') as f:
f.write(ids.size.to_bytes(4, byteorder='little')) #npts
f.write(ds.d.to_bytes(4, byteorder='little'))
Expand Down Expand Up @@ -111,14 +134,15 @@ def main():

step = 1
ids = np.empty(0, dtype=np.uint32)

for entry in runbook:
# the first step must be an insertion
if step == 1:
ids = get_range_start_end(entry)
tag_to_id = get_range_start_end(entry, {})
else:
ids = get_next_set(ids, entry)
print(ids)
tag_to_id = get_next_set(tag_to_id, entry)
if (entry['operation'] == 'search'):
output_gt(ds, ids, step, common_cmd, args.runbook_file)
output_gt(ds, tag_to_id, step, common_cmd, args.runbook_file)
step += 1

if __name__ == '__main__':
Expand Down
19 changes: 18 additions & 1 deletion benchmark/streaming/load_runbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def load_runbook(dataset_name, max_pts, runbook_file):
run_list = []
while i in runbook:
entry = runbook.get(i)
if entry['operation'] not in {'insert', 'delete', 'search'}:
if entry['operation'] not in {'insert', 'delete', 'search', 'replace'}:
raise Exception('Undefined runbook operation')
if entry['operation'] in {'insert', 'delete'}:
if 'start' not in entry:
Expand All @@ -18,6 +18,23 @@ def load_runbook(dataset_name, max_pts, runbook_file):
raise Exception('Start out of range in runbook')
if entry['end'] < 0 or entry['end'] > max_pts:
raise Exception('End out of range in runbook')
if entry['operation'] in {'replace'}:
if 'tags_start' not in entry:
raise Exception('Start of indices to be replaced not specified in runbook')
if 'tags_end' not in entry:
raise Exception('End of indices to be replaced not specified in runbook')
if 'ids_start' not in entry:
raise Exception('Start of indices to replace not specified in runbook')
if 'ids_end' not in entry:
raise Exception('End of indices to replace not specified in runbook')
if entry['tags_start'] < 0 or entry ['tags_start'] >= max_pts:
raise Exception('Start of indices to be replaced out of range in runbook')
if entry['tags_end'] < 0 or entry ['tags_end'] > max_pts:
raise Exception('End of indices to be replaced out of range in runbook')
if entry['ids_start'] < 0 or entry ['ids_start'] >= max_pts:
raise Exception('Start of indices to replace out of range in runbook')
if entry['ids_end'] < 0 or entry ['ids_end'] > max_pts:
raise Exception('End of indices to replace out of range in runbook')
i += 1
run_list.append(entry)

Expand Down
5 changes: 5 additions & 0 deletions data_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,18 @@ def cleaned_run_metric(run_metrics):
runbook_paths = [None]
if track == 'streaming':
runbook_paths = ['neurips23/streaming/simple_runbook.yaml',
'neurips23/streaming/simple_replace_runbook.yaml',
'neurips23/streaming/random_replace_runbook.yaml',
'neurips23/streaming/clustered_replace_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/delete_runbook.yaml',
'neurips23/streaming/final_runbook.yaml',
'neurips23/streaming/msturing-10M_slidingwindow_runbook.yaml',
'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml',
'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml']
for runbook_path in runbook_paths:
print("Looking for runbook ", runbook_path)
results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
results = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
args.sensors, args.search_times, args.private_query, \
Expand Down
29 changes: 29 additions & 0 deletions neurips23/streaming/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# NeurIPS 2023 Streaming Challenge and Beyond

This README will discuss ongoing changes to the streaming benchmark challenge. See the NeurIPS23 README for instructions on how to execute runbooks and compute groundtruth for them. All changes here are backwards-compatible with those instructions.

## Semantics

The streaming runbooks support four operations: `search`, `insert`, `delete`, and a recent addition `replace`. The addition of replace, where a vector's data is modified in-place, prompts us to define the semantics of vector *tags* versus vector *ids*.

Each vector is assumed to have a unique *id* which never changes throughout the course of a runbook. In the case of replaces, each vector is also assigned a numeric *tag*. The underlying vector id corresponding to a tag may change throughout the runbook when a vector is replaced. In the runbooks here, the tag of a vector is assumed to correspond to the vector id when a vector is first inserted, and then remains constant when the vector is replaced. For example, a vector with id #245 is first inserted with tag #245. If the vector is later replaced with vector id #1067, tag #245 now corresponds to vector id #1067. Upon another replace, tag #245 might next correspond to vector id #2428. This distinction leads us to define the semantics of each operation in terms of ids and tags:

1. `search` provides a set of query vectors, and returns an array of tags corresponding to the nearest index vectors to each query vector. In this repository, each call to `search` in one runbook refers to the same set of query vectors.
2. `insert` provides a range of vector ids, whose tags are identical to their vector ids, to insert into the index.
3. `delete` provides a range of existing tags whose underlying data is to be deleted from the index and no longer returned as answers to queries.
4. `replace` provides a range of existing tags and a range of vector ids, such that each tag should henceforth correspond to the new vector id.

## Available Runbooks

Now that the number of runbooks has started to increase significantly, here we list the available runbooks with a brief description of each.

1. `simple_runbook.yaml`: A runbook executing a short sequence of insertions, searches, and deletions to aid with debugging and testing.
2. `simple_replace_runbook.yaml`: A runbook executing a short sequence of inserts, searches, and replaces to aid with debugging and testing.
3. `clustered_runbook.yaml`: A runbook taking a clustered dataset (options are `random-xs-clustered` and `msturing-10M-clustered`) and inserting points in clustered order.
4. `delete_runbook.yaml`: A runbook executing all steps in the clustered runbook, but which then deletes a fraction of each cluster.
5. `final_runbook.yaml`: The NeurIPS 2023 streaming challenge final runbook. It takes the `msturing-30M-clustered` dataset and performs several rounds of insertion and deletion in clustered order.
6. `msmarco-100M_expirationtime_runbook.yaml`: A runbook using the `msmarco-100M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps.
7. `wikipedia-35M_expirationtime_runbook.yaml`: A runbook using the `wikipedia-35M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps.
8. `msturing-10M_slidingwindow_runbook.yaml`: A runbook using the `msturing-10M` dataset which inserts half the points, then maintains the index at a consistent size using a sliding window.
9. `clustered_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from the same cluster.
10. `random_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from a different randomly selected cluster.
Loading

0 comments on commit 4aea829

Please sign in to comment.