Skip to content

Commit

Permalink
Add "replace" to streaming challenge with new runbooks (#301)
Browse files Browse the repository at this point in the history
* added replace to load and compute gt

* commit to switch

* added support for replace

* added README to streaming section

* Update README.md

Reworded README in a few places

* got rid of accidental change to file
  • Loading branch information
magdalendobson authored Aug 26, 2024
1 parent 3666346 commit 4aea829
Show file tree
Hide file tree
Showing 9 changed files with 1,176 additions and 17 deletions.
56 changes: 40 additions & 16 deletions benchmark/streaming/compute_gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,55 @@
import os
import numpy as np

import sys
[sys.path.append(i) for i in ['.', '..']]

from benchmark.datasets import DATASETS
from benchmark.streaming.load_runbook import load_runbook

def get_range_start_end(entry, tag_to_id):
    """Record the initial insertion of a contiguous id range into the mapping.

    For every tag in [entry['start'], entry['end']) the underlying vector id
    is the tag itself — on first insertion a vector's tag equals its id.

    Args:
        entry: runbook step with integer 'start' and 'end' keys (end exclusive).
        tag_to_id: dict mapping tag -> vector id; mutated in place.

    Returns:
        The same (mutated) tag_to_id dict, for call-chaining convenience.
    """
    for tag in range(entry['start'], entry['end']):
        tag_to_id[tag] = tag
    return tag_to_id

def get_next_set(ids: np.ndarray, entry):
def get_next_set(tag_to_id: np.ndarray, entry):
match entry['operation']:
case 'insert':
range = get_range_start_end(entry)
return np.union1d(ids, range)
for i in range(entry['end'] - entry['start']):
tag_to_id[i+entry['start']] = i+entry['start']
return tag_to_id
case 'delete':
range = get_range_start_end(entry)
return np.setdiff1d(ids, range, assume_unique=True)
# delete is by key
for i in range(entry['end'] - entry['start']):
tag_to_id.pop(i + entry['start'])
return tag_to_id
case 'replace':
# replace key with value
for i in range(entry['tags_end'] - entry['tags_start']):
tag_to_id[i + entry['tags_start']] = entry['ids_start'] + i
return tag_to_id
case 'search':
return ids
return tag_to_id
case _:
raise ValueError('Undefined entry in runbook')

def gt_dir(ds, runbook_path):
    """Return the directory holding ground-truth files for this dataset/runbook pair.

    Layout: <ds.basedir>/<ds.nb>/<runbook filename> — the runbook's own
    filename (directories stripped) names the leaf directory.
    """
    _, runbook_filename = os.path.split(runbook_path)
    return os.path.join(ds.basedir, str(ds.nb), runbook_filename)

def output_gt(ds, ids, step, gt_cmdline, runbook_path):
def output_gt(ds, tag_to_id, step, gt_cmdline, runbook_path):
ids_list = []
tags_list = []
for tag, id in tag_to_id.items():
ids_list.append(id)
tags_list.append(tag)

ids = np.array(ids_list, dtype = np.uint32)
tags = np.array(tags_list, dtype = np.uint32)


data = ds.get_data_in_range(0, ds.nb)
data_slice = data[ids]
data_slice = data[np.array(ids)]

dir = gt_dir(ds, runbook_path)
prefix = os.path.join(dir, 'step') + str(step)
Expand All @@ -39,9 +62,9 @@ def output_gt(ds, ids, step, gt_cmdline, runbook_path):

with open(tags_file, 'wb') as tf:
one = 1
tf.write(ids.size.to_bytes(4, byteorder='little'))
tf.write(tags.size.to_bytes(4, byteorder='little'))
tf.write(one.to_bytes(4, byteorder='little'))
ids.tofile(tf)
tags.tofile(tf)
with open(data_file, 'wb') as f:
f.write(ids.size.to_bytes(4, byteorder='little')) #npts
f.write(ds.d.to_bytes(4, byteorder='little'))
Expand Down Expand Up @@ -111,14 +134,15 @@ def main():

step = 1
ids = np.empty(0, dtype=np.uint32)

for entry in runbook:
# the first step must be an insertion
if step == 1:
ids = get_range_start_end(entry)
tag_to_id = get_range_start_end(entry, {})
else:
ids = get_next_set(ids, entry)
print(ids)
tag_to_id = get_next_set(tag_to_id, entry)
if (entry['operation'] == 'search'):
output_gt(ds, ids, step, common_cmd, args.runbook_file)
output_gt(ds, tag_to_id, step, common_cmd, args.runbook_file)
step += 1

if __name__ == '__main__':
Expand Down
19 changes: 18 additions & 1 deletion benchmark/streaming/load_runbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def load_runbook(dataset_name, max_pts, runbook_file):
run_list = []
while i in runbook:
entry = runbook.get(i)
if entry['operation'] not in {'insert', 'delete', 'search'}:
if entry['operation'] not in {'insert', 'delete', 'search', 'replace'}:
raise Exception('Undefined runbook operation')
if entry['operation'] in {'insert', 'delete'}:
if 'start' not in entry:
Expand All @@ -18,6 +18,23 @@ def load_runbook(dataset_name, max_pts, runbook_file):
raise Exception('Start out of range in runbook')
if entry['end'] < 0 or entry['end'] > max_pts:
raise Exception('End out of range in runbook')
if entry['operation'] in {'replace'}:
if 'tags_start' not in entry:
raise Exception('Start of indices to be replaced not specified in runbook')
if 'tags_end' not in entry:
raise Exception('End of indices to be replaced not specified in runbook')
if 'ids_start' not in entry:
raise Exception('Start of indices to replace not specified in runbook')
if 'ids_end' not in entry:
raise Exception('End of indices to replace not specified in runbook')
if entry['tags_start'] < 0 or entry ['tags_start'] >= max_pts:
raise Exception('Start of indices to be replaced out of range in runbook')
if entry['tags_end'] < 0 or entry ['tags_end'] > max_pts:
raise Exception('End of indices to be replaced out of range in runbook')
if entry['ids_start'] < 0 or entry ['ids_start'] >= max_pts:
raise Exception('Start of indices to replace out of range in runbook')
if entry['ids_end'] < 0 or entry ['ids_end'] > max_pts:
raise Exception('End of indices to replace out of range in runbook')
i += 1
run_list.append(entry)

Expand Down
5 changes: 5 additions & 0 deletions data_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,18 @@ def cleaned_run_metric(run_metrics):
runbook_paths = [None]
if track == 'streaming':
runbook_paths = ['neurips23/streaming/simple_runbook.yaml',
'neurips23/streaming/simple_replace_runbook.yaml',
'neurips23/streaming/random_replace_runbook.yaml',
'neurips23/streaming/clustered_replace_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/delete_runbook.yaml',
'neurips23/streaming/final_runbook.yaml',
'neurips23/streaming/msturing-10M_slidingwindow_runbook.yaml',
'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml',
'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml']
for runbook_path in runbook_paths:
print("Looking for runbook ", runbook_path)
results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
results = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
args.sensors, args.search_times, args.private_query, \
Expand Down
29 changes: 29 additions & 0 deletions neurips23/streaming/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# NeurIPS 2023 Streaming Challenge and Beyond

This README will discuss ongoing changes to the streaming benchmark challenge. See the NeurIPS23 README for instructions on how to execute runbooks and compute groundtruth for them. All changes here are backwards-compatible with those instructions.

## Semantics

The streaming runbooks support four operations: `search`, `insert`, `delete`, and a recent addition `replace`. The addition of replace, where a vector's data is modified in-place, prompts us to define the semantics of vector *tags* versus vector *ids*.

Each vector is assumed to have a unique *id* which never changes throughout the course of a runbook. In the case of replaces, each vector is also assigned a numeric *tag*. The underlying vector id corresponding to a tag may change throughout the runbook when a vector is replaced. In the runbooks here, the tag of a vector is assumed to correspond to the vector id when a vector is first inserted, and then remains constant when the vector is replaced. For example, a vector with id #245 is first inserted with tag #245. If the vector is later replaced with vector id #1067, tag #245 now corresponds to vector id #1067. Upon another replace, tag #245 might next correspond to vector id #2428. This distinction leads us to define the semantics of each operation in terms of ids and tags:

1. `search` provides a set of query vectors, and returns an array of tags corresponding to the nearest index vectors to each query vector. In this repository, each call to `search` in one runbook refers to the same set of query vectors.
2. `insert` provides a range of vector ids, whose tags are identical to their vector ids, to insert into the index.
3. `delete` provides a range of existing tags whose underlying data is to be deleted from the index and no longer returned as answers to queries.
4. `replace` provides a range of existing tags and a range of vector ids, such that each tag should henceforth correspond to the new vector id.

## Available Runbooks

Now that the number of runbooks has started to increase significantly, here we list the available runbooks with a brief description of each.

1. `simple_runbook.yaml`: A runbook executing a short sequence of insertions, searches, and deletions to aid with debugging and testing.
2. `simple_replace_runbook.yaml`: A runbook executing a short sequence of inserts, searches, and replaces to aid with debugging and testing.
3. `clustered_runbook.yaml`: A runbook taking a clustered dataset (options are `random-xs-clustered` and `msturing-10M-clustered`) and inserting points in clustered order.
4. `delete_runbook.yaml`: A runbook executing all steps in the clustered runbook, but which then deletes a fraction of each cluster.
5. `final_runbook.yaml`: The NeurIPS 2023 streaming challenge final runbook. It takes the `msturing-30M-clustered` dataset and performs several rounds of insertion and deletion in clustered order.
6. `msmarco-100M_expirationtime_runbook.yaml`: A runbook using the `msmarco-100M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps.
7. `wikipedia-35M_expirationtime_runbook.yaml`: A runbook using the `wikipedia-35M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps.
8. `msturing-10M_slidingwindow_runbook.yaml`: A runbook using the `msturing-10M` dataset which inserts half the points, then maintains the index at a consistent size using a sliding window.
9. `clustered_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from the same cluster.
10. `random_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from a different randomly selected cluster.
Loading

0 comments on commit 4aea829

Please sign in to comment.