From 4aea829713b5a5aa702e57a01a56b039859090aa Mon Sep 17 00:00:00 2001 From: magdalendobson <58752279+magdalendobson@users.noreply.github.com> Date: Mon, 26 Aug 2024 17:16:05 -0400 Subject: [PATCH] Add "replace" to streaming challenge with new runbooks (#301) * added replace to load and compute gt * commit to switch * added support for replace * added README to streaming section * Update README.md Reworded README in a few places * got rid of accidental change to file --- benchmark/streaming/compute_gt.py | 56 ++- benchmark/streaming/load_runbook.py | 19 +- data_export.py | 5 + neurips23/streaming/README.md | 29 ++ .../streaming/clustered_replace_runbook.yaml | 450 ++++++++++++++++++ neurips23/streaming/gen_replace_runbooks.py | 149 ++++++ .../streaming/random_replace_runbook.yaml | 450 ++++++++++++++++++ neurips23/streaming/run.py | 5 + .../streaming/simple_replace_runbook.yaml | 30 ++ 9 files changed, 1176 insertions(+), 17 deletions(-) create mode 100644 neurips23/streaming/README.md create mode 100644 neurips23/streaming/clustered_replace_runbook.yaml create mode 100644 neurips23/streaming/gen_replace_runbooks.py create mode 100644 neurips23/streaming/random_replace_runbook.yaml create mode 100644 neurips23/streaming/simple_replace_runbook.yaml diff --git a/benchmark/streaming/compute_gt.py b/benchmark/streaming/compute_gt.py index 2260e8c10..909708609 100644 --- a/benchmark/streaming/compute_gt.py +++ b/benchmark/streaming/compute_gt.py @@ -2,22 +2,35 @@ import os import numpy as np +import sys +[sys.path.append(i) for i in ['.', '..']] + from benchmark.datasets import DATASETS from benchmark.streaming.load_runbook import load_runbook -def get_range_start_end(entry): - return np.arange(entry['start'], entry['end'], dtype=np.uint32) +def get_range_start_end(entry, tag_to_id): + for i in range(entry['end'] - entry['start']): + tag_to_id[i+entry['start']] = i+entry['start'] + return tag_to_id -def get_next_set(ids: np.ndarray, entry): +def get_next_set(tag_to_id: 
np.ndarray, entry): match entry['operation']: case 'insert': - range = get_range_start_end(entry) - return np.union1d(ids, range) + for i in range(entry['end'] - entry['start']): + tag_to_id[i+entry['start']] = i+entry['start'] + return tag_to_id case 'delete': - range = get_range_start_end(entry) - return np.setdiff1d(ids, range, assume_unique=True) + # delete is by key + for i in range(entry['end'] - entry['start']): + tag_to_id.pop(i + entry['start']) + return tag_to_id + case 'replace': + # replace key with value + for i in range(entry['tags_end'] - entry['tags_start']): + tag_to_id[i + entry['tags_start']] = entry['ids_start'] + i + return tag_to_id case 'search': - return ids + return tag_to_id case _: raise ValueError('Undefined entry in runbook') @@ -25,9 +38,19 @@ def gt_dir(ds, runbook_path): runbook_filename = os.path.split(runbook_path)[1] return os.path.join(ds.basedir, str(ds.nb), runbook_filename) -def output_gt(ds, ids, step, gt_cmdline, runbook_path): +def output_gt(ds, tag_to_id, step, gt_cmdline, runbook_path): + ids_list = [] + tags_list = [] + for tag, id in tag_to_id.items(): + ids_list.append(id) + tags_list.append(tag) + + ids = np.array(ids_list, dtype = np.uint32) + tags = np.array(tags_list, dtype = np.uint32) + + data = ds.get_data_in_range(0, ds.nb) - data_slice = data[ids] + data_slice = data[np.array(ids)] dir = gt_dir(ds, runbook_path) prefix = os.path.join(dir, 'step') + str(step) @@ -39,9 +62,9 @@ def output_gt(ds, ids, step, gt_cmdline, runbook_path): with open(tags_file, 'wb') as tf: one = 1 - tf.write(ids.size.to_bytes(4, byteorder='little')) + tf.write(tags.size.to_bytes(4, byteorder='little')) tf.write(one.to_bytes(4, byteorder='little')) - ids.tofile(tf) + tags.tofile(tf) with open(data_file, 'wb') as f: f.write(ids.size.to_bytes(4, byteorder='little')) #npts f.write(ds.d.to_bytes(4, byteorder='little')) @@ -111,14 +134,15 @@ def main(): step = 1 ids = np.empty(0, dtype=np.uint32) + for entry in runbook: + # the first step 
must be an insertion if step == 1: - ids = get_range_start_end(entry) + tag_to_id = get_range_start_end(entry, {}) else: - ids = get_next_set(ids, entry) - print(ids) + tag_to_id = get_next_set(tag_to_id, entry) if (entry['operation'] == 'search'): - output_gt(ds, ids, step, common_cmd, args.runbook_file) + output_gt(ds, tag_to_id, step, common_cmd, args.runbook_file) step += 1 if __name__ == '__main__': diff --git a/benchmark/streaming/load_runbook.py b/benchmark/streaming/load_runbook.py index b2cc12644..ec70e1da9 100644 --- a/benchmark/streaming/load_runbook.py +++ b/benchmark/streaming/load_runbook.py @@ -7,7 +7,7 @@ def load_runbook(dataset_name, max_pts, runbook_file): run_list = [] while i in runbook: entry = runbook.get(i) - if entry['operation'] not in {'insert', 'delete', 'search'}: + if entry['operation'] not in {'insert', 'delete', 'search', 'replace'}: raise Exception('Undefined runbook operation') if entry['operation'] in {'insert', 'delete'}: if 'start' not in entry: @@ -18,6 +18,23 @@ def load_runbook(dataset_name, max_pts, runbook_file): raise Exception('Start out of range in runbook') if entry['end'] < 0 or entry['end'] > max_pts: raise Exception('End out of range in runbook') + if entry['operation'] in {'replace'}: + if 'tags_start' not in entry: + raise Exception('Start of indices to be replaced not specified in runbook') + if 'tags_end' not in entry: + raise Exception('End of indices to be replaced not specified in runbook') + if 'ids_start' not in entry: + raise Exception('Start of indices to replace not specified in runbook') + if 'ids_end' not in entry: + raise Exception('End of indices to replace not specified in runbook') + if entry['tags_start'] < 0 or entry ['tags_start'] >= max_pts: + raise Exception('Start of indices to be replaced out of range in runbook') + if entry['tags_end'] < 0 or entry ['tags_end'] > max_pts: + raise Exception('End of indices to be replaced out of range in runbook') + if entry['ids_start'] < 0 or entry 
['ids_start'] >= max_pts: + raise Exception('Start of indices to replace out of range in runbook') + if entry['ids_end'] < 0 or entry ['ids_end'] > max_pts: + raise Exception('End of indices to replace out of range in runbook') i += 1 run_list.append(entry) diff --git a/data_export.py b/data_export.py index 12635c930..6c24216b5 100644 --- a/data_export.py +++ b/data_export.py @@ -97,6 +97,10 @@ def cleaned_run_metric(run_metrics): runbook_paths = [None] if track == 'streaming': runbook_paths = ['neurips23/streaming/simple_runbook.yaml', + 'neurips23/streaming/simple_replace_runbook.yaml', + 'neurips23/streaming/random_replace_runbook.yaml', + 'neurips23/streaming/clustered_replace_runbook.yaml', + 'neurips23/streaming/clustered_runbook.yaml', 'neurips23/streaming/clustered_runbook.yaml', 'neurips23/streaming/delete_runbook.yaml', 'neurips23/streaming/final_runbook.yaml', @@ -104,6 +108,7 @@ def cleaned_run_metric(run_metrics): 'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml', 'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml'] for runbook_path in runbook_paths: + print("Looking for runbook ", runbook_path) results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path) results = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \ args.sensors, args.search_times, args.private_query, \ diff --git a/neurips23/streaming/README.md b/neurips23/streaming/README.md new file mode 100644 index 000000000..1b66e222b --- /dev/null +++ b/neurips23/streaming/README.md @@ -0,0 +1,29 @@ +# NeurIPS 2023 Streaming Challenge and Beyond + +This README will discuss ongoing changes to the streaming benchmark challenge. See the NeurIPS23 README for instructions on how to execute runbooks and compute groundtruth for them. All changes here are backwards-compatible with those instructions. + +## Semantics + +The streaming runbooks support four operations: `search`, `insert`, `delete`, and a recent addition `replace`. 
The addition of replace, where a vector's data is modified in-place, prompts us to define the semantics of vector *tags* versus vector *ids*. + +Each vector is assumed to have a unique *id* which never changes throughout the course of a runbook. In the case of replaces, each vector is also assigned a numeric *tag*. The underlying vector id corresponding to a tag may change throughout the runbook when a vector is replaced. In the runbooks here, the tag of a vector is assumed to correspond to the vector id when a vector is first inserted, and then remains constant when the vector is replaced. For example, a vector with id #245 is first inserted with tag #245. If the vector is later replaced with vector id #1067, tag #245 now corresponds to vector id #1067. Upon another replace, tag #245 might next correspond to vector id #2428. This distinction leads us to define the semantics of each operation in terms of ids and tags: + +1. `search` provides a set of query vectors, and returns an array of tags corresponding to the nearest index vectors to each query vector. In this repository, each call to `search` in one runbook refers to the same set of query vectors. +2. `insert` provides a range of vector ids, whose tags are identical to their vector ids, to insert into the index. +3. `delete` provides a range of existing tags whose underlying data is to be deleted from the index and no longer returned as answers to queries. +4. `replace` provides a range of existing tags and a range of vector ids, such that each tag should henceforth correspond to the new vector id. + +## Available Runbooks + +Now that the number of runbooks has started to increase significantly, here we list the available runbooks with a brief description of each. + +1. `simple_runbook.yaml`: A runbook executing a short sequence of insertions, searches, and deletions to aid with debugging and testing. +2. 
`simple_replace_runbook.yaml`: A runbook executing a short sequence of inserts, searches, and replaces to aid with debugging and testing. +3. `clustered_runbook.yaml`: A runbook taking a clustered dataset (options are `random-xs-clustered` and `msturing-10M-clustered`) and inserting points in clustered order. +4. `delete_runbook.yaml`: A runbook executing all steps in the clustered runbook, but which then deletes a fraction of each cluster. +5. `final_runbook.yaml`: The NeurIPS 2023 streaming challenge final runbook. It takes the `msturing-30M-clustered` dataset and performs several rounds of insertion and deletion in clustered order. +6. `msmarco-100M_expirationtime_runbook.yaml`: A runbook using the `msmarco-100M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps. +7. `wikipedia-35M_expirationtime_runbook.yaml`: A runbook using the `wikipedia-35M` dataset which inserts each point with a randomly chosen expiration time: never, in 200 steps, or in 50 steps. +8. `msturing-10M_slidingwindow_runbook.yaml`: A runbook using the `msturing-10M` dataset which inserts half the points, then maintains the index at a consistent size using a sliding window. +9. `clustered_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from the same cluster. +10. `random_replace_runbook.yaml`: A replace-focused runbook which takes the `msturing-10M-clustered` dataset, inserts a fraction of the points in each cluster, then replaces some of that fraction with vector ids from a different randomly selected cluster. 
diff --git a/neurips23/streaming/clustered_replace_runbook.yaml b/neurips23/streaming/clustered_replace_runbook.yaml new file mode 100644 index 000000000..de3abcaac --- /dev/null +++ b/neurips23/streaming/clustered_replace_runbook.yaml @@ -0,0 +1,450 @@ +msturing-10M-clustered: + max_pts: 7191263 + 1: + end: 184033 + operation: insert + start: 0 + 2: + operation: search + 3: + end: 441437 + operation: insert + start: 255771 + 4: + operation: search + 5: + end: 738616 + operation: insert + start: 491965 + 6: + operation: search + 7: + end: 1008884 + operation: insert + start: 824781 + 8: + operation: search + 9: + end: 1407605 + operation: insert + start: 1081209 + 10: + operation: search + 11: + end: 1864833 + operation: insert + start: 1568760 + 12: + operation: search + 13: + end: 2259572 + operation: insert + start: 1959174 + 14: + operation: search + 15: + end: 2742135 + operation: insert + start: 2404186 + 16: + operation: search + 17: + end: 3050396 + operation: insert + start: 2798660 + 18: + operation: search + 19: + end: 3342738 + operation: insert + start: 3082959 + 20: + operation: search + 21: + end: 3832037 + operation: insert + start: 3480554 + 22: + operation: search + 23: + end: 4112969 + operation: insert + start: 3910930 + 24: + operation: search + 25: + end: 4527913 + operation: insert + start: 4194870 + 26: + operation: search + 27: + end: 4844097 + operation: insert + start: 4652840 + 28: + operation: search + 29: + end: 5037538 + operation: insert + start: 4872616 + 30: + operation: search + 31: + end: 5422398 + operation: insert + start: 5184725 + 32: + operation: search + 33: + end: 5829295 + operation: insert + start: 5629098 + 34: + operation: search + 35: + end: 6247916 + operation: insert + start: 6023119 + 36: + operation: search + 37: + end: 6468216 + operation: insert + start: 6292969 + 38: + operation: search + 39: + end: 6728355 + operation: insert + start: 6508987 + 40: + operation: search + 41: + end: 6975224 + operation: insert + 
start: 6767675 + 42: + operation: search + 43: + end: 7216362 + operation: insert + start: 7000498 + 44: + operation: search + 45: + end: 7415603 + operation: insert + start: 7263856 + 46: + operation: search + 47: + end: 7692157 + operation: insert + start: 7485517 + 48: + operation: search + 49: + end: 7912750 + operation: insert + start: 7739934 + 50: + operation: search + 51: + end: 8301620 + operation: insert + start: 8055691 + 52: + operation: search + 53: + end: 8586722 + operation: insert + start: 8381008 + 54: + operation: search + 55: + end: 8919414 + operation: insert + start: 8750107 + 56: + operation: search + 57: + end: 9141661 + operation: insert + start: 8942969 + 58: + operation: search + 59: + end: 9413396 + operation: insert + start: 9223315 + 60: + operation: search + 61: + end: 9638406 + operation: insert + start: 9508781 + 62: + operation: search + 63: + end: 9947236 + operation: insert + start: 9722747 + 64: + operation: search + 65: + ids_end: 216756 + ids_start: 184033 + operation: replace + tags_end: 32723 + tags_start: 0 + 66: + operation: search + 67: + ids_end: 470158 + ids_start: 441437 + operation: replace + tags_end: 284492 + tags_start: 255771 + 68: + operation: search + 69: + ids_end: 740235 + ids_start: 738616 + operation: replace + tags_end: 493584 + tags_start: 491965 + 70: + operation: search + 71: + ids_end: 1053554 + ids_start: 1008884 + operation: replace + tags_end: 869451 + tags_start: 824781 + 72: + operation: search + 73: + ids_end: 1506247 + ids_start: 1407605 + operation: replace + tags_end: 1179851 + tags_start: 1081209 + 74: + operation: search + 75: + ids_end: 1923035 + ids_start: 1864833 + operation: replace + tags_end: 1626962 + tags_start: 1568760 + 76: + operation: search + 77: + ids_end: 2396051 + ids_start: 2259572 + operation: replace + tags_end: 2095653 + tags_start: 1959174 + 78: + operation: search + 79: + ids_end: 2780674 + ids_start: 2742135 + operation: replace + tags_end: 2442725 + tags_start: 2404186 
+ 80: + operation: search + 81: + ids_end: 3062102 + ids_start: 3050396 + operation: replace + tags_end: 2810366 + tags_start: 2798660 + 82: + operation: search + 83: + ids_end: 3402967 + ids_start: 3342738 + operation: replace + tags_end: 3143188 + tags_start: 3082959 + 84: + operation: search + 85: + ids_end: 3887075 + ids_start: 3832037 + operation: replace + tags_end: 3535592 + tags_start: 3480554 + 86: + operation: search + 87: + ids_end: 4117901 + ids_start: 4112969 + operation: replace + tags_end: 3915862 + tags_start: 3910930 + 88: + operation: search + 89: + ids_end: 4611210 + ids_start: 4527913 + operation: replace + tags_end: 4278167 + tags_start: 4194870 + 90: + operation: search + 91: + ids_end: 4863222 + ids_start: 4844097 + operation: replace + tags_end: 4671965 + tags_start: 4652840 + 92: + operation: search + 93: + ids_end: 5068503 + ids_start: 5037538 + operation: replace + tags_end: 4903581 + tags_start: 4872616 + 94: + operation: search + 95: + ids_end: 5449047 + ids_start: 5422398 + operation: replace + tags_end: 5211374 + tags_start: 5184725 + 96: + operation: search + 97: + ids_end: 5890432 + ids_start: 5829295 + operation: replace + tags_end: 5690235 + tags_start: 5629098 + 98: + operation: search + 99: + ids_end: 6264302 + ids_start: 6247916 + operation: replace + tags_end: 6039505 + tags_start: 6023119 + 100: + operation: search + 101: + ids_end: 6491463 + ids_start: 6468216 + operation: replace + tags_end: 6316216 + tags_start: 6292969 + 102: + operation: search + 103: + ids_end: 6745600 + ids_start: 6728355 + operation: replace + tags_end: 6526232 + tags_start: 6508987 + 104: + operation: search + 105: + ids_end: 7000204 + ids_start: 6975224 + operation: replace + tags_end: 6792655 + tags_start: 6767675 + 106: + operation: search + 107: + ids_end: 7221208 + ids_start: 7216362 + operation: replace + tags_end: 7005344 + tags_start: 7000498 + 108: + operation: search + 109: + ids_end: 7430206 + ids_start: 7415603 + operation: replace + 
tags_end: 7278459 + tags_start: 7263856 + 110: + operation: search + 111: + ids_end: 7699863 + ids_start: 7692157 + operation: replace + tags_end: 7493223 + tags_start: 7485517 + 112: + operation: search + 113: + ids_end: 8006105 + ids_start: 7912750 + operation: replace + tags_end: 7833289 + tags_start: 7739934 + 114: + operation: search + 115: + ids_end: 8321728 + ids_start: 8301620 + operation: replace + tags_end: 8075799 + tags_start: 8055691 + 116: + operation: search + 117: + ids_end: 8662910 + ids_start: 8586722 + operation: replace + tags_end: 8457196 + tags_start: 8381008 + 118: + operation: search + 119: + ids_end: 8925171 + ids_start: 8919414 + operation: replace + tags_end: 8755864 + tags_start: 8750107 + 120: + operation: search + 121: + ids_end: 9154641 + ids_start: 9141661 + operation: replace + tags_end: 8955949 + tags_start: 8942969 + 122: + operation: search + 123: + ids_end: 9423924 + ids_start: 9413396 + operation: replace + tags_end: 9233843 + tags_start: 9223315 + 124: + operation: search + 125: + ids_end: 9693761 + ids_start: 9638406 + operation: replace + tags_end: 9564136 + tags_start: 9508781 + 126: + operation: search + 127: + ids_end: 9954527 + ids_start: 9947236 + operation: replace + tags_end: 9730038 + tags_start: 9722747 + 128: + operation: search diff --git a/neurips23/streaming/gen_replace_runbooks.py b/neurips23/streaming/gen_replace_runbooks.py new file mode 100644 index 000000000..ccaa5ec24 --- /dev/null +++ b/neurips23/streaming/gen_replace_runbooks.py @@ -0,0 +1,149 @@ +import argparse +import os +import numpy as np +import yaml + +import sys +[sys.path.append(i) for i in ['.', '..', '../..']] + +from scipy.cluster.vq import vq, kmeans2 +from typing import Tuple +from benchmark.datasets import DATASETS +from benchmark.streaming.load_runbook import load_runbook + +#extract cluster information from msturing-10M-clustered +def extract_clusters(runbook_path, max_pts, ds_name): + max_pts, run_list = load_runbook(ds_name, max_pts, 
runbook_path) + clusters = [] + for entry in run_list: + match entry['operation']: + case 'insert': + clusters.append((entry['start'], entry['end'])) + case _: + continue + return clusters + + + +# runbook will do the following: +# 1) insert a random fraction > .5 of each of the 32 clusters with searches interleaved +# 2) for each cluster, replace its beginning prefix with a random fraction of the remaining +# points with searches interleaved +def write_replace_clustered_runbook(clusters, output_yaml_file, dataset_str): + inserted_clusters = [] + operation_list = [] + max_pts = 0 + num_operations = 1 + active_points = 0 + # add seed to make operation deterministic + np.random.seed(0) + + #step 1: insert + for cluster in clusters: + fraction = np.random.uniform(.5, .9) + delta = int(fraction*(cluster[1]-cluster[0])) + active_points += delta + max_pts = max(max_pts, active_points) + cluster_to_insert = (cluster[0], cluster[0]+delta) + inserted_clusters.append(cluster_to_insert) + entry = {'operation': 'insert','start': int(cluster_to_insert[0]), 'end': int(cluster_to_insert[1])} + operation_list.append((num_operations, entry)) + num_operations += 1 + operation_list.append((num_operations, {'operation': str('search')})) + num_operations += 1 + + #step 2: replace + for inserted_cluster, full_cluster in zip(inserted_clusters, clusters): + fraction = np.random.uniform(0,1.0) + delta = int(fraction*(full_cluster[1] - inserted_cluster[1])) + assert delta <= inserted_cluster[1] - inserted_cluster[0] + replace_tags_start = inserted_cluster[0] + replace_tags_end = replace_tags_start + delta + replace_ids_start = inserted_cluster[1] + replace_ids_end = inserted_cluster[1] + delta + entry = {'operation': 'replace', 'tags_start': replace_tags_start, 'tags_end': replace_tags_end, 'ids_start': replace_ids_start, 'ids_end': replace_ids_end} + operation_list.append((num_operations, entry)) + num_operations += 1 + operation_list.append((num_operations, {'operation': str('search')})) 
+ num_operations += 1 + + #write to yaml file + with open(output_yaml_file, 'w') as yf: + operation_list.sort(key = lambda x: x[0]) + sorted_dict = {} + sorted_dict['max_pts'] = int(max_pts) + for (k, v) in operation_list: + sorted_dict[k]=v + yaml_object = {} + yaml_object[dataset_str] = sorted_dict + yaml.dump(yaml_object, yf) + + + +# runbook will do the following: +# 1) insert a random fraction > .5 of each of the 32 clusters with searches interleaved +# 2) for each cluster, select a *random* cluster and replace its beginning prefix +# with a random fraction of remaining points in that cluster with searches interleaved +def write_replace_random_runbook(clusters, output_yaml_file, dataset_str): + inserted_clusters = [] + operation_list = [] + max_pts = 0 + num_operations = 1 + active_points = 0 + # add seed to make operation deterministic + np.random.seed(1) + + #step 1: insert + for cluster in clusters: + fraction = np.random.uniform(.5, .9) + delta = int(fraction*(cluster[1]-cluster[0])) + active_points += delta + max_pts = max(max_pts, active_points) + cluster_to_insert = (cluster[0], cluster[0]+delta) + inserted_clusters.append(cluster_to_insert) + entry = {'operation': 'insert','start': int(cluster_to_insert[0]), 'end': int(cluster_to_insert[1])} + operation_list.append((num_operations, entry)) + num_operations += 1 + operation_list.append((num_operations, {'operation': str('search')})) + num_operations += 1 + + cluster_ids = np.random.permutation(32) + + #step 2: replace + for c in range(32): + fraction = np.random.uniform(0,1.0) + full_cluster_random = clusters[cluster_ids[c]] + inserted_cluster_random = inserted_clusters[cluster_ids[c]] + this_cluster = inserted_clusters[c] + this_cluster_size = this_cluster[1] - this_cluster[0] + delta = min(this_cluster_size, int(fraction*(full_cluster_random[1] - inserted_cluster_random[1]))) + assert delta <= this_cluster[1] - this_cluster[0] + replace_tags_start = this_cluster[0] + replace_tags_end = 
replace_tags_start + delta + replace_ids_start = inserted_cluster_random[1] + replace_ids_end = replace_ids_start + delta + entry = {'operation': 'replace', 'tags_start': replace_tags_start, 'tags_end': replace_tags_end, 'ids_start': replace_ids_start, 'ids_end': replace_ids_end} + operation_list.append((num_operations, entry)) + num_operations += 1 + operation_list.append((num_operations, {'operation': str('search')})) + num_operations += 1 + + #write to yaml file + with open(output_yaml_file, 'w') as yf: + operation_list.sort(key = lambda x: x[0]) + sorted_dict = {} + sorted_dict['max_pts'] = int(max_pts) + for (k, v) in operation_list: + sorted_dict[k]=v + yaml_object = {} + yaml_object[dataset_str] = sorted_dict + yaml.dump(yaml_object, yf) + + +ds = DATASETS['msturing-10M-clustered'] +cluster_runbook_path='clustered_runbook.yaml' +clustered_replace_yaml='clustered_replace_runbook.yaml' +clustered_random_yaml='random_replace_runbook.yaml' +clusters = extract_clusters(cluster_runbook_path, 10000000, 'msturing-10M-clustered') +write_replace_clustered_runbook(clusters, clustered_replace_yaml, 'msturing-10M-clustered') +write_replace_random_runbook(clusters, clustered_random_yaml, 'msturing-10M-clustered') \ No newline at end of file diff --git a/neurips23/streaming/random_replace_runbook.yaml b/neurips23/streaming/random_replace_runbook.yaml new file mode 100644 index 000000000..44af3ebf5 --- /dev/null +++ b/neurips23/streaming/random_replace_runbook.yaml @@ -0,0 +1,450 @@ +msturing-10M-clustered: + max_pts: 6641122 + 1: + end: 170550 + operation: insert + start: 0 + 2: + operation: search + 3: + end: 441922 + operation: insert + start: 255771 + 4: + operation: search + 5: + end: 658388 + operation: insert + start: 491965 + 6: + operation: search + 7: + end: 984005 + operation: insert + start: 824781 + 8: + operation: search + 9: + end: 1353604 + operation: insert + start: 1081209 + 10: + operation: search + 11: + end: 1778387 + operation: insert + start: 1568760 
+ 12: + operation: search + 13: + end: 2214835 + operation: insert + start: 1959174 + 14: + operation: search + 15: + end: 2655948 + operation: insert + start: 2404186 + 16: + operation: search + 17: + end: 2985929 + operation: insert + start: 2798660 + 18: + operation: search + 19: + end: 3367448 + operation: insert + start: 3082959 + 20: + operation: search + 21: + end: 3767906 + operation: insert + start: 3480554 + 22: + operation: search + 23: + end: 4130724 + operation: insert + start: 3910930 + 24: + operation: search + 25: + end: 4461308 + operation: insert + start: 4194870 + 26: + operation: search + 27: + end: 4839923 + operation: insert + start: 4652840 + 28: + operation: search + 29: + end: 5032089 + operation: insert + start: 4872616 + 30: + operation: search + 31: + end: 5526086 + operation: insert + start: 5184725 + 32: + operation: search + 33: + end: 5891879 + operation: insert + start: 5629098 + 34: + operation: search + 35: + end: 6218348 + operation: insert + start: 6023119 + 36: + operation: search + 37: + end: 6413108 + operation: insert + start: 6292969 + 38: + operation: search + 39: + end: 6658829 + operation: insert + start: 6508987 + 40: + operation: search + 41: + end: 6958659 + operation: insert + start: 6767675 + 42: + operation: search + 43: + end: 7234176 + operation: insert + start: 7000498 + 44: + operation: search + 45: + end: 7402476 + operation: insert + start: 7263856 + 46: + operation: search + 47: + end: 7683180 + operation: insert + start: 7485517 + 48: + operation: search + 49: + end: 8008502 + operation: insert + start: 7739934 + 50: + operation: search + 51: + end: 8334761 + operation: insert + start: 8055691 + 52: + operation: search + 53: + end: 8578113 + operation: insert + start: 8381008 + 54: + operation: search + 55: + end: 8849550 + operation: insert + start: 8750107 + 56: + operation: search + 57: + end: 9102186 + operation: insert + start: 8942969 + 58: + operation: search + 59: + end: 9466319 + operation: insert 
+ start: 9223315 + 60: + operation: search + 61: + end: 9624181 + operation: insert + start: 9508781 + 62: + operation: search + 63: + end: 9908074 + operation: insert + start: 9722747 + 64: + operation: search + 65: + ids_end: 9970476 + ids_start: 9908074 + operation: replace + tags_end: 62402 + tags_start: 0 + 66: + operation: search + 67: + ids_end: 7420049 + ids_start: 7402476 + operation: replace + tags_end: 273344 + tags_start: 255771 + 68: + operation: search + 69: + ids_end: 8623785 + ids_start: 8578113 + operation: replace + tags_end: 537637 + tags_start: 491965 + 70: + operation: search + 71: + ids_end: 1867257 + ids_start: 1778387 + operation: replace + tags_end: 913651 + tags_start: 824781 + 72: + operation: search + 73: + ids_end: 9468584 + ids_start: 9466319 + operation: replace + tags_end: 1083474 + tags_start: 1081209 + 74: + operation: search + 75: + ids_end: 9171728 + ids_start: 9102186 + operation: replace + tags_end: 1638302 + tags_start: 1568760 + 76: + operation: search + 77: + ids_end: 449264 + ids_start: 441922 + operation: replace + tags_end: 1966516 + tags_start: 1959174 + 78: + operation: search + 79: + ids_end: 2326420 + ids_start: 2214835 + operation: replace + tags_end: 2515771 + tags_start: 2404186 + 80: + operation: search + 81: + ids_end: 3053826 + ids_start: 2985929 + operation: replace + tags_end: 2866557 + tags_start: 2798660 + 82: + operation: search + 83: + ids_end: 5047708 + ids_start: 5032089 + operation: replace + tags_end: 3098578 + tags_start: 3082959 + 84: + operation: search + 85: + ids_end: 3827125 + ids_start: 3767906 + operation: replace + tags_end: 3539773 + tags_start: 3480554 + 86: + operation: search + 87: + ids_end: 7722589 + ids_start: 7683180 + operation: replace + tags_end: 3950339 + tags_start: 3910930 + 88: + operation: search + 89: + ids_end: 2715056 + ids_start: 2655948 + operation: replace + tags_end: 4253978 + tags_start: 4194870 + 90: + operation: search + 91: + ids_end: 666699 + ids_start: 658388 + 
operation: replace + tags_end: 4661151 + tags_start: 4652840 + 92: + operation: search + 93: + ids_end: 6464489 + ids_start: 6413108 + operation: replace + tags_end: 4923997 + tags_start: 4872616 + 94: + operation: search + 95: + ids_end: 8911561 + ids_start: 8849550 + operation: replace + tags_end: 5246736 + tags_start: 5184725 + 96: + operation: search + 97: + ids_end: 214429 + ids_start: 170550 + operation: replace + tags_end: 5672977 + tags_start: 5629098 + 98: + operation: search + 99: + ids_end: 5623390 + ids_start: 5526086 + operation: replace + tags_end: 6120423 + tags_start: 6023119 + 100: + operation: search + 101: + ids_end: 3433790 + ids_start: 3367448 + operation: replace + tags_end: 6359311 + tags_start: 6292969 + 102: + operation: search + 103: + ids_end: 6757160 + ids_start: 6658829 + operation: replace + tags_end: 6607318 + tags_start: 6508987 + 104: + operation: search + 105: + ids_end: 4844417 + ids_start: 4839923 + operation: replace + tags_end: 6772169 + tags_start: 6767675 + 106: + operation: search + 107: + ids_end: 5910157 + ids_start: 5891879 + operation: replace + tags_end: 7018776 + tags_start: 7000498 + 108: + operation: search + 109: + ids_end: 6992439 + ids_start: 6958659 + operation: replace + tags_end: 7297636 + tags_start: 7263856 + 110: + operation: search + 111: + ids_end: 4537475 + ids_start: 4461308 + operation: replace + tags_end: 7561684 + tags_start: 7485517 + 112: + operation: search + 113: + ids_end: 4141330 + ids_start: 4130724 + operation: replace + tags_end: 7750540 + tags_start: 7739934 + 114: + operation: search + 115: + ids_end: 9715601 + ids_start: 9624181 + operation: replace + tags_end: 8147111 + tags_start: 8055691 + 116: + operation: search + 117: + ids_end: 8024912 + ids_start: 8008502 + operation: replace + tags_end: 8397418 + tags_start: 8381008 + 118: + operation: search + 119: + ids_end: 1453047 + ids_start: 1353604 + operation: replace + tags_end: 8849550 + tags_start: 8750107 + 120: + operation: search + 
121: + ids_end: 1054574 + ids_start: 984005 + operation: replace + tags_end: 9013538 + tags_start: 8942969 + 122: + operation: search + 123: + ids_end: 8375611 + ids_start: 8334761 + operation: replace + tags_end: 9264165 + tags_start: 9223315 + 124: + operation: search + 125: + ids_end: 6264887 + ids_start: 6218348 + operation: replace + tags_end: 9555320 + tags_start: 9508781 + 126: + operation: search + 127: + ids_end: 7256463 + ids_start: 7234176 + operation: replace + tags_end: 9745034 + tags_start: 9722747 + 128: + operation: search diff --git a/neurips23/streaming/run.py b/neurips23/streaming/run.py index 9ef1541cb..f154c5744 100644 --- a/neurips23/streaming/run.py +++ b/neurips23/streaming/run.py @@ -45,6 +45,11 @@ def run_task(algo, ds, distance, count, run_count, search_type, private_query, r case 'delete': ids = np.arange(entry['start'], entry['end'], dtype=np.uint32) algo.delete(ids) + case 'replace': + tags_to_replace = np.arange(entry['tags_start'], entry['tags_end'], dtype=np.uint32) + ids_start = entry['ids_start'] + ids_end = entry['ids_end'] + algo.replace(ds.get_data_in_range(ids_start, ids_end), tags_to_replace) case 'search': if search_type == 'knn': algo.query(Q, count) diff --git a/neurips23/streaming/simple_replace_runbook.yaml b/neurips23/streaming/simple_replace_runbook.yaml new file mode 100644 index 000000000..aeaa0c2b2 --- /dev/null +++ b/neurips23/streaming/simple_replace_runbook.yaml @@ -0,0 +1,30 @@ +random-xs: + max_pts: 10000 + 1: + operation: "insert" + start: 0 + end: 7500 + 2: + operation: "search" + 3: + operation: "replace" + tags_start: 0 + tags_end: 2500 + ids_start: 7500 + ids_end: 10000 + 4: + operation: "search" + 5: + operation: "replace" + tags_start: 0 + tags_end: 2500 + ids_start: 0 + ids_end: 2500 + 6: + operation: "search" + 7: + operation: "delete" + start: 2500 + end: 5000 + 8: + operation: "search" \ No newline at end of file