Re-organize runbooks and add wiki replace runbook #312

Merged
merged 11 commits on Oct 18, 2024
1 change: 1 addition & 0 deletions benchmark/streaming/load_runbook.py
@@ -30,6 +30,7 @@ def load_runbook(dataset_name, max_pts, runbook_file):
if entry['tags_start'] < 0 or entry ['tags_start'] >= max_pts:
    raise Exception('Start of indices to be replaced out of range in runbook')
if entry['tags_end'] < 0 or entry ['tags_end'] > max_pts:
+   print(entry['tags_end'])
    raise Exception('End of indices to be replaced out of range in runbook')
if entry['ids_start'] < 0 or entry ['ids_start'] >= max_pts:
    raise Exception('Start of indices to replace out of range in runbook')
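For context, these range checks validate the fields of a runbook `replace` step. A minimal sketch of an entry that would pass them, with purely hypothetical values (`max_pts = 10000` is assumed here and is not taken from this PR):

```
# Hypothetical 'replace' step: the tags_* fields name the already-indexed points
# being overwritten, and the ids_* fields name the new points written over them.
max_pts = 10000
entry = {
    'operation': 'replace',
    'tags_start': 0,    'tags_end': 500,     # must lie within [0, max_pts]
    'ids_start': 9500,  'ids_end': 10000,    # must lie within [0, max_pts]
}
assert 0 <= entry['tags_start'] < max_pts and 0 <= entry['tags_end'] <= max_pts
assert 0 <= entry['ids_start'] < max_pts and 0 <= entry['ids_end'] <= max_pts
```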
23 changes: 12 additions & 11 deletions data_export.py
@@ -96,17 +96,18 @@ def cleaned_run_metric(run_metrics):
dataset = DATASETS[dataset_name]()
runbook_paths = [None]
if track == 'streaming':
-   runbook_paths = ['neurips23/streaming/simple_runbook.yaml',
-                    'neurips23/streaming/simple_replace_runbook.yaml',
-                    'neurips23/streaming/random_replace_runbook.yaml',
-                    'neurips23/streaming/clustered_replace_runbook.yaml',
-                    'neurips23/streaming/clustered_runbook.yaml',
-                    'neurips23/streaming/clustered_runbook.yaml',
-                    'neurips23/streaming/delete_runbook.yaml',
-                    'neurips23/streaming/final_runbook.yaml',
-                    'neurips23/streaming/msturing-10M_slidingwindow_runbook.yaml',
-                    'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml',
-                    'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml']
+   runbook_paths = ['neurips23/runbooks/streaming/simple_runbook.yaml',
+                    'neurips23/runbooks/streaming/simple_replace_runbook.yaml',
+                    'neurips23/runbooks/streaming/random_replace_runbook.yaml',
+                    'neurips23/runbooks/streaming/clustered_replace_runbook.yaml',
+                    'neurips23/runbooks/streaming/clustered_runbook.yaml',
+                    'neurips23/runbooks/streaming/clustered_runbook.yaml',
+                    'neurips23/runbooks/streaming/delete_runbook.yaml',
+                    'neurips23/runbooks/streaming/final_runbook.yaml',
+                    'neurips23/runbooks/streaming/msturing-10M_slidingwindow_runbook.yaml',
+                    'neurips23/runbooks/streaming/wikipedia-35M_expirationtime_runbook.yaml',
+                    'neurips23/runbooks/streaming/wikipedia-35M_expiration_time_replace_runbook.yaml',
+                    'neurips23/runbooks/streaming/msmarco-100M_expirationtime_runbook.yaml']
for runbook_path in runbook_paths:
    print("Looking for runbook ", runbook_path)
    results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
20 changes: 10 additions & 10 deletions neurips23/README.md
@@ -46,8 +46,8 @@ The baselines were run on an Azure Standard D8lds v5 (8 vcpus, 16 GiB memory) ma
|---------|-------------|-----------------------------|---------|
|Sparse | Linear Scan | 101 | `python3 run.py --dataset sparse-full --algorithm linscan --neurips23track sparse` |
|Filter | faiss | 3200 | `python3 run.py --dataset yfcc-10M --algorithm faiss --neurips23track filter` |
-|Streaming| DiskANN | 0.924 (recall@10), 23 mins | `python3 run.py --dataset msturing-10M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/delete_runbook.yaml` |
-|Streaming| DiskANN | 0.883 (recall@10), 45 mins | `python3 run.py --dataset msturing-30M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/final_runbook.yaml` |
+|Streaming| DiskANN | 0.924 (recall@10), 23 mins | `python3 run.py --dataset msturing-10M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/runbooks/delete_runbook.yaml` |
+|Streaming| DiskANN | 0.883 (recall@10), 45 mins | `python3 run.py --dataset msturing-30M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/runbooks/final_runbook.yaml` |
|OOD | DiskANN | 4882 | `python3 run.py --dataset text2image-10M --algorithm diskann --neurips23track ood` |

## For_Participants
@@ -99,7 +99,7 @@ Test the benchmark and baseline using the algorithm's definition file on small t
python run.py --neurips23track filter --algorithm faiss --dataset random-filter-s
python run.py --neurips23track sparse --algorithm linscan --dataset sparse-small
python run.py --neurips23track ood --algorithm diskann --dataset random-xs
-python run.py --neurips23track streaming --algorithm diskann --dataset random-xs --runbook_path neurips23/streaming/simple_runbook.yaml
+python run.py --neurips23track streaming --algorithm diskann --dataset random-xs --runbook_path neurips23/runbooks/simple_runbook.yaml
```

For the competition dataset, run commands mentioned in the table above, for example:
@@ -108,22 +108,22 @@ python run.py --neurips23track filter --algorithm faiss --dataset yfcc-10M
python run.py --neurips23track sparse --algorithm linscan --dataset sparse-full
python run.py --neurips23track ood --algorithm diskann --dataset text2image-10M
# preliminary runbook for testing
-python run.py --neurips23track streaming --algorithm diskann --dataset msturing-10M-clustered --runbook_path neurips23/streaming/delete_runbook.yaml
+python run.py --neurips23track streaming --algorithm diskann --dataset msturing-10M-clustered --runbook_path neurips23/runbooks/delete_runbook.yaml
#Final runbook for evaluation
-python run.py --neurips23track streaming --algorithm diskann --dataset msturing-30M-clustered --runbook_path neurips23/streaming/final_runbook.yaml
+python run.py --neurips23track streaming --algorithm diskann --dataset msturing-30M-clustered --runbook_path neurips23/runbooks/final_runbook.yaml
```

For the streaming track, the runbook specifies the order of operations to be executed by the algorithms. To download the ground truth for every search operation (requires the azcopy tool in your binary path):
```
-python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
-python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
-python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/final_runbook.yaml --dataset msturing-30M-clustered
+python -m benchmark.streaming.download_gt --runbook_file neurips23/runbooks/simple_runbook.yaml --dataset msspacev-10M
+python -m benchmark.streaming.download_gt --runbook_file neurips23/runbooks/delete_runbook.yaml --dataset msturing-10M-clustered
+python -m benchmark.streaming.download_gt --runbook_file neurips23/runbooks/final_runbook.yaml --dataset msturing-30M-clustered
```
Alternatively, to compute ground truth for an arbitrary runbook, [clone and build the DiskANN repo](https://github.com/Microsoft/DiskANN) and use its command-line tool to compute ground truth at the various search checkpoints. The `--gt_cmdline_tool` argument points to the DiskANN `compute_groundtruth` command-line tool.
```
-python benchmark/streaming/compute_gt.py --dataset msspacev-10M --runbook neurips23/streaming/simple_runbook.yaml --gt_cmdline_tool ~/DiskANN/build/apps/utils/compute_groundtruth
+python benchmark/streaming/compute_gt.py --dataset msspacev-10M --runbook neurips23/runbooks/simple_runbook.yaml --gt_cmdline_tool ~/DiskANN/build/apps/utils/compute_groundtruth
```
-Consider also the examples in runbooks [here](neurips23/streaming/clustered_runbook.yaml) and [here](neurips23/streaming/delete_runbook.yaml). The datasets here are [generated](neurips23/streaming/clustered_data_gen.py) by clustering the original dataset with k-means and packing points in the same cluster into contiguous indices. Insertions are then performed one cluster at a time. This runbook tests whether an indexing algorithm can adapt to data drift. The `max_pts` entry for the dataset in the runbook indicates an upper bound on the number of active points that the index must support during the runbook execution.
+Consider also the examples in runbooks [here](neurips23/runbooks/clustered_runbook.yaml) and [here](neurips23/runbooks/delete_runbook.yaml). The datasets here are [generated](neurips23/runbooks/clustered_data_gen.py) by clustering the original dataset with k-means and packing points in the same cluster into contiguous indices. Insertions are then performed one cluster at a time. This runbook tests whether an indexing algorithm can adapt to data drift. The `max_pts` entry for the dataset in the runbook indicates an upper bound on the number of active points that the index must support during the runbook execution.
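As a rough illustration of that clustered-runbook idea, the sketch below clusters a toy dataset with k-means, packs each cluster into a contiguous index range, and emits a runbook that inserts one cluster at a time with a search after each insertion. It is not the repository's `clustered_data_gen.py`; the dataset key, cluster count, and output file name are made up, and numpy, scikit-learn, and PyYAML are assumed to be installed.

```
import numpy as np
import yaml
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
points = rng.standard_normal((1000, 16)).astype(np.float32)

k = 4
labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(points)
order = np.argsort(labels, kind='stable')   # pack each cluster into contiguous indices
packed = points[order]                      # this reordered array is what the runbook indexes refer to

runbook = {'toy-dataset': {}}
t, offset, active, max_pts = 1, 0, 0, 0
for c in range(k):
    size = int(np.sum(labels == c))
    # insert one cluster at a time, then search
    runbook['toy-dataset'][t] = {'operation': 'insert', 'start': offset, 'end': offset + size}
    t += 1
    runbook['toy-dataset'][t] = {'operation': 'search'}
    t += 1
    offset += size
    active += size
    max_pts = max(max_pts, active)          # upper bound on concurrently active points
runbook['toy-dataset']['max_pts'] = max_pts

with open('toy_clustered_runbook.yaml', 'w') as f:
    yaml.dump(runbook, f, default_flow_style=False)
```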


To make the results available for post-processing, change permissions of the results folder
176 changes: 176 additions & 0 deletions neurips23/runbooks/gen_expiration_time_runbook.py
@@ -0,0 +1,176 @@
import yaml
import os
import random

'''
dataset_name: dataset key as specified in benchmark/datasets.py
dataset_size: size of the dataset
max_t: number of timesteps
runbook_filename: name to save the runbook to
ratios: tuple of three numbers indicating proportion of deletes/replaces assigned to each timestep
timesteps: how long to wait before deleting for each ratio
seed: seed given to random generator
do_replace: whether to include replace in runbook or not
'''
def gen_exp_time_runbook(dataset_name, dataset_size, max_t, runbook_filename, ratios, timesteps, seed = 0, do_replace = False, gt_url = None, do_delete = True):
    random.seed(seed)
    data = {dataset_name: {}}

    max_num_points=0
    num_points=0

    batch_size = dataset_size//max_t
    to_delete=[[] for _ in range(max_t)]
    to_replace=[[] for _ in range(max_t)]

    t=1

    for i in range(max_t):
        if do_replace:
            fraction = random.uniform(.5, .9)
        else:
            fraction = 1.0
        start = i*batch_size
        end = start + int(fraction*batch_size)
        ids_start = end
        ids_end = (i+1)*batch_size
        tags_start = i*batch_size
        tags_end = tags_start + (ids_end - ids_start)
        replace_info = (tags_start, tags_end, ids_start, ids_end)
        delete_info = (tags_start, end)
        data[dataset_name][t]={
            'operation': 'insert',
            'start': i*(batch_size),
            'end': end
        }
        t+=1

        num_points+=int(fraction*batch_size)

        max_num_points=max(max_num_points,num_points)

        data_type = random.randint(0, ratios[2])
        if do_delete:
            if data_type <= ratios[0]:
                pass
            elif data_type > ratios[0] and data_type < ratios[1]:
                if (i+timesteps[1] < max_t):
                    to_delete[i+timesteps[1]].append(delete_info)
            else:
                if (i+timesteps[2] < max_t):
                    to_delete[i+timesteps[2]].append(delete_info)

        if do_replace:
            if data_type <= ratios[0]:
                remaining_steps = (max_t - t)//2
                to_replace[i+remaining_steps].append(replace_info)
                # with probability 1/19, the points get replaced at t_max-t/2 steps
            elif data_type > ratios[0] and data_type < ratios[1]:
                if (i + timesteps[1]//2 < max_t):
                    to_replace[i+timesteps[1]//2].append(replace_info)
                # with probability 3/19, the points get replaced after 50 steps
            else:
                if (i + timesteps[2]//2 < max_t):
                    to_replace[i+timesteps[2]//2].append(replace_info)
                # with probability 15/19, the points get replaced after 10 steps

        for (start, end) in to_delete[i]:
            data[dataset_name][t]={
                'operation': 'delete',
                'start': start,
                'end': end
            }
            t+=1
            num_points-=batch_size

        for (tags_start, tags_end, ids_start, ids_end) in to_replace[i]:
            data[dataset_name][t] ={
                'operation' : 'replace',
                'tags_start': tags_start,
                'tags_end': tags_end,
                'ids_start': ids_start,
                'ids_end': ids_end
            }
            t += 1

        data[dataset_name][t]={
            'operation': 'search',
        }
        t+=1

    data[dataset_name]["max_pts"]=max_num_points

    if gt_url is not None:
        data[dataset_name]["gt_url"] = gt_url

    with open(runbook_filename, 'w') as outfile:
        yaml.dump(data, outfile, default_flow_style=False)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 809
dataset_file = 'wikipedia-35M_expirationtime_runbook.yaml'
dataset_name = 'wikipedia-35M'
dataset_size = 35000000
max_t = 350
gt_url = "https://comp21storage.z5.web.core.windows.net/wiki-cohere-35M/wikipedia-35M_expirationtime_runbook.yaml"
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, False, gt_url)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 1232
dataset_file = 'wikipedia-1M_expiration_time_runbook.yaml'
dataset_name = 'wikipedia-1M'
dataset_size = 1000000
max_t = 100
gt_url = "https://comp21storage.z5.web.core.windows.net/wiki-cohere-35M/wikipedia-1M_expiration_time_runbook.yaml/"
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, False, gt_url)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 10001
dataset_file = 'wikipedia-35M_expiration_time_replace_only_runbook.yaml'
dataset_name = 'wikipedia-35M'
dataset_size = 8000000 #only use a prefix of the dataset
max_t = 80
gt_url = None
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, gt_url, False)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 754
dataset_file = 'wikipedia-1M_expiration_time_replace_only_runbook.yaml'
dataset_name = 'wikipedia-1M'
dataset_size = 1000000
max_t = 100
gt_url = None
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, gt_url, False)

ratios = (3, 8, 18)
timesteps = (0, 300, 50)
seed = 22
dataset_file = 'wikipedia-35M_expiration_time_replace_delete_runbook.yaml'
dataset_name = 'wikipedia-35M'
dataset_size = 35000000
max_t = 350
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, None)

ratios = (1, 8, 18)
timesteps = (0, 100, 20)
seed = 56
dataset_file = 'wikipedia-1M_expiration_time_replace_delete_runbook.yaml'
dataset_name = 'wikipedia-1M'
dataset_size = 1000000
max_t = 100
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, None)

ratios = (0, 6, 25)
timesteps = (0, 200, 50)
seed = 809
dataset_file = 'msmarco-100M_expirationtime_runbook.yaml'
dataset_name = 'msmarco-100M'
dataset_size = 101070374
max_t = 1000
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, False, None)
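A quick way to sanity-check one of these generated runbooks is to build a tiny one and read it back. The sketch below assumes it runs in the same module as `gen_exp_time_runbook` (or after importing it); the toy dataset key, sizes, and file name are placeholders rather than anything in this PR.

```
import yaml
from collections import Counter

# Generate a small throwaway runbook with made-up parameters, then reload it
# and summarize which operations it contains.
gen_exp_time_runbook('toy-dataset', 1000, 10, 'toy_expiration_runbook.yaml',
                     ratios=(0, 4, 18), timesteps=(0, 100, 20),
                     seed=0, do_replace=True, gt_url=None, do_delete=True)

with open('toy_expiration_runbook.yaml') as f:
    steps = yaml.safe_load(f)['toy-dataset']

op_counts = Counter(step['operation'] for key, step in steps.items() if isinstance(key, int))
print(op_counts, 'max_pts =', steps['max_pts'])
```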
