Commit

3 new runbooks (#300)
* 3 new runbooks

* fix random seed and add to data_export

---------

Co-authored-by: Ubuntu <philbe@Haike-ANNS.xyzfpnokv33ezlb2f0j2zk0m1f.phxx.internal.cloudapp.net>
xuhaike and Ubuntu authored Aug 9, 2024
1 parent df1e53a commit 1f53df5
Showing 7 changed files with 14,472 additions and 1 deletion.
5 changes: 4 additions & 1 deletion data_export.py
@@ -99,7 +99,10 @@ def cleaned_run_metric(run_metrics):
 runbook_paths = ['neurips23/streaming/simple_runbook.yaml',
                  'neurips23/streaming/clustered_runbook.yaml',
                  'neurips23/streaming/delete_runbook.yaml',
-                 'neurips23/streaming/final_runbook.yaml']
+                 'neurips23/streaming/final_runbook.yaml',
+                 'neurips23/streaming/msturing-10M_slidingwindow_runbook.yaml',
+                 'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml',
+                 'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml']
 for runbook_path in runbook_paths:
     results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
     results = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
67 changes: 67 additions & 0 deletions neurips23/streaming/generate_msmarco100m_runbooks.py
@@ -0,0 +1,67 @@
import yaml
import os
import random

dataset_name="msmarco-100M"

random.seed(809)

total_points=101070374
max_t=1000
# insert the points in 1000 steps

data = {dataset_name: {}}

max_num_points=0
num_points=0

to_delete=[[] for _ in range(max_t+300)]

t=1
for i in range(max_t):

    data[dataset_name][t]={
        'operation': 'insert',
        'start': i*(total_points//max_t),
        'end': (i+1)*(total_points//max_t)
    }
    t+=1

    num_points+=total_points//max_t

    max_num_points=max(max_num_points,num_points)

    data_type = random.randint(0, 25)
    if data_type == 0:
        # with probability 1/26, the inserted points stay in the index permanently
        pass
    elif data_type > 0 and data_type < 6:
        # with probability 5/26, the inserted points stay in the index for 200 steps
        to_delete[i+200].append(i)
    else:
        # with probability 20/26, the inserted points stay in the index for 50 steps
        to_delete[i+50].append(i)

    for x in to_delete[i]:
        data[dataset_name][t]={
            'operation': 'delete',
            'start': x*(total_points//max_t),
            'end': (x+1)*(total_points//max_t)
        }
        t+=1
        num_points-=total_points//max_t

    data[dataset_name][t]={
        'operation': 'search',
    }
    t+=1


data[dataset_name]["max_pts"]=max_num_points

run_book_name=dataset_name+"_"+"expirationtime_runbook.yaml"

with open(run_book_name, 'w') as outfile:
yaml.dump(data, outfile, default_flow_style=False)
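For orientation, the script serializes a nested dict keyed by dataset name and step number. Below is a schematic excerpt of the resulting msmarco-100M_expirationtime_runbook.yaml, with values derived from the constants above (total_points//max_t = 101070); the exact key order and the positions of the delete steps depend on yaml.dump and on the seeded random draws:

msmarco-100M:
  1:
    operation: insert
    start: 0
    end: 101070
  2:
    operation: search
  3:
    operation: insert
    start: 101070
    end: 202140
  4:
    operation: search
  # ... from roughly step 100 onward (i >= 50), delete steps for expired
  # batches can appear between an insert and the following search ...
  max_pts: <maximum number of live points, tracked as max_num_points>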

50 changes: 50 additions & 0 deletions neurips23/streaming/generate_msturing10m_runbooks.py
@@ -0,0 +1,50 @@
import yaml
import os


dataset_name="msturing-10M"

data = {dataset_name: {}}

total_points=10000000

num_points=0
max_num_points=0


max_t=200
# insert 10000000/200 points per step
# start deleting points after 100 steps

t=1
for i in range(max_t):
    if i>=max_t//2:
        # after the first max_t//2 steps, each step searches and then
        # deletes the oldest remaining batch before inserting a new one
        data[dataset_name][t]={
            'operation': 'search',
        }
        t+=1
        data[dataset_name][t]={
            'operation': 'delete',
            'start': (i-max_t//2)*(total_points//max_t),
            'end': (i-max_t//2+1)*(total_points//max_t)
        }
        t+=1
        num_points-=total_points//max_t
    data[dataset_name][t]={
        'operation': 'insert',
        'start': i*(total_points//max_t),
        'end': (i+1)*(total_points//max_t)
    }
    t+=1

    num_points+=total_points//max_t
    max_num_points=max(max_num_points,num_points)

data[dataset_name]["max_pts"]=max_num_points

run_book_name=dataset_name+"_"+"slidingwindow_runbook.yaml"

with open(run_book_name, 'w') as outfile:
yaml.dump(data, outfile, default_flow_style=False)
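Under the structure sketched above, the sliding-window runbook inserts one batch of total_points//max_t = 50000 points per step for the first max_t//2 = 100 steps, then each later step searches, deletes the oldest remaining batch, and inserts the next one, so max_pts works out to 5000000. A schematic excerpt of msturing-10M_slidingwindow_runbook.yaml (exact formatting depends on yaml.dump):

msturing-10M:
  1:
    operation: insert
    start: 0
    end: 50000
  # ... steps 2-100 insert the next batches ...
  101:
    operation: search
  102:
    operation: delete
    start: 0
    end: 50000
  103:
    operation: insert
    start: 5000000
    end: 5050000
  # ... pattern repeats ...
  max_pts: 5000000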


65 changes: 65 additions & 0 deletions neurips23/streaming/generate_wiki35m_runbooks.py
@@ -0,0 +1,65 @@
import yaml
import os
import random

dataset_name="wikipedia-35M"

random.seed(809)

total_points=35000000
max_t=350


data = {dataset_name: {}}

max_num_points=0
num_points=0

to_delete=[[] for _ in range(max_t+100)]

t=1
for i in range(max_t):

    data[dataset_name][t]={
        'operation': 'insert',
        'start': i*(total_points//max_t),
        'end': (i+1)*(total_points//max_t)
    }
    t+=1

    num_points+=total_points//max_t

    max_num_points=max(max_num_points,num_points)

    data_type = random.randint(0, 18)
    if data_type == 0:
        # with probability 1/19, the points added always stay in the index
        pass
    elif data_type > 0 and data_type < 4:
        # with probability 3/19, the points added stay in the index for 100 steps
        to_delete[i+100].append(i)
    else:
        # with probability 15/19, the points added stay in the index for 20 steps
        to_delete[i+20].append(i)

    for x in to_delete[i]:
        data[dataset_name][t]={
            'operation': 'delete',
            'start': x*(total_points//max_t),
            'end': (x+1)*(total_points//max_t)
        }
        t+=1
        num_points-=total_points//max_t

    data[dataset_name][t]={
        'operation': 'search',
    }
    t+=1


data[dataset_name]["max_pts"]=max_num_points

run_book_name=dataset_name+"_"+"expirationtime_runbook.yaml"

with open(run_book_name, 'w') as outfile:
yaml.dump(data, outfile, default_flow_style=False)
