Skip to content

Commit

Permalink
3 new runbooks
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu authored and Ubuntu committed Aug 8, 2024
1 parent c206f33 commit 4dcc8fe
Show file tree
Hide file tree
Showing 6 changed files with 14,463 additions and 0 deletions.
65 changes: 65 additions & 0 deletions neurips23/streaming/generate_msmarco100m_runbooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import yaml
import os
import random

# Generate the "expiration time" streaming runbook for msmarco-100M.
#
# The dataset is inserted in max_t equal-sized batches. Each batch is randomly
# assigned a lifetime (forever, 200 steps, or 50 steps); when a batch's
# lifetime is over, a delete covering exactly its insert range is emitted.
# Every iteration ends with a search step. The result is dumped as YAML.
dataset_name = "msmarco-100M"

total_points = 101070374
max_t = 1000
# Insert the points in max_t (1000) steps of equal size.
# NOTE: the floor division drops the remainder, so the last
# total_points % max_t (= 374) points are never covered by any insert range.
points_per_step = total_points // max_t

data = {dataset_name: {}}

# num_points tracks how many points are live; max_num_points is its peak,
# sampled right after each insert (i.e. before that step's deletes).
max_num_points = 0
num_points = 0

# to_delete[s] holds the insert-step indices whose batch expires at step s.
# The longest lifetime is 200 steps, so slots up to max_t + 199 may be written.
# Slots at index >= max_t are never read by the loop below, so batches
# scheduled there simply stay in the index until the end of the runbook.
to_delete = [[] for _ in range(max_t + 300)]

# t is the 1-based runbook step number used as the key of each operation.
t = 1
for i in range(max_t):

    data[dataset_name][t] = {
        'operation': 'insert',
        'start': i * points_per_step,
        'end': (i + 1) * points_per_step
    }
    t += 1

    num_points += points_per_step
    max_num_points = max(max_num_points, num_points)

    # randint is inclusive on both ends: 26 equally likely outcomes.
    data_type = random.randint(0, 25)
    if data_type == 0:
        # probability 1/26: the inserted batch is never deleted
        pass
    elif data_type < 6:
        # probability 5/26: the inserted batch is deleted after 200 steps
        to_delete[i + 200].append(i)
    else:
        # probability 20/26: the inserted batch is deleted after 50 steps
        to_delete[i + 50].append(i)

    # Emit one delete per batch that expires at this step, reusing the exact
    # [start, end) range that was used when the batch was inserted.
    for x in to_delete[i]:
        data[dataset_name][t] = {
            'operation': 'delete',
            'start': x * points_per_step,
            'end': (x + 1) * points_per_step
        }
        t += 1
        num_points -= points_per_step

    data[dataset_name][t] = {
        'operation': 'search',
    }
    t += 1


data[dataset_name]["max_pts"] = max_num_points

run_book_name = dataset_name + "_" + "expirationtime_runbook.yaml"

with open(run_book_name, 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

49 changes: 49 additions & 0 deletions neurips23/streaming/generate_msturing10m_runbooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import yaml
import os

# Generate the sliding-window streaming runbook for msturing-10M.
#
# One batch of points is inserted per step. Once half of the steps have
# passed, every step first searches, then deletes the oldest batch still in
# the index, and only then inserts the new batch. The result is dumped as YAML.
dataset_name="msturing-10M"

data = {dataset_name: {}}
runbook_steps = data[dataset_name]  # alias: the same dict object that is dumped

total_points=10000000

num_points=0
max_num_points=0

max_t=200
batch_size = total_points//max_t   # points inserted (or deleted) per step
window_len = max_t//2              # insert-only steps before deletions start

# t is the 1-based runbook step number used as the key of each operation.
t=1
for i in range(max_t):
    if i >= window_len:
        runbook_steps[t] = {'operation': 'search'}
        t += 1

        # The batch inserted window_len iterations ago leaves the window.
        oldest = i - window_len
        runbook_steps[t] = {
            'operation': 'delete',
            'start': oldest*batch_size,
            'end': (oldest+1)*batch_size,
        }
        t += 1
        num_points -= batch_size

    runbook_steps[t] = {
        'operation': 'insert',
        'start': i*batch_size,
        'end': (i+1)*batch_size,
    }
    t += 1

    # Track the peak number of live points, sampled after each insert.
    num_points += batch_size
    if num_points > max_num_points:
        max_num_points = num_points

runbook_steps["max_pts"] = max_num_points

run_book_name=dataset_name+"_"+"slidingwindow_runbook.yaml"

with open(run_book_name, 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)


63 changes: 63 additions & 0 deletions neurips23/streaming/generate_wiki35m_runbooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import yaml
import os
import random

# Generate the "expiration time" streaming runbook for wikipedia-35M.
#
# The dataset is inserted in max_t equal-sized batches. Each batch draws a
# random lifetime (forever, 100 steps, or 20 steps); batches expiring at the
# current step are deleted right after the insert, then a search is issued.
# The result is dumped as YAML.
dataset_name="wikipedia-35M"

total_points=35000000
max_t=350
batch_size = total_points//max_t   # points per insert/delete range

data = {dataset_name: {}}
runbook_steps = data[dataset_name]  # alias: the same dict object that is dumped

max_num_points=0
num_points=0

# to_delete[s] collects the insert-step indices whose batch expires at step s.
# Slots at index >= max_t are never visited by the loop, so batches scheduled
# there are never deleted.
to_delete=[[] for _ in range(max_t+100)]

# t is the 1-based runbook step number used as the key of each operation.
t=1
for i in range(max_t):
    runbook_steps[t] = {
        'operation': 'insert',
        'start': i*batch_size,
        'end': (i+1)*batch_size,
    }
    t += 1

    # Track the peak number of live points, sampled before this step's deletes.
    num_points += batch_size
    if num_points > max_num_points:
        max_num_points = num_points

    # randint is inclusive on both ends: 19 equally likely outcomes.
    #   0      (1/19)  -> the batch stays in the index forever
    #   1..3   (3/19)  -> the batch stays in the index for 100 steps
    #   4..18  (15/19) -> the batch stays in the index for 20 steps
    roll = random.randint(0, 18)
    if roll != 0:
        lifetime = 100 if roll < 4 else 20
        to_delete[i+lifetime].append(i)

    # Delete every batch expiring now, using its original insert range.
    for expired in to_delete[i]:
        runbook_steps[t] = {
            'operation': 'delete',
            'start': expired*batch_size,
            'end': (expired+1)*batch_size,
        }
        t += 1
        num_points -= batch_size

    runbook_steps[t] = {'operation': 'search'}
    t += 1


runbook_steps["max_pts"] = max_num_points

run_book_name=dataset_name+"_"+"expirationtime_runbook.yaml"

with open(run_book_name, 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)
Loading

0 comments on commit 4dcc8fe

Please sign in to comment.