Commit

3 new runbooks (#300)
* 3 new runbooks

* fix random seed and add to data_export

---------

Co-authored-by: Ubuntu <philbe@Haike-ANNS.xyzfpnokv33ezlb2f0j2zk0m1f.phxx.internal.cloudapp.net>
xuhaike and Ubuntu authored Aug 9, 2024
1 parent df1e53a commit 1f53df5
Showing 7 changed files with 14,472 additions and 1 deletion.
5 changes: 4 additions & 1 deletion data_export.py
@@ -99,7 +99,10 @@ def cleaned_run_metric(run_metrics):
 runbook_paths = ['neurips23/streaming/simple_runbook.yaml',
                  'neurips23/streaming/clustered_runbook.yaml',
                  'neurips23/streaming/delete_runbook.yaml',
-                 'neurips23/streaming/final_runbook.yaml']
+                 'neurips23/streaming/final_runbook.yaml',
+                 'neurips23/streaming/msturing-10M_slidingwindow_runbook.yaml',
+                 'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml',
+                 'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml']
 for runbook_path in runbook_paths:
     results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
     results = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
67 changes: 67 additions & 0 deletions neurips23/streaming/generate_msmarco100m_runbooks.py
@@ -0,0 +1,67 @@
import yaml
import os
import random

dataset_name="msmarco-100M"

random.seed(809)

total_points=101070374
max_t=1000
# insert the points in 1000 steps

data = {dataset_name: {}}

max_num_points=0
num_points=0

to_delete=[[] for _ in range(max_t+300)]

t=1
for i in range(max_t):

    data[dataset_name][t]={
        'operation': 'insert',
        'start': i*(total_points//max_t),
        'end': (i+1)*(total_points//max_t)
    }
    t+=1

    num_points+=total_points//max_t

    max_num_points=max(max_num_points,num_points)

    data_type = random.randint(0, 25)
    if data_type == 0:
        # with probability 1/26, the inserted points stay in the index permanently
        pass
    elif data_type > 0 and data_type < 6:
        # with probability 5/26, the inserted points stay in the index for 200 steps
        to_delete[i+200].append(i)
    else:
        # with probability 20/26, the inserted points stay in the index for 50 steps
        to_delete[i+50].append(i)

    for x in to_delete[i]:
        data[dataset_name][t]={
            'operation': 'delete',
            'start': x*(total_points//max_t),
            'end': (x+1)*(total_points//max_t)
        }
        t+=1
        num_points-=total_points//max_t

    data[dataset_name][t]={
        'operation': 'search',
    }
    t+=1


data[dataset_name]["max_pts"]=max_num_points

run_book_name=dataset_name+"_"+"expirationtime_runbook.yaml"

with open(run_book_name, 'w') as outfile:
yaml.dump(data, outfile, default_flow_style=False)
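For orientation, the script serializes a nested dict keyed by dataset name and step number. Below is a schematic excerpt of the resulting msmarco-100M_expirationtime_runbook.yaml, with values derived from the constants above (total_points//max_t = 101070); the exact key order and the positions of the delete steps depend on yaml.dump and on the seeded random draws:

msmarco-100M:
  1:
    operation: insert
    start: 0
    end: 101070
  2:
    operation: search
  3:
    operation: insert
    start: 101070
    end: 202140
  4:
    operation: search
  # ... from roughly step 100 onward (i >= 50), delete steps for expired
  # batches can appear between an insert and the following search ...
  max_pts: <maximum number of live points, tracked as max_num_points>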

50 changes: 50 additions & 0 deletions neurips23/streaming/generate_msturing10m_runbooks.py
@@ -0,0 +1,50 @@
import yaml
import os


dataset_name="msturing-10M"

data = {dataset_name: {}}

total_points=10000000

num_points=0
max_num_points=0


max_t=200
# insert 10000000/200 points per step
# start deleting points after 100 steps

t=1
for i in range(max_t):
    if i>=max_t//2:
        # after the first max_t//2 steps, each step searches and then
        # deletes the oldest remaining batch before inserting a new one
        data[dataset_name][t]={
            'operation': 'search',
        }
        t+=1
        data[dataset_name][t]={
            'operation': 'delete',
            'start': (i-max_t//2)*(total_points//max_t),
            'end': (i-max_t//2+1)*(total_points//max_t)
        }
        t+=1
        num_points-=total_points//max_t
    data[dataset_name][t]={
        'operation': 'insert',
        'start': i*(total_points//max_t),
        'end': (i+1)*(total_points//max_t)
    }
    t+=1

    num_points+=total_points//max_t
    max_num_points=max(max_num_points,num_points)

data[dataset_name]["max_pts"]=max_num_points

run_book_name=dataset_name+"_"+"slidingwindow_runbook.yaml"

with open(run_book_name, 'w') as outfile:
yaml.dump(data, outfile, default_flow_style=False)
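Under the structure sketched above, the sliding-window runbook inserts one batch of total_points//max_t = 50000 points per step for the first max_t//2 = 100 steps, then each later step searches, deletes the oldest remaining batch, and inserts the next one, so max_pts works out to 5000000. A schematic excerpt of msturing-10M_slidingwindow_runbook.yaml (exact formatting depends on yaml.dump):

msturing-10M:
  1:
    operation: insert
    start: 0
    end: 50000
  # ... steps 2-100 insert the next batches ...
  101:
    operation: search
  102:
    operation: delete
    start: 0
    end: 50000
  103:
    operation: insert
    start: 5000000
    end: 5050000
  # ... pattern repeats ...
  max_pts: 5000000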


65 changes: 65 additions & 0 deletions neurips23/streaming/generate_wiki35m_runbooks.py
@@ -0,0 +1,65 @@
import yaml
import os
import random

dataset_name="wikipedia-35M"

random.seed(809)

total_points=35000000
max_t=350


data = {dataset_name: {}}

max_num_points=0
num_points=0

to_delete=[[] for _ in range(max_t+100)]

t=1
for i in range(max_t):

    data[dataset_name][t]={
        'operation': 'insert',
        'start': i*(total_points//max_t),
        'end': (i+1)*(total_points//max_t)
    }
    t+=1

    num_points+=total_points//max_t

    max_num_points=max(max_num_points,num_points)

    data_type = random.randint(0, 18)
    if data_type == 0:
        # with probability 1/19, the points added always stay in the index
        pass
    elif data_type > 0 and data_type < 4:
        # with probability 3/19, the points added stay in the index for 100 steps
        to_delete[i+100].append(i)
    else:
        # with probability 15/19, the points added stay in the index for 20 steps
        to_delete[i+20].append(i)

    for x in to_delete[i]:
        data[dataset_name][t]={
            'operation': 'delete',
            'start': x*(total_points//max_t),
            'end': (x+1)*(total_points//max_t)
        }
        t+=1
        num_points-=total_points//max_t

    data[dataset_name][t]={
        'operation': 'search',
    }
    t+=1


data[dataset_name]["max_pts"]=max_num_points

run_book_name=dataset_name+"_"+"expirationtime_runbook.yaml"

with open(run_book_name, 'w') as outfile:
yaml.dump(data, outfile, default_flow_style=False)
