added wikipedia-cohere and msmarco web search datasets
magdalendobson committed Jul 30, 2024
1 parent 1fafd82 commit fe29759
Showing 1 changed file with 121 additions and 1 deletion.
122 changes: 121 additions & 1 deletion benchmark/datasets.py
@@ -158,6 +158,9 @@ def prepare(self, skip_data=False, original_size=10**9):
            return
        if self.nb == 10**9:
            download_accelerated(sourceurl, outfile)
        elif self.nb == original_size:
            # nb equals the dataset's full (sub-billion) size: download the
            # whole file as-is, with no cropping or header rewrite needed
            download(sourceurl, outfile)
        else:
            # download cropped version of file
            file_size = 8 + self.d * self.nb * np.dtype(self.dtype).itemsize
@@ -168,7 +171,6 @@
            download(sourceurl, outfile, max_size=file_size)
            # then overwrite the header...
            header = np.memmap(outfile, shape=2, dtype='uint32', mode="r+")
            assert header[0] == original_size
            assert header[1] == self.d
            header[0] = self.nb
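
The crop trick above works because of the competition binary layout: an 8-byte header of two uint32 words, the point count then the dimension, followed by the vectors in row-major order. Truncating the download at 8 + d * nb * itemsize bytes and rewriting the first header word therefore yields a valid smaller file. A minimal reader sketch, assuming only that layout:

```python
import numpy as np

def read_competition_bin(fn, dtype="float32"):
    # Header layout assumed from the code above: two uint32 words,
    # [number of points, dimension], then nb * d values of `dtype`.
    with open(fn, "rb") as f:
        nb, d = (int(x) for x in np.fromfile(f, dtype=np.uint32, count=2))
        vecs = np.fromfile(f, dtype=dtype, count=nb * d)
    return vecs.reshape(nb, d)
```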
@@ -496,6 +498,116 @@ def __init__(self, nb_M=1000):
    def distance(self):
        return "euclidean"

class WikipediaDataset(BillionScaleDatasetCompetitionFormat):
    def __init__(self, nb=35000000):
        self.nb = nb
        self.d = 768
        self.nq = 5000
        self.dtype = "float32"
        self.ds_fn = "wikipedia_base.bin"
        self.qs_fn = "wikipedia_query.bin"
        self.gt_fn = (
            "wikipedia-35M" if self.nb == 35000000 else
            "wikipedia-1M" if self.nb == 1000000 else
            "wikipedia-100K" if self.nb == 100000 else
            None
        )
        self.basedir = os.path.join(BASEDIR, "wikipedia_cohere")
        self.base_url = "https://comp21storage.z5.web.core.windows.net/wiki-cohere-35M/"

        self.private_qs_url = None
        self.private_gt_url = None

    def prepare(self, skip_data=False, original_size=35000000):
        return super().prepare(skip_data, 35000000)

    def get_dataset_fn(self):
        fn = os.path.join(self.basedir, self.ds_fn)
        if self.nb != 35000000:
            fn += '.crop_nb_%d' % self.nb
        if os.path.exists(fn):
            return fn
        else:
            raise RuntimeError("file %s not found" % fn)

    def get_dataset(self):
        xb = next(self.get_dataset_iterator(bs=self.nb))
        return sanitize(xb)

    def distance(self):
        return "ip"

class MSMarcoWebSearchDataset(BillionScaleDatasetCompetitionFormat):
    def __init__(self, nb=101070374):
        self.nb = nb
        self.d = 768
        self.nq = 9376
        self.dtype = "float32"
        self.ds_fn = "vectors.bin"
        self.qs_fn = "query.bin"
        self.gt_fn = (
            "msmarco-100M-gt100" if self.nb == 101070374 else
            "msmarco-10M-gt100" if self.nb == 10000000 else
            "msmarco-1M-gt100" if self.nb == 1000000 else
            None
        )
        self.basedir = os.path.join(BASEDIR, "msmarco_websearch")
        self.base_url = "https://msmarco.z22.web.core.windows.net/msmarcowebsearch/vectors/SimANS/passage_vectors/vectors.bin"
        self.query_url = "https://msmarco.z22.web.core.windows.net/msmarcowebsearch/vectors/SimANS/query_vectors/vectors.bin"
        self.gt_url = "https://comp21storage.z5.web.core.windows.net/msmarcowebsearch/"

    def prepare(self, skip_data=False, original_size=101070374):
        if not os.path.exists(self.basedir):
            os.makedirs(self.basedir)

        qs_outfile = os.path.join(self.basedir, self.qs_fn)
        download(self.query_url, qs_outfile)

        # ground truth is published only for the 100M/10M/1M slices;
        # guard against gt_fn being None for other crop sizes
        if self.gt_fn is not None:
            gt_outfile = os.path.join(self.basedir, self.gt_fn)
            groundtruth_url = os.path.join(self.gt_url, self.gt_fn)
            download(groundtruth_url, gt_outfile)

        if skip_data:
            return

        fn = self.ds_fn
        sourceurl = self.base_url
        outfile = os.path.join(self.basedir, fn)
        if os.path.exists(outfile):
            print("file %s already exists" % outfile)
            return
        if self.nb == original_size:
            download(self.base_url, outfile)
        else:
            # download cropped version of file
            file_size = 8 + self.d * self.nb * np.dtype(self.dtype).itemsize
            outfile = outfile + '.crop_nb_%d' % self.nb
            if os.path.exists(outfile):
                print("file %s already exists" % outfile)
                return
            download(sourceurl, outfile, max_size=file_size)
            # then overwrite the header...
            header = np.memmap(outfile, shape=2, dtype='uint32', mode="r+")
            assert header[0] == original_size
            assert header[1] == self.d
            header[0] = self.nb

    def get_dataset_fn(self):
        fn = os.path.join(self.basedir, self.ds_fn)
        if self.nb != 101070374:
            fn += '.crop_nb_%d' % self.nb
        if os.path.exists(fn):
            return fn
        else:
            raise RuntimeError("file %s not found" % fn)

    def get_dataset(self):
        xb = next(self.get_dataset_iterator(bs=self.nb))
        return sanitize(xb)

    def distance(self):
        return "ip"

class RandomClusteredDS(DatasetCompetitionFormat):
    def __init__(self, basedir="random-clustered"):
        self.nb = 10000
@@ -1133,6 +1245,14 @@ def short_name(self):
'sparse-1M': lambda: SparseDataset("1M"),
'sparse-full': lambda: SparseDataset("full"),

'wikipedia-35M': lambda : WikipediaDataset(35000000),
'wikipedia-1M': lambda : WikipediaDataset(1000000),
'wikipedia-100K': lambda : WikipediaDataset(100000),

'msmarco-100M': lambda : MSMarcoWebSearchDataset(101070374),
'msmarco-10M': lambda : MSMarcoWebSearchDataset(10000000),
'msmarco-1M': lambda : MSMarcoWebSearchDataset(1000000),

'random-xs': lambda : RandomDS(10000, 1000, 20),
'random-s': lambda : RandomDS(100000, 1000, 50),


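With the registry entries above, the new datasets can be constructed by name; a hedged example, assuming the module's DATASETS dict as shown:

```python
ds = DATASETS['msmarco-1M']()   # 1M crop of the MS MARCO Web Search vectors
ds.prepare(skip_data=True)      # fetch only the queries and ground truth
```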