added wikipedia-cohere and msmarco web search datasets
magdalendobson committed Jul 30, 2024
1 parent 1fafd82 commit fe29759
Showing 1 changed file with 121 additions and 1 deletion.
122 changes: 121 additions & 1 deletion benchmark/datasets.py
@@ -158,6 +158,9 @@ def prepare(self, skip_data=False, original_size=10**9):
            return
        if self.nb == 10**9:
            download_accelerated(sourceurl, outfile)
        elif self.nb == original_size:
            # nb equals the dataset's full (sub-billion) size: download the
            # whole file as-is, with no cropping or header rewrite needed
            download(sourceurl, outfile)
        else:
            # download cropped version of file
            file_size = 8 + self.d * self.nb * np.dtype(self.dtype).itemsize
@@ -168,7 +171,6 @@
            download(sourceurl, outfile, max_size=file_size)
            # then overwrite the header...
            header = np.memmap(outfile, shape=2, dtype='uint32', mode="r+")
            assert header[0] == original_size
            assert header[1] == self.d
            header[0] = self.nb
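
The crop trick above works because of the competition binary layout: an 8-byte header of two uint32 words, the point count then the dimension, followed by the vectors in row-major order. Truncating the download at 8 + d * nb * itemsize bytes and rewriting the first header word therefore yields a valid smaller file. A minimal reader sketch, assuming only that layout:

```python
import numpy as np

def read_competition_bin(fn, dtype="float32"):
    # Header layout assumed from the code above: two uint32 words,
    # [number of points, dimension], then nb * d values of `dtype`.
    with open(fn, "rb") as f:
        nb, d = (int(x) for x in np.fromfile(f, dtype=np.uint32, count=2))
        vecs = np.fromfile(f, dtype=dtype, count=nb * d)
    return vecs.reshape(nb, d)
```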
@@ -496,6 +498,116 @@ def __init__(self, nb_M=1000):
    def distance(self):
        return "euclidean"

class WikipediaDataset(BillionScaleDatasetCompetitionFormat):
    def __init__(self, nb=35000000):
        self.nb = nb
        self.d = 768
        self.nq = 5000
        self.dtype = "float32"
        self.ds_fn = "wikipedia_base.bin"
        self.qs_fn = "wikipedia_query.bin"
        self.gt_fn = (
            "wikipedia-35M" if self.nb == 35000000 else
            "wikipedia-1M" if self.nb == 1000000 else
            "wikipedia-100K" if self.nb == 100000 else
            None
        )
        self.basedir = os.path.join(BASEDIR, "wikipedia_cohere")
        self.base_url = "https://comp21storage.z5.web.core.windows.net/wiki-cohere-35M/"

        self.private_qs_url = None
        self.private_gt_url = None

    def prepare(self, skip_data=False, original_size=35000000):
        return super().prepare(skip_data, 35000000)

    def get_dataset_fn(self):
        fn = os.path.join(self.basedir, self.ds_fn)
        if self.nb != 35000000:
            fn += '.crop_nb_%d' % self.nb
        if os.path.exists(fn):
            return fn
        else:
            raise RuntimeError("file %s not found" % fn)

    def get_dataset(self):
        xb = next(self.get_dataset_iterator(bs=self.nb))
        return sanitize(xb)

    def distance(self):
        return "ip"

class MSMarcoWebSearchDataset(BillionScaleDatasetCompetitionFormat):
    def __init__(self, nb=101070374):
        self.nb = nb
        self.d = 768
        self.nq = 9376
        self.dtype = "float32"
        self.ds_fn = "vectors.bin"
        self.qs_fn = "query.bin"
        self.gt_fn = (
            "msmarco-100M-gt100" if self.nb == 101070374 else
            "msmarco-10M-gt100" if self.nb == 10000000 else
            "msmarco-1M-gt100" if self.nb == 1000000 else
            None
        )
        self.basedir = os.path.join(BASEDIR, "msmarco_websearch")
        self.base_url = "https://msmarco.z22.web.core.windows.net/msmarcowebsearch/vectors/SimANS/passage_vectors/vectors.bin"
        self.query_url = "https://msmarco.z22.web.core.windows.net/msmarcowebsearch/vectors/SimANS/query_vectors/vectors.bin"
        self.gt_url = "https://comp21storage.z5.web.core.windows.net/msmarcowebsearch/"

    def prepare(self, skip_data=False, original_size=101070374):
        if not os.path.exists(self.basedir):
            os.makedirs(self.basedir)

        qs_outfile = os.path.join(self.basedir, self.qs_fn)
        download(self.query_url, qs_outfile)

        # ground truth is published only for the 100M/10M/1M slices;
        # guard against gt_fn being None for other crop sizes
        if self.gt_fn is not None:
            gt_outfile = os.path.join(self.basedir, self.gt_fn)
            groundtruth_url = os.path.join(self.gt_url, self.gt_fn)
            download(groundtruth_url, gt_outfile)

        if skip_data:
            return

        fn = self.ds_fn
        sourceurl = self.base_url
        outfile = os.path.join(self.basedir, fn)
        if os.path.exists(outfile):
            print("file %s already exists" % outfile)
            return
        if self.nb == original_size:
            download(self.base_url, outfile)
        else:
            # download cropped version of file
            file_size = 8 + self.d * self.nb * np.dtype(self.dtype).itemsize
            outfile = outfile + '.crop_nb_%d' % self.nb
            if os.path.exists(outfile):
                print("file %s already exists" % outfile)
                return
            download(sourceurl, outfile, max_size=file_size)
            # then overwrite the header...
            header = np.memmap(outfile, shape=2, dtype='uint32', mode="r+")
            assert header[0] == original_size
            assert header[1] == self.d
            header[0] = self.nb

    def get_dataset_fn(self):
        fn = os.path.join(self.basedir, self.ds_fn)
        if self.nb != 101070374:
            fn += '.crop_nb_%d' % self.nb
        if os.path.exists(fn):
            return fn
        else:
            raise RuntimeError("file %s not found" % fn)

    def get_dataset(self):
        xb = next(self.get_dataset_iterator(bs=self.nb))
        return sanitize(xb)

    def distance(self):
        return "ip"

class RandomClusteredDS(DatasetCompetitionFormat):
    def __init__(self, basedir="random-clustered"):
        self.nb = 10000
@@ -1133,6 +1245,14 @@ def short_name(self):
'sparse-1M': lambda: SparseDataset("1M"),
'sparse-full': lambda: SparseDataset("full"),

'wikipedia-35M': lambda : WikipediaDataset(35000000),
'wikipedia-1M': lambda : WikipediaDataset(1000000),
'wikipedia-100K': lambda : WikipediaDataset(100000),

'msmarco-100M': lambda : MSMarcoWebSearchDataset(101070374),
'msmarco-10M': lambda : MSMarcoWebSearchDataset(10000000),
'msmarco-1M': lambda : MSMarcoWebSearchDataset(1000000),

'random-xs': lambda : RandomDS(10000, 1000, 20),
'random-s': lambda : RandomDS(100000, 1000, 50),


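With the registry entries above, the new datasets can be constructed by name; a hedged example, assuming the module's DATASETS dict as shown:

```python
ds = DATASETS['msmarco-1M']()   # 1M crop of the MS MARCO Web Search vectors
ds.prepare(skip_data=True)      # fetch only the queries and ground truth
```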