From b35329d73a119983af08dce381dd9a4592c3a0fe Mon Sep 17 00:00:00 2001
From: Magdalen <magdalenruth@gmail.com>
Date: Wed, 31 Jul 2024 14:49:48 -0700
Subject: [PATCH] added descriptions to new datasets

---
 benchmark/datasets.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/benchmark/datasets.py b/benchmark/datasets.py
index 03814048..39223ff3 100644
--- a/benchmark/datasets.py
+++ b/benchmark/datasets.py
@@ -498,6 +498,11 @@ def __init__(self, nb_M=1000):
     def distance(self):
         return "euclidean"
 
+'''
+The base vectors of Wikipedia-Cohere are 35 million Cohere embeddings of the title and text of English Wikipedia articles.
+The 5000 query vectors are Cohere embeddings of the title and text of Simple English Wikipedia articles.
+See https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings?row=2 for more details.
+'''
 class WikipediaDataset(BillionScaleDatasetCompetitionFormat):
     def __init__(self, nb=35000000):
         self.nb = nb
@@ -537,6 +542,12 @@ def get_dataset(self):
     def distance(self):
         return "ip"
 
+'''
+The MSMarco Web Search dataset has 100,924,960 base vectors, which are embeddings of web documents
+from the ClueWeb22 document dataset, while its 9,374 query vectors correspond to web queries
+collected from the Microsoft Bing search engine.
+See https://github.com/microsoft/MS-MARCO-Web-Search for more details.
+'''
 class MSMarcoWebSearchDataset(BillionScaleDatasetCompetitionFormat):
     def __init__(self, nb=101070374):
         self.nb = nb