diff --git a/benchmark/datasets.py b/benchmark/datasets.py index 9d79f8a8..7e4e95fb 100644 --- a/benchmark/datasets.py +++ b/benchmark/datasets.py @@ -294,6 +294,7 @@ def get_dataset_iterator(self, bs=512, split=(1,0)): i0, i1 = self.nb * rank // nsplit, self.nb * (rank + 1) // nsplit filename = self.get_dataset_fn() x = xbin_mmap(filename, dtype=self.dtype, maxn=self.nb) + print(f"x.shape={x.shape} self.nb={self.nb} self.d={self.d}") assert x.shape == (self.nb, self.d) for j0 in range(i0, i1, bs): j1 = min(j0 + bs, i1) @@ -385,6 +386,33 @@ def __init__(self, nb_M=1000): def distance(self): return "euclidean" + +class BigANNDimReducedDataset(DatasetCompetitionFormat): + def __init__(self, nb_M=1000): + self.nb_M = nb_M + self.nb = 10**6 * nb_M + self.d = 32 + self.nq = 10000 + # use for indexing of the dimensionality reduced dataset + self.dtype = "float32" + # use for search over the index built from the dimensionality reduced dataset + # self.dtype = "uint8" + self.ds_fn = "base.1B.u8bin" + self.qs_fn = "query.public.10K.u8bin" + self.gt_fn = ( + "GT.public.1B.ibin" if self.nb_M == 1000 else + subset_url + "GT_100M/bigann-100M" if self.nb_M == 100 else + subset_url + "GT_10M/bigann-10M" if self.nb_M == 10 else + None + ) + # self.gt_fn = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/bigann/public_query_gt100.bin" if self.nb == 10**9 else None + self.base_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/bigann/" + self.basedir = os.path.join(BASEDIR, "bigann") + + def distance(self): + return "euclidean" + + class Deep1BDataset(DatasetCompetitionFormat): def __init__(self, nb_M=1000): self.nb_M = nb_M @@ -616,6 +644,7 @@ def default_count(self): 'bigann-1B': lambda : BigANNDataset(1000), 'bigann-100M': lambda : BigANNDataset(100), 'bigann-10M': lambda : BigANNDataset(10), + 'bigann-dim-reduced-100M': lambda: BigANNDimReducedDataset(100), 'deep-1B': lambda : Deep1BDataset(), 'deep-100M': lambda : Deep1BDataset(100), diff 
--git a/run_t1_faiss_baseline_eval.sh b/run_t1_faiss_baseline_eval.sh new file mode 100755 index 00000000..257a20de --- /dev/null +++ b/run_t1_faiss_baseline_eval.sh @@ -0,0 +1,37 @@ +params=" +nprobe=1,quantizer_efSearch=4 +nprobe=2,quantizer_efSearch=4 +nprobe=4,quantizer_efSearch=4 +nprobe=4,quantizer_efSearch=8 +nprobe=8,quantizer_efSearch=4 +nprobe=8,quantizer_efSearch=8 +nprobe=8,quantizer_efSearch=16 +nprobe=8,quantizer_efSearch=32 +nprobe=16,quantizer_efSearch=16 +nprobe=16,quantizer_efSearch=32 +nprobe=16,quantizer_efSearch=64 +nprobe=32,quantizer_efSearch=8 +nprobe=32,quantizer_efSearch=32 +nprobe=32,quantizer_efSearch=64 +nprobe=32,quantizer_efSearch=128 +nprobe=64,quantizer_efSearch=16 +nprobe=64,quantizer_efSearch=32 +nprobe=64,quantizer_efSearch=64 +nprobe=64,quantizer_efSearch=128 +nprobe=64,quantizer_efSearch=256 +nprobe=128,quantizer_efSearch=32 +nprobe=128,quantizer_efSearch=64 +nprobe=128,quantizer_efSearch=128 +nprobe=128,quantizer_efSearch=256 +nprobe=128,quantizer_efSearch=512 +nprobe=256,quantizer_efSearch=64 +nprobe=256,quantizer_efSearch=128 +nprobe=256,quantizer_efSearch=512 +nprobe=512,quantizer_efSearch=256 +nprobe=512,quantizer_efSearch=512 +nprobe=1024,quantizer_efSearch=256 +" + +python track1_baseline_faiss/baseline_faiss.py \ --dataset bigann-dim-reduced-100M --indexfile data/track1_baseline_faiss_dim_reduction/bigann-100M.IVF1M_2level_PQ64x4fsr.faissindex \ --search --searchparams $params diff --git a/run_t1_faiss_baseline_index.sh b/run_t1_faiss_baseline_index.sh new file mode 100755 index 00000000..dd1912fe --- /dev/null +++ b/run_t1_faiss_baseline_index.sh @@ -0,0 +1,11 @@ +python -u track1_baseline_faiss/baseline_faiss.py --dataset bigann-dim-reduced-100M \ --indexkey OPQ64_128,IVF1048576_HNSW32,PQ64x4fsr \ --maxtrain 10000000 \ --two_level_clustering \ --build \ --add_splits 30 \ --indexfile data/track1_baseline_faiss_dim_reduction/bigann-100M.IVF1M_2level_PQ64x4fsr.faissindex \ --quantizer_efConstruction 200 \ 
--quantizer_add_efSearch 80 + +