Skip to content

Commit

Permalink
harsha-simhadri#1 T1 FAISS customized for own quantized dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu committed Sep 27, 2021
1 parent a2d6d7e commit c7acd84
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 2 deletions.
42 changes: 40 additions & 2 deletions benchmark/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,17 @@ def ivecs_read(fname):

def xbin_mmap(fname, dtype, maxn=-1, force_n=100_000_000, force_d=32,
              force_dtype="float32", header_bytes=0):
    """mmap a vector file and return it as an (n, d) read-only array.

    Two layouts are supported:

    * Competition format (``force_n=None``): the file starts with an
      8-byte header of two uint32 values (n, d) followed by ``n * d``
      items of ``dtype``.
    * Headerless override (default): the shape and dtype are forced to
      ``(force_n, force_d)`` / ``force_dtype`` and the file is assumed to
      start at ``header_bytes``. The defaults (100M x 32, float32,
      no header) match the locally generated dimensionality-reduced
      BigANN file this commit targets.

    Parameters:
        fname: path to the vector file.
        dtype: item dtype for the competition format; ignored (overridden
            by ``force_dtype``) when ``force_n`` is not None.
        maxn: if > 0, cap the number of rows exposed by the mapping.
        force_n / force_d / force_dtype: forced shape/dtype for the
            headerless override; set ``force_n=None`` to read the real
            competition header instead.
        header_bytes: byte offset of the first vector in the override
            layout (the competition format always uses 8).

    Returns:
        np.memmap of shape (n, d), mode "r".

    Raises:
        AssertionError: if the file size does not match the expected
            ``header_bytes + n * d * itemsize``.
    """
    if force_n is None:
        # Standard competition format: two uint32 header words, then data.
        n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))
        header_bytes = 8
    else:
        # Headerless override for the locally produced reduced dataset.
        n, d, dtype = force_n, force_d, force_dtype
    itemsize = np.dtype(dtype).itemsize
    assert os.stat(fname).st_size == header_bytes + n * d * itemsize
    if maxn > 0:
        n = min(n, maxn)
    # NOTE: the mmap offset must match the offset used in the size check
    # above; the original hack asserted with offset 0 but mapped at
    # offset 8, which made np.memmap overrun the file and raise.
    return np.memmap(fname, dtype=dtype, mode="r", offset=header_bytes,
                     shape=(n, d))
Expand Down Expand Up @@ -278,6 +287,7 @@ def get_dataset_iterator(self, bs=512, split=(1,0)):
i0, i1 = self.nb * rank // nsplit, self.nb * (rank + 1) // nsplit
filename = self.get_dataset_fn()
x = xbin_mmap(filename, dtype=self.dtype, maxn=self.nb)
print(f"x.shape={x.shape} self.nb={self.nb} self.d={self.d}")
assert x.shape == (self.nb, self.d)
for j0 in range(i0, i1, bs):
j1 = min(j0 + bs, i1)
Expand Down Expand Up @@ -369,6 +379,33 @@ def __init__(self, nb_M=1000):
def distance(self):
return "euclidean"


class BigANNDimReducedDataset(DatasetCompetitionFormat):
    """BigANN variant whose base vectors were reduced to 32 dimensions.

    Reuses the standard BigANN download location and ground-truth files;
    the reduced float32 vectors themselves are produced locally and read
    through the customized ``xbin_mmap``.
    """

    def __init__(self, nb_M=1000):
        self.nb_M = nb_M
        self.nb = nb_M * 10**6
        self.d = 32
        self.nq = 10000
        # Indexing pass: the dimensionality-reduced vectors are float32.
        self.dtype = "float32"
        # Search pass over the prebuilt index uses the raw uint8 data:
        # self.dtype = "uint8"
        self.ds_fn = "base.1B.u8bin"
        self.qs_fn = "query.public.10K.u8bin"
        # Ground truth per subset scale (millions of base vectors).
        gt_by_scale = {
            1000: "GT.public.1B.ibin",
            100: subset_url + "GT_100M/bigann-100M",
            10: subset_url + "GT_10M/bigann-10M",
        }
        self.gt_fn = gt_by_scale.get(self.nb_M)
        self.base_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/bigann/"
        self.basedir = os.path.join(BASEDIR, "bigann")

    def distance(self):
        return "euclidean"


class Deep1BDataset(DatasetCompetitionFormat):
def __init__(self, nb_M=1000):
self.nb_M = nb_M
Expand Down Expand Up @@ -600,6 +637,7 @@ def default_count(self):
'bigann-1B': lambda : BigANNDataset(1000),
'bigann-100M': lambda : BigANNDataset(100),
'bigann-10M': lambda : BigANNDataset(10),
'bigann-dim-reduced-100M': lambda: BigANNDimReducedDataset(100),

'deep-1B': lambda : Deep1BDataset(),
'deep-100M': lambda : Deep1BDataset(100),
Expand Down
37 changes: 37 additions & 0 deletions run_t1_faiss_baseline_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Search-parameter sweep for the T1 FAISS baseline on the
# dimensionality-reduced 100M BigANN subset. Each params entry is one
# nprobe / quantizer_efSearch operating point on the recall/QPS curve.
set -eu

params="
nprobe=1,quantizer_efSearch=4
nprobe=2,quantizer_efSearch=4
nprobe=4,quantizer_efSearch=4
nprobe=4,quantizer_efSearch=8
nprobe=8,quantizer_efSearch=4
nprobe=8,quantizer_efSearch=8
nprobe=8,quantizer_efSearch=16
nprobe=8,quantizer_efSearch=32
nprobe=16,quantizer_efSearch=16
nprobe=16,quantizer_efSearch=32
nprobe=16,quantizer_efSearch=64
nprobe=32,quantizer_efSearch=8
nprobe=32,quantizer_efSearch=32
nprobe=32,quantizer_efSearch=64
nprobe=32,quantizer_efSearch=128
nprobe=64,quantizer_efSearch=16
nprobe=64,quantizer_efSearch=32
nprobe=64,quantizer_efSearch=64
nprobe=64,quantizer_efSearch=128
nprobe=64,quantizer_efSearch=256
nprobe=128,quantizer_efSearch=32
nprobe=128,quantizer_efSearch=64
nprobe=128,quantizer_efSearch=128
nprobe=128,quantizer_efSearch=256
nprobe=128,quantizer_efSearch=512
nprobe=256,quantizer_efSearch=64
nprobe=256,quantizer_efSearch=128
nprobe=256,quantizer_efSearch=512
nprobe=512,quantizer_efSearch=256
nprobe=512,quantizer_efSearch=512
nprobe=1024,quantizer_efSearch=256
"

# NOTE(review): the index file path now matches the one written by
# run_t1_faiss_baseline_index.sh (data/track1_baseline_faiss_dim_reduction/);
# the original pointed at data/track1_baseline_faiss/, i.e. a different
# index than the one built for this dataset — confirm against your layout.
# $params is deliberately unquoted: word splitting turns each line into a
# separate --searchparams argument.
python track1_baseline_faiss/baseline_faiss.py \
    --dataset bigann-dim-reduced-100M \
    --indexfile data/track1_baseline_faiss_dim_reduction/bigann-100M.IVF1M_2level_PQ64x4fsr.faissindex \
    --search --searchparams $params
11 changes: 11 additions & 0 deletions run_t1_faiss_baseline_index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Build the T1 FAISS baseline index over the dimensionality-reduced 100M
# BigANN subset: OPQ pre-rotation, 1M-centroid IVF with an HNSW coarse
# quantizer, and PQ64x4fsr codes. The dataset is added in 30 splits to
# bound peak memory; two-level clustering speeds up training the 1M
# centroids on the 10M-vector training sample.
set -eu

python -u track1_baseline_faiss/baseline_faiss.py \
    --dataset bigann-dim-reduced-100M \
    --indexkey OPQ64_128,IVF1048576_HNSW32,PQ64x4fsr \
    --maxtrain 10000000 \
    --two_level_clustering \
    --build \
    --add_splits 30 \
    --indexfile data/track1_baseline_faiss_dim_reduction/bigann-100M.IVF1M_2level_PQ64x4fsr.faissindex \
    --quantizer_efConstruction 200 \
    --quantizer_add_efSearch 80


0 comments on commit c7acd84

Please sign in to comment.