diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 1eb37f7d..63d6c16a 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -18,6 +18,12 @@ jobs:
         - algorithm: faiss-t1
           dataset: random-range-xs
           library: faissconda
+        - algorithm: diskann-t2
+          dataset: random-xs
+          library: diskann
+        - algorithm: diskann-t2
+          dataset: random-range-xs
+          library: diskann
       fail-fast: false
     steps:
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..982bc6c5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+.DS_Store
+*.pyc
+*.o
+
+data/*
+*.class
+
+*.log
+
+results/*
+!results/*.png
+
+venv
+
+.idea
diff --git a/algos.yaml b/algos.yaml
index 01b7a855..fea968f4 100644
--- a/algos.yaml
+++ b/algos.yaml
@@ -34,6 +34,17 @@ random-range-xs:
            "nprobe=2,quantizer_efSearch=8",
            "nprobe=4,quantizer_efSearch=4",
            "nprobe=2,quantizer_efSearch=16"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":32, "L":32, "B":0.0001, "M":1}]
+        query-args: |
+          [{"Ls":10, "BW":4, "T":16}]
 random-xs:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -51,6 +62,17 @@ random-xs:
            "nprobe=2,quantizer_efSearch=8",
            "nprobe=4,quantizer_efSearch=4",
            "nprobe=2,quantizer_efSearch=16"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":32, "L":32, "B":0.0001, "M":1}]
+        query-args: |
+          [{"Ls":10, "BW":4, "T":16}]
 deep-10M:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -116,6 +138,28 @@ deep-1B:
            "nprobe=128,quantizer_efSearch=512",
            "nprobe=256,quantizer_efSearch=64",
            "nprobe=256,quantizer_efSearch=128"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":50, "M":110,
+            "url": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/diskann-T2-baseline-indices/deep-1B/R100_L100_B50_M110"
+          }]
+        query-args: |
+          [{"Ls":30, "BW":4, "T":16},
+           {"Ls":40, "BW":4, "T":16},
+           {"Ls":50, "BW":4, "T":16},
+           {"Ls":53, "BW":4, "T":16},
+           {"Ls":56, "BW":4, "T":16},
+           {"Ls":58, "BW":4, "T":16},
+           {"Ls":60, "BW":4, "T":16},
+           {"Ls":70, "BW":4, "T":16},
+           {"Ls":80, "BW":4, "T":16},
+           {"Ls":100, "BW":4, "T":16}]
 msspacev-1B:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -155,6 +199,28 @@ msspacev-1B:
            "nprobe=128,quantizer_efSearch=512",
            "nprobe=256,quantizer_efSearch=256",
            "nprobe=256,quantizer_efSearch=512"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":47, "M":100,
+            "url": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/diskann-T2-baseline-indices/msspacev-1B/R100_L100_B47_M100"
+          }]
+        query-args: |
+          [{"Ls":40, "BW":4, "T":16},
+           {"Ls":50, "BW":4, "T":16},
+           {"Ls":60, "BW":4, "T":16},
+           {"Ls":70, "BW":4, "T":16},
+           {"Ls":80, "BW":4, "T":16},
+           {"Ls":90, "BW":4, "T":16},
+           {"Ls":100, "BW":4, "T":16},
+           {"Ls":110, "BW":4, "T":16},
+           {"Ls":120, "BW":4, "T":16},
+           {"Ls":130, "BW":4, "T":16}]
 msturing-1B:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -192,6 +258,28 @@ msturing-1B:
            "nprobe=128,quantizer_efSearch=512",
            "nprobe=256,quantizer_efSearch=256",
            "nprobe=256,quantizer_efSearch=512"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":50, "M":80,
+            "url": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/diskann-T2-baseline-indices/msturing-1B/R100_L100_B50_M80"
+          }]
+        query-args: |
+          [{"Ls":30, "BW":4, "T":16},
+           {"Ls":40, "BW":4, "T":16},
+           {"Ls":50, "BW":4, "T":16},
+           {"Ls":55, "BW":4, "T":16},
+           {"Ls":57, "BW":4, "T":16},
+           {"Ls":59, "BW":4, "T":16},
+           {"Ls":60, "BW":4, "T":16},
+           {"Ls":70, "BW":4, "T":16},
+           {"Ls":80, "BW":4, "T":16},
+           {"Ls":100, "BW":4, "T":16}]
 bigann-1B:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -233,6 +321,28 @@ bigann-1B:
            "nprobe=256,quantizer_efSearch=64",
            "nprobe=256,quantizer_efSearch=128",
            "nprobe=256,quantizer_efSearch=512"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":50, "M":80,
+            "url": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/diskann-T2-baseline-indices/bigann-1B/R100_L100_B50_M80"
+          }]
+        query-args: |
+          [{"Ls":30, "BW":4, "T":16},
+           {"Ls":40, "BW":4, "T":16},
+           {"Ls":50, "BW":4, "T":16},
+           {"Ls":55, "BW":4, "T":16},
+           {"Ls":60, "BW":4, "T":16},
+           {"Ls":62, "BW":4, "T":16},
+           {"Ls":65, "BW":4, "T":16},
+           {"Ls":70, "BW":4, "T":16},
+           {"Ls":80, "BW":4, "T":16},
+           {"Ls":100, "BW":4, "T":16}]
 ssnpp-1B:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -274,6 +384,28 @@ ssnpp-1B:
            "nprobe=32,quantizer_efSearch=512,ht=256",
            "nprobe=64,quantizer_efSearch=512,ht=126",
            "nprobe=256,quantizer_efSearch=256,ht=128"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":60, "M":100, "C":500000, "CM":2,
+            "url": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/diskann-T2-baseline-indices/fbssnpp-1B/R100_L100_B60_M100"
+          }]
+        query-args: |
+          [{"Ls":30, "BW":4, "T":16},
+           {"Ls":40, "BW":4, "T":16},
+           {"Ls":50, "BW":4, "T":16},
+           {"Ls":55, "BW":4, "T":16},
+           {"Ls":60, "BW":4, "T":16},
+           {"Ls":62, "BW":4, "T":16},
+           {"Ls":65, "BW":4, "T":16},
+           {"Ls":70, "BW":4, "T":16},
+           {"Ls":80, "BW":4, "T":16},
+           {"Ls":100, "BW":4, "T":16}]
 text2image-1B:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -308,6 +440,28 @@ text2image-1B:
            "nprobe=128,quantizer_efSearch=512,ht=256",
            "nprobe=256,quantizer_efSearch=512,ht=120",
            "nprobe=256,quantizer_efSearch=512,ht=122"]
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":60, "M":115, "PQ":200, "C":500000, "CM":2,
+            "url": "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/diskann-T2-baseline-indices/text2image-1B/R100_L100_B60_M115_PQ200"
+          }]
+        query-args: |
+          [{"Ls":10, "BW":10, "T":16},
+           {"Ls":20, "BW":10, "T":16},
+           {"Ls":30, "BW":10, "T":16},
+           {"Ls":40, "BW":10, "T":16},
+           {"Ls":50, "BW":10, "T":16},
+           {"Ls":60, "BW":10, "T":16},
+           {"Ls":70, "BW":10, "T":16},
+           {"Ls":80, "BW":10, "T":16},
+           {"Ls":90, "BW":10, "T":16},
+           {"Ls":100, "BW":10, "T":16}]
 ssnpp-10M:
   faiss-t1:
     docker-tag: billion-scale-benchmark-faissconda
@@ -324,3 +478,75 @@ ssnpp-10M:
            "nprobe=1,quantizer_efSearch=4,ht=98",
            "nprobe=1,quantizer_efSearch=4,ht=104",
            "nprobe=1,quantizer_efSearch=4,ht=112"]
+deep-10M:
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":0.3, "M":15}]
+        query-args: |
+          [{"Ls":50, "BW":4, "T":16}]
+bigann-10M:
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":100, "L":100, "B":0.3, "M":15}]
+        query-args: |
+          [{"Ls":50, "BW":4, "T":16}]
+msturing-1M:
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":50, "L":50, "B":0.03, "M":1}]
+        query-args: |
+          [{"Ls":50, "BW":4, "T":16}]
+msspacev-1M:
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":50, "L":50, "B":0.03, "M":1}]
+        query-args: |
+          [{"Ls":50, "BW":4, "T":16}]
+text2image-1M:
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":50, "L":50, "B":0.03, "M":1, "PQ":200}]
+        query-args: |
+          [{"Ls":50, "BW":4, "T":16}]
+text2image-10M:
+  diskann-t2:
+    docker-tag: billion-scale-benchmark-diskann
+    module: benchmark.algorithms.diskann-t2
+    constructor: Diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":50, "L":50, "B":0.3, "M":10, "PQ":200}]
+        query-args: |
+          [{"Ls":50, "BW":4, "T":16}]
diff --git a/benchmark/algorithms/diskann-t2.py b/benchmark/algorithms/diskann-t2.py
new file mode 100755
index 00000000..0f8ce9ee
--- /dev/null
+++ b/benchmark/algorithms/diskann-t2.py
@@ -0,0 +1,214 @@
+from __future__ import absolute_import
+import psutil
+import os
+import time
+import numpy as np
+import diskannpy
+
+from benchmark.algorithms.base import BaseANN
+from benchmark.datasets import DATASETS, download_accelerated
+
+class Diskann(BaseANN):
+    def __init__(self, metric, index_params):
+        if (index_params.get("R")==None):
+            print("Error: missing parameter R")
+            return
+        if (index_params.get("L")==None):
+            print("Error: missing parameter L")
+            return
+        if (index_params.get("B")==None):
+            print("Error: missing parameter B")
+            return
+        if (index_params.get("M")==None):
+            print("Error: missing parameter M")
+            return
+
+        self._index_params = index_params
+        self._metric = metric
+
+        self.R = index_params.get("R")
+        self.L = index_params.get("L")
+        self.B = index_params.get("B")
+        self.M = index_params.get("M")
+        self.PQ = 0 if index_params.get("PQ") == None else index_params.get("PQ")
+        self.C = -1 if index_params.get("C") == None else index_params.get("C")
+        self.cache_mechanism = 1 if index_params.get("CM") == None else index_params.get("CM")
+        if self.C == 0:
+            self.cache_mechanism = 0
+        print(self.PQ)
+
+    def index_name(self):
+        if self.PQ == 0:
+            return f"R{self.R}_L{self.L}_B{self.B}_M{self.M}"
+        else:
+            return f"R{self.R}_L{self.L}_B{self.B}_M{self.M}_PQ{self.PQ}"
+
+    def create_index_dir(self, dataset):
+        index_dir = os.path.join(os.getcwd(), "data", "indices")
+        os.makedirs(index_dir, mode=0o777, exist_ok=True)
+        index_dir = os.path.join(index_dir, "T2")
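+        # Successive joins build the layout data/indices/T2/DiskANN/<dataset short name>/<index name>.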
+        os.makedirs(index_dir, mode=0o777, exist_ok=True)
+        index_dir = os.path.join(index_dir, self.__str__())
+        os.makedirs(index_dir, mode=0o777, exist_ok=True)
+        index_dir = os.path.join(index_dir, dataset.short_name())
+        os.makedirs(index_dir, mode=0o777, exist_ok=True)
+        index_dir = os.path.join(index_dir, self.index_name())
+        os.makedirs(index_dir, mode=0o777, exist_ok=True)
+        return index_dir
+
+    def fit(self, dataset):
+        """
+        Build the index for the data points given in dataset name.
+        """
+
+        ds = DATASETS[dataset]()
+        d = ds.d
+
+        buildthreads = self._index_params.get("buildthreads", -1)
+        if buildthreads == -1:
+            buildthreads = diskannpy.omp_get_max_threads()
+
+        print("Set build-time number of threads:", buildthreads)
+        diskannpy.omp_set_num_threads(buildthreads)
+
+        index_dir = self.create_index_dir(ds)
+        self.index_path = os.path.join(index_dir, self.index_name())
+
+        if ds.distance() == "euclidean":
+            metric = diskannpy.L2
+        elif ds.distance() == "ip":
+            metric = diskannpy.INNER_PRODUCT
+        else:
+            print("Unsupported distance function.")
+            return False
+
+
+        if not hasattr(self, 'index'):
+            if ds.dtype == "float32":
+                self.index = diskannpy.DiskANNFloatIndex(metric)
+            elif ds.dtype == "int8":
+                self.index = diskannpy.DiskANNInt8Index(metric)
+            elif ds.dtype == "uint8":
+                self.index = diskannpy.DiskANNUInt8Index(metric)
+            else:
+                print("Unsupported data type.")
+                return False
+
+            start = time.time()
+            if self.PQ > 0:
+                self.index.build(ds.get_dataset_fn(), self.index_path, self.R, self.L, self.B, self.M, buildthreads, self.PQ)
+            else:
+                self.index.build(ds.get_dataset_fn(), self.index_path, self.R, self.L, self.B, self.M, buildthreads)
+            end = time.time()
+            print("DiskANN index built in %.3f s" % (end - start))
+
+
+        if self.C > 0:
+            num_nodes_to_cache = self.C
+        else:
+            num_nodes_to_cache = int(ds.nb/1000) if ds.nb > 1000000 else int(ds.nb/100)
+        print(f"Loading index and caching {num_nodes_to_cache} nodes..")
+        self.index.load_index(self.index_path, diskannpy.omp_get_max_threads(), num_nodes_to_cache, self.cache_mechanism)
+
+    def load_index(self, dataset):
+        """
+        Load the index for dataset. Returns False if index
+        is not available, True otherwise.
+
+        Checking the index usually involves the dataset name
+        and the index build parameters passed during construction.
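+
+        If the index parameters include a 'url', any missing index
+        files are fetched from that location via download_accelerated().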
+        """
+        ds = DATASETS[dataset]()
+        if ds.distance() == "euclidean":
+            metric = diskannpy.L2
+        elif ds.distance() == "ip":
+            metric = diskannpy.INNER_PRODUCT
+        else:
+            print("Unsupported distance function.")
+            return False
+
+        if ds.dtype == "float32":
+            self.index = diskannpy.DiskANNFloatIndex(metric)
+        elif ds.dtype == "int8":
+            self.index = diskannpy.DiskANNInt8Index(metric)
+        elif ds.dtype == "uint8":
+            self.index = diskannpy.DiskANNUInt8Index(metric)
+        else:
+            print("Unsupported data type.")
+            return False
+
+        index_dir = self.create_index_dir(ds)
+        if not (os.path.exists(index_dir)) and 'url' not in self._index_params:
+            return False
+
+        index_path = os.path.join(index_dir, self.index_name())
+        index_components = [
+            'pq_pivots.bin', 'pq_pivots.bin_centroid.bin', 'pq_pivots.bin_chunk_offsets.bin',
+            'pq_pivots.bin_rearrangement_perm.bin', 'sample_data.bin', 'sample_ids.bin',
+            'pq_compressed.bin', 'disk.index'
+        ]
+        if ds.distance() == "ip":
+            index_components = index_components + [
+                'disk.index_centroids.bin', 'disk.index_max_base_norm.bin', 'disk.index_medoids.bin'
+            ]
+        if self.PQ > 0:
+            index_components = index_components + [
+                'disk.index_pq_pivots.bin', 'disk.index_pq_pivots.bin_centroid.bin',
+                'disk.index_pq_pivots.bin_chunk_offsets.bin', 'disk.index_pq_pivots.bin_rearrangement_perm.bin'
+            ]
+
+        for component in index_components:
+            index_file = index_path + '_' + component
+            if not (os.path.exists(index_file)):
+                if 'url' in self._index_params:
+                    index_file_source = self._index_params['url'] + '/' + self.index_name() + '_' + component
+                    print("Downloading index in background. This can take a while.")
+                    download_accelerated(index_file_source, index_file, quiet=True)
+                else:
+                    return False
+
+        print("Loading index")
+
+        if self.C > 0:
+            num_nodes_to_cache = self.C
+        else:
+            num_nodes_to_cache = int(ds.nb/1000) if ds.nb > 1000000 else int(ds.nb/100)
+        if (self.index.load_index(index_path, diskannpy.omp_get_max_threads(), num_nodes_to_cache, self.cache_mechanism) == 0):
+            print("Load index success.")
+            return True
+        else:
+            return False
+
+    def query(self, X, k):
+        """Carry out a batch query for k-NN of query set X."""
+        nq, dim = (np.shape(X))
+        self.res, self.query_dists = self.index.batch_search_numpy_input(X, dim, nq, k, self.Ls, self.BW, self.threads)
+
+    def range_query(self, X, radius):
+        """
+        Carry out a batch query for range search with
+        radius.
+        """
+        nq, dim = np.shape(X)
+        self.rangeres_lim, (self.rangeres_ids, self.rangeres_dists) = self.index.batch_range_search_numpy_input(
+            X, dim, nq, radius, self.Ls, self.BW, self.threads)
+
+    def get_range_results(self):
+        return (self.rangeres_lim, self.rangeres_ids, self.rangeres_dists)
+
+    def get_additional(self):
+        """
+        Allows to retrieve additional results.
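+        DiskANN exposes none, so an empty dict is returned.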
+        """
+        return {}
+
+    def set_query_arguments(self, query_args):
+        self._query_args = query_args
+        self.Ls = self._query_args.get("Ls")
+        self.BW = self._query_args.get("BW")
+        self.threads = self._query_args.get("T")
+
+    def __str__(self):
+        return "DiskANN"
diff --git a/benchmark/algorithms/diskann.py b/benchmark/algorithms/diskann.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmark/algorithms/faiss_t3.py b/benchmark/algorithms/faiss_t3.py
index 49d68d24..59ef4104 100644
--- a/benchmark/algorithms/faiss_t3.py
+++ b/benchmark/algorithms/faiss_t3.py
@@ -211,7 +211,11 @@ def __init__(self, index, search_bs):
         self.search_bs = search_bs
         index_ivf, vec_transform = unwind_index_ivf(index)
         self.index_ivf = index_ivf
-        self.vec_transform = vec_transform.apply
+        if vec_transform:
+#            print(type(vec_transform), dir(vec_transform))
+            self.vec_transform = vec_transform.apply
+        else:
+            self.vec_transform = None
         self.quantizer_gpu = faiss.index_cpu_to_all_gpus(self.index_ivf.quantizer)
diff --git a/benchmark/datasets.py b/benchmark/datasets.py
index e2f9c3dd..ca52b4fe 100644
--- a/benchmark/datasets.py
+++ b/benchmark/datasets.py
@@ -204,7 +204,9 @@ def distance(self):
     def default_count(self):
         return 10
-
+    def short_name(self):
+        return f"{self.__class__.__name__}-{self.nb}"
+
     def __str__(self):
         return (
             f"Dataset {self.__class__.__name__} in dimension {self.d}, with distance {self.distance()}, "
diff --git a/benchmark/main.py b/benchmark/main.py
index 6627d429..13aa403d 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -182,25 +182,26 @@ def main():
         logger.info(f'running only {args.algorithm}')
         definitions = [d for d in definitions if d.algorithm == args.algorithm]
 
-    # See which Docker images we have available
-    docker_client = docker.from_env()
-    docker_tags = set()
-    for image in docker_client.images.list():
-        for tag in image.tags:
-            tag = tag.split(':')[0]
-            docker_tags.add(tag)
-
-    if args.docker_tag:
-        logger.info(f'running only {args.docker_tag}')
-        definitions = [
-            d for d in definitions if d.docker_tag == args.docker_tag]
-
-    if set(d.docker_tag for d in definitions).difference(docker_tags) and not args.nodocker:
-        logger.info(f'not all docker images available, only: {set(docker_tags)}')
-        logger.info(f'missing docker images: '
-                    f'{str(set(d.docker_tag for d in definitions).difference(docker_tags))}')
-        definitions = [
-            d for d in definitions if d.docker_tag in docker_tags]
+    if not args.nodocker:
+        # See which Docker images we have available
+        docker_client = docker.from_env()
+        docker_tags = set()
+        for image in docker_client.images.list():
+            for tag in image.tags:
+                tag = tag.split(':')[0]
+                docker_tags.add(tag)
+
+        if args.docker_tag:
+            logger.info(f'running only {args.docker_tag}')
+            definitions = [
+                d for d in definitions if d.docker_tag == args.docker_tag]
+
+        if set(d.docker_tag for d in definitions).difference(docker_tags):
+            logger.info(f'not all docker images available, only: {set(docker_tags)}')
+            logger.info(f'missing docker images: '
+                        f'{str(set(d.docker_tag for d in definitions).difference(docker_tags))}')
+            definitions = [
+                d for d in definitions if d.docker_tag in docker_tags]
 
     if args.max_n_algorithms >= 0:
         definitions = definitions[:args.max_n_algorithms]
diff --git a/benchmark/plotting/utils.py b/benchmark/plotting/utils.py
index 9a599303..033de092 100644
--- a/benchmark/plotting/utils.py
+++ b/benchmark/plotting/utils.py
@@ -67,17 +67,13 @@ def compute_metrics(true_nn, res, metric_1, metric_2,
     return all_results
 
 
-def compute_metrics_all_runs(dataset, res, recompute=False):
+def compute_metrics_all_runs(dataset, res, recompute=False, sensor_metrics=False):
     try:
         true_nn = dataset.get_groundtruth()
     except:
         print(f"Groundtruth for {dataset} not found.")
         return
 
-    # removes 'wspq' metric if no power benchmarks found
-    # in the loaded runs
-    power_capture.detect_power_benchmarks(metrics, res)
-
     search_type = dataset.search_type()
     for i, (properties, run) in enumerate(res):
         algo = properties['algo']
@@ -112,6 +108,8 @@ def compute_metrics_all_runs(dataset, res, recompute=False):
         if search_type == "knn" and name == "ap" or\
            search_type == "range" and name == "k-nn":
             continue
+        if not sensor_metrics and name=="wspq":  # don't process sensor metrics by default
+            break
         v = metric["function"](true_nn, run_nn, metrics_cache, properties)
         run_result[name] = v
     yield run_result
diff --git a/benchmark/results.py b/benchmark/results.py
index 9a236003..ff2ec5ee 100644
--- a/benchmark/results.py
+++ b/benchmark/results.py
@@ -57,6 +57,9 @@ def store_results(dataset, count, definition, query_arguments,
 def load_all_results(dataset=None, count=None):
+    """
+    A generator for all result files.
+    """
     for root, _, files in os.walk(get_result_filename(dataset, count)):
         for fn in files:
             if os.path.splitext(fn)[-1] != '.hdf5':
diff --git a/benchmark/runner.py b/benchmark/runner.py
index 3f74011c..4d139b3f 100644
--- a/benchmark/runner.py
+++ b/benchmark/runner.py
@@ -29,6 +29,7 @@ def run_individual_query(algo, X, distance, count, run_count, search_type):
             algo.query(X, count)
             total = (time.time() - start)
             results = algo.get_results()
+            assert len(results) == len(X)
         else:
             algo.range_query(X, count)
             total = (time.time() - start)
@@ -106,7 +107,7 @@ def run(definition, dataset, count, run_count, rebuild):
                                  run_count, search_type, descriptor)
             store_results(dataset, count, definition,
-                    query_arguments, descriptor, results, search_type)
+                          query_arguments, descriptor, results, search_type)
     finally:
         algo.done()
@@ -202,7 +203,7 @@ def run_docker(definition, dataset, count, runs, timeout, rebuild,
     if mem_limit is None:
         mem_limit = psutil.virtual_memory().available
-
+    container = None
     if t3:
         container = t3_create_container(definition, cmd, cpu_limit, mem_limit )
diff --git a/install/Dockerfile b/install/Dockerfile
index 3f22a66b..259e2083 100644
--- a/install/Dockerfile
+++ b/install/Dockerfile
@@ -1,6 +1,9 @@
 FROM ubuntu:18.04
 
-RUN apt-get update && apt-get install -y python3-numpy python3-scipy python3-pip build-essential git axel
+RUN apt-get update && apt-get install -y python3-numpy python3-scipy python3-pip build-essential git axel wget
+RUN wget https://aka.ms/downloadazcopy-v10-linux && mv downloadazcopy-v10-linux azcopy.tgz && tar xzf azcopy.tgz --transform 's!^[^/]\+\($\|/\)!azcopy_folder\1!'
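+# The tar --transform above renames the versioned top-level folder in the azcopy tarball
+# to the fixed name azcopy_folder, so the binary can be copied from a known path.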
+RUN cp azcopy_folder/azcopy /usr/bin
+
 RUN pip3 install -U pip
 WORKDIR /home/app
diff --git a/install/Dockerfile.diskann b/install/Dockerfile.diskann
new file mode 100644
index 00000000..54599646
--- /dev/null
+++ b/install/Dockerfile.diskann
@@ -0,0 +1,29 @@
+FROM billion-scale-benchmark
+
+RUN apt-get update
+RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip
+RUN pip3 install pybind11 numpy
+
+RUN cd /tmp && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
+RUN cd /tmp && apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
+RUN cd /tmp && rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
+RUN cd /tmp && sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list'
+RUN apt-get update
+RUN apt-get install -y intel-mkl-64bit-2020.0-088
+
+RUN update-alternatives --install /usr/lib/x86_64-linux-gnu/libblas.so libblas.so-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150
+RUN update-alternatives --install /usr/lib/x86_64-linux-gnu/libblas.so.3 libblas.so.3-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150
+RUN update-alternatives --install /usr/lib/x86_64-linux-gnu/liblapack.so liblapack.so-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150
+RUN update-alternatives --install /usr/lib/x86_64-linux-gnu/liblapack.so.3 liblapack.so.3-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150
+
+RUN echo "/opt/intel/lib/intel64" > /etc/ld.so.conf.d/mkl.conf
+RUN echo "/opt/intel/mkl/lib/intel64" >> /etc/ld.so.conf.d/mkl.conf
+RUN ldconfig
+RUN echo "MKL_THREADING_LAYER=GNU" >> /etc/environment
+
+RUN git clone --single-branch --branch python_bindings_diskann https://github.com/microsoft/diskann
+RUN mkdir -p diskann/build
+RUN cd diskann/build && cmake -DCMAKE_BUILD_TYPE=Release ..
+RUN cd diskann/build && make -j
+RUN cd diskann/python && pip install -e .
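+# Import check: fails the image build immediately if the diskannpy bindings did not compile and install.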
+RUN python3 -c 'import diskannpy'
diff --git a/requirements.txt b/requirements.txt
index 5ce8131f..13100ee0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ pyyaml==5.1
 psutil==5.6.6
 scipy==1.0.0
 scikit-learn==0.19.1
-jinja2==2.10.1
+jinja2==2.11.3
 pandas
diff --git a/requirements_py38.txt b/requirements_py38.txt
index 810067d1..a4035bc1 100644
--- a/requirements_py38.txt
+++ b/requirements_py38.txt
@@ -2,10 +2,10 @@ ansicolors==1.1.8
 docker==2.6.1
 h5py==2.10.0
 matplotlib==3.3.4
-numpy==1.20.1
+numpy==1.19.5
 pyyaml==5.3.1
 psutil==5.6.6
-scipy==1.6.1
+scipy==1.5.4
 scikit-learn
-jinja2==2.10.1
-pandas==1.2.3
+jinja2==2.11.3
+pandas==1.1.5
diff --git a/results/text2image-1B.png b/results/text2image-1B.png
old mode 100644
new mode 100755
diff --git a/t3/faiss_t3/algos.yaml b/t3/faiss_t3/algos.yaml
index 1bab0978..5f36333c 100644
--- a/t3/faiss_t3/algos.yaml
+++ b/t3/faiss_t3/algos.yaml
@@ -39,7 +39,8 @@ deep-1B:
     run-groups:
       base:
         args: |
-          [{"indexkey": "IVF1048576,SQ8"}]
+          [{"indexkey": "IVF1048576,SQ8",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/deep-1B.IVF1048576%2CSQ8.faissindex"}]
         query-args: |
           ["nprobe=1",
            "nprobe=2",
@@ -47,8 +48,15 @@ deep-1B:
            "nprobe=8",
            "nprobe=16",
            "nprobe=32",
+           "nprobe=40",
+           "nprobe=50",
            "nprobe=64",
            "nprobe=128",
+           "nprobe=140",
+           "nprobe=160",
+           "nprobe=180",
+           "nprobe=190",
+           "nprobe=200",
            "nprobe=256"]
 bigann-1B:
   faiss-t3:
@@ -59,7 +67,8 @@ bigann-1B:
     run-groups:
       base:
         args: |
-          [{"indexkey": "IVF1048576,SQ8"}]
+          [{"indexkey": "IVF1048576,SQ8",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/bigann-1B.IVF1048576%2CSQ8.faissindex"}]
         query-args: |
           ["nprobe=1",
            "nprobe=2",
@@ -79,7 +88,8 @@ msspacev-1B:
     run-groups:
       base:
         args: |
-          [{"indexkey": "IVF1048576,SQ8"}]
+          [{"indexkey": "IVF1048576,SQ8",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/msspacev-1B.IVF1048576%2CSQ8.faissindex"}]
         query-args: |
           ["nprobe=1",
            "nprobe=2",
@@ -99,7 +109,8 @@ text2image-1B:
     run-groups:
       base:
         args: |
-          [{"indexkey": "IVF1048576,SQ8"}]
+          [{"indexkey": "IVF1048576,SQ8",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/text2image-1B.IVF1048576%2CSQ8.faissindex"}]
         query-args: |
           ["nprobe=1",
            "nprobe=2",
@@ -119,7 +130,8 @@ msturing-10M:
     run-groups:
       base:
         args: |
-          [{"indexkey": "IVF1048576,SQ8"}]
+          [{"indexkey": "IVF1048576,SQ8",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/msturing-10M.IVF1048576%2CSQ8.faissindex"}]
         query-args: |
           ["nprobe=1",
            "nprobe=2",
@@ -139,27 +151,8 @@ msturing-1B:
     run-groups:
      base:
         args: |
-          [{"indexkey": "IVF1048576,SQ8"}]
-        query-args: |
-          ["nprobe=1",
-           "nprobe=2",
-           "nprobe=4",
-           "nprobe=8",
-           "nprobe=16",
-           "nprobe=32",
-           "nprobe=64",
-           "nprobe=128",
-           "nprobe=256"]
-msturing-1B:
-  faiss-t3:
-    docker-tag: billion-scale-benchmark-faiss_t3
-    module: benchmark.algorithms.faiss_t3
-    constructor: FaissT3
-    base-args: ["@metric"]
-    run-groups:
-      base:
-        args: |
-          [{"indexkey": "IVF1048576,SQ8"}]
+          [{"indexkey": "IVF1048576,SQ8",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/msturing-1B.IVF1048576%2CSQ8.faissindex"}]
         query-args: |
           ["nprobe=1",
            "nprobe=2",
@@ -180,6 +173,7 @@ ssnpp-1M:
       base:
         args: |
           [{"indexkey": "OPQ32_128,IVF100_HNSW32,PQ32",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/ssnpp-1M.IVF1048576%2CSQ8.faissindex",
            "parallel_mode": 2,
            "add_splits": 0}]
         query-args: |
@@ -202,6 +196,7 @@ ssnpp-10M:
       base:
         args: |
           [{"indexkey": "OPQ32_128,IVF1048576_HNSW32,PQ32",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/ssnpp-10M.OPQ32_128%2CIVF1048576_HNSW32%2CPQ32.faissindex",
            "parallel_mode": 2,
            "add_splits": 0}]
         query-args: |
@@ -223,7 +218,8 @@ ssnpp-1B:
     run-groups:
       base:
         args: |
-          [{"indexkey": "OPQ32_128,IVF1048576_HNSW32,PQ32",
+          [{"indexkey": "IVF1048576,SQ8",
+            "url":"https://storage.googleapis.com/bigann/indexes/faiss/1.7.1/ssnpp-1B.IVF1048576%2CSQ8.faissindex",
            "parallel_mode": 2,
            "add_splits": 0}]
         query-args: |