Commit

Merge branch 'harsha-simhadri:main' into main
DmitryKey authored Oct 1, 2021
2 parents 920e6f6 + 2dda69f commit adc134f
Showing 22 changed files with 494 additions and 90 deletions.
28 changes: 3 additions & 25 deletions README.md
@@ -43,26 +43,6 @@ and `ALGO` is the name of the algorithm. (Use `python run.py --list-algorithms`)

The parameters used by the implementation to build and query the index can be found in `algos.yaml`.
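As a quick way to inspect those parameters, the file can be loaded directly. A minimal sketch, assuming PyYAML is installed and the script is run from the repository root (the exact schema is whatever `algos.yaml` defines):

```
import yaml  # assumes PyYAML is available

# Print each top-level entry of algos.yaml with its parameter blocks.
with open("algos.yaml") as f:
    config = yaml.safe_load(f)

for name, entry in config.items():
    print(name, "->", entry)
```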


## Evaluating the Results
Run `sudo python plot.py --dataset ...` to plot results, or `sudo python data_export.py --output res.csv` to dump them all to CSV for further post-processing.
To avoid sudo, run `sudo chmod -R 777 results/` before invoking these scripts.

To get a table overview of the best recall/AP achieved above a certain threshold, run `python3 eval/show_operating_points.py --algorithm $ALGO --threshold $THRESHOLD res.csv`, where `res.csv` is the file produced by running `data_export.py` above.

For the track 1 baseline, running `python3 eval/show_operating_points.py --algorithm faiss-t1 --threshold 10000 res.csv` produced:

```
recall/ap
algorithm dataset
faiss-t1 bigann-1B 0.634510
deep-1B 0.650280
msspacev-1B 0.728861
msturing-1B 0.703611
ssnpp-1B 0.753780
text2image-1B 0.069275
```

## Running the track 1 baseline
After running the installation, we can evaluate the baseline as follows.

@@ -83,10 +63,8 @@ python data_export.py --output res.csv
python3.8 eval/show_operating_points.py --algorithm faiss-t1 --threshold 10000 res.csv
```

## Including your algorithm

1. Add your algorithm into `benchmark/algorithms` by providing a small Python wrapper inheriting from `BaseANN`, which is defined in `benchmark/algorithms/base.py`. See `benchmark/algorithms/faiss_t1.py` for an example.
2. Add a Dockerfile in `install/`.
3. Edit `algos.yaml` with the parameter choices you would like to test.
4. Optionally, add an option to download pre-built indexes, as seen in `faiss_t1.py`.
## Including your algorithm and Evaluating the Results
See [Track T1/T2](t1_t2/README.md) for more details on evaluation for Tracks T1 and T2.

See [Track T3](t3/README.md) for more details on evaluation for Track T3.
33 changes: 29 additions & 4 deletions benchmark/algorithms/base.py
@@ -3,14 +3,26 @@

class BaseANN(object):
def done(self):
"""
This is called after results have been processed.
Use it for cleaning up if necessary.
"""
pass

def track(self):
"""
return "T1" if submitting an entry for track 1
return "T2" if submitting an entry for track 2
return "T3" if submitting an entry for track 3
"""
raise NotImplementedError()

def fit(self, dataset):
"""
Build the index for the data points of the named dataset.
Assumes that the index is loaded into memory after fitting.
"""
pass
raise NotImplementedError()

def load_index(self, dataset):
"""
@@ -20,18 +32,31 @@ def load_index(self, dataset):
Checking the index usually involves the dataset name
and the index build parameters passed during construction.
"""
pass
raise NotImplementedError()

def index_files_to_store(self, dataset):
"""
Specify a triplet with the local directory path of index files,
the common prefix name of index component(s) and a list of
index components that need to be uploaded to (after build)
or downloaded from (for search) cloud storage.
For the local directory path in the Docker environment, please
use a directory under
data/indices/track(T1 or T2)/algo.__str__()/DATASETS[dataset]().short_name()
"""
raise NotImplementedError()

def query(self, X, k):
"""Carry out a batch query for k-NN of query set X."""
pass
raise NotImplementedError()

def range_query(self, X, radius):
"""
Carry out a batch query for range search with
radius.
"""
pass
raise NotImplementedError()

def get_results(self):
"""
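To make the contract above concrete, here is a minimal sketch of a wrapper that implements it. Everything here is illustrative: the class, its brute-force search, and the `get_dataset()` accessor are assumptions for the sketch, not code from the repository.

```
import numpy as np

from benchmark.algorithms.base import BaseANN
from benchmark.datasets import DATASETS


class ExactScan(BaseANN):
    """Illustrative brute-force wrapper; not a real submission."""

    def __init__(self, metric, index_params):
        self.name = "exact-scan"
        self._metric = metric
        self._data = None
        self._res = None

    def track(self):
        return "T1"

    def fit(self, dataset):
        # A real entry would build an index; this sketch keeps the raw
        # base vectors in memory (get_dataset() is an assumed accessor).
        self._data = DATASETS[dataset]().get_dataset().astype(np.float32)

    def load_index(self, dataset):
        # Nothing pre-built to load; returning False forces fit().
        return False

    def query(self, X, k):
        # Exhaustive squared-L2 scan: fine for a sketch, far too slow
        # for the billion-scale datasets this benchmark targets.
        d = ((X[:, None, :].astype(np.float32) - self._data[None]) ** 2).sum(-1)
        self._res = np.argsort(d, axis=1)[:, :k]

    def get_results(self):
        return self._res
```

A `range_query()` or `index_files_to_store()` method would be added in the same way for range-search tracks and for entries that ship pre-built indexes.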
49 changes: 28 additions & 21 deletions benchmark/algorithms/diskann-t2.py
@@ -10,6 +10,7 @@

class Diskann(BaseANN):
def __init__(self, metric, index_params):
self.name = "DiskANN"
if index_params.get("R") is None:
print("Error: missing parameter R")
return
@@ -37,6 +38,9 @@ def __init__(self, metric, index_params):
self.cache_mechanism = 0
print(self.PQ)

def track(self):
return "T2"

def index_name(self):
if self.PQ == 0:
return f"R{self.R}_L{self.L}_B{self.B}_M{self.M}"
@@ -110,6 +114,27 @@ def fit(self, dataset):
print(f"Loading index and caching {num_nodes_to_cache} nodes..")
self.index.load_index(self.index_path, diskannpy.omp_get_max_threads(), num_nodes_to_cache, self.cache_mechanism)

def get_index_components(self, dataset):
index_components = [
'_pq_pivots.bin', '_pq_pivots.bin_centroid.bin', '_pq_pivots.bin_chunk_offsets.bin',
'_pq_pivots.bin_rearrangement_perm.bin', '_sample_data.bin', '_sample_ids.bin',
'_pq_compressed.bin', '_disk.index'
]
ds = DATASETS[dataset]()
if ds.distance() == "ip":
index_components = index_components + [
'_disk.index_centroids.bin', '_disk.index_max_base_norm.bin', '_disk.index_medoids.bin'
]
if self.PQ > 0:
index_components = index_components + [
'_disk.index_pq_pivots.bin', '_disk.index_pq_pivots.bin_centroid.bin',
'_disk.index_pq_pivots.bin_chunk_offsets.bin', '_disk.index_pq_pivots.bin_rearrangement_perm.bin'
]
return index_components

def index_files_to_store(self, dataset):
return [self.create_index_dir(DATASETS[dataset]()), self.index_name(), self.get_index_components(dataset)]

def load_index(self, dataset):
"""
Load the index for dataset. Returns False if index
@@ -142,27 +167,13 @@ def load_index(self, dataset):
return False

index_path = os.path.join(index_dir, self.index_name())
index_components = [
'pq_pivots.bin', 'pq_pivots.bin_centroid.bin', 'pq_pivots.bin_chunk_offsets.bin',
'pq_pivots.bin_rearrangement_perm.bin', 'sample_data.bin', 'sample_ids.bin',
'pq_compressed.bin', 'disk.index'
]
if ds.distance() == "ip":
index_components = index_components + [
'disk.index_centroids.bin', 'disk.index_max_base_norm.bin', 'disk.index_medoids.bin'
]
if self.PQ > 0:
index_components = index_components + [
'disk.index_pq_pivots.bin', 'disk.index_pq_pivots.bin_centroid.bin',
'disk.index_pq_pivots.bin_chunk_offsets.bin', 'disk.index_pq_pivots.bin_rearrangement_perm.bin'
]

index_components = self.get_index_components(dataset)

for component in index_components:
index_file = index_path + '_' + component
index_file = index_path + component
if not (os.path.exists(index_file)):
if 'url' in self._index_params:
index_file_source = self._index_params['url'] + '/' + self.index_name() + '_' + component
index_file_source = self._index_params['url'] + '/' + self.index_name() + component
print(f"Downloading index in background. This can take a while.")
download_accelerated(index_file_source, index_file, quiet=True)
else:
@@ -208,7 +219,3 @@ def set_query_arguments(self, query_args):
self.Ls = self._query_args.get("Ls")
self.BW = self._query_args.get("BW")
self.threads = self._query_args.get("T")


def __str__(self):
return "DiskANN"
3 changes: 3 additions & 0 deletions benchmark/algorithms/faiss_t1.py
@@ -84,6 +84,9 @@ def __init__(self, metric, index_params):
if 'query_bs' in index_params:
self._query_bs = index_params['query_bs']

def track(self):
return "T1"

def index_name(self, name):
return f"data/{name}.{self.indexkey}.faissindex"

3 changes: 3 additions & 0 deletions benchmark/algorithms/faiss_t3.py
@@ -305,6 +305,9 @@ class FaissT3(BaseANN):
def __init__(self, metric, index_params):
self._index_params = index_params
self._metric = metric

def track(self):
return "T3"

def index_name(self, name):
return f"data/{name}.{self._index_params['indexkey']}.faissindex"
18 changes: 16 additions & 2 deletions benchmark/datasets.py
@@ -52,11 +52,14 @@ def download(src, dst=None, max_size=None):
))


def download_accelerated(src, dst, quiet=False):
def download_accelerated(src, dst, quiet=False, sas_string=""):
""" dowload using an accelerator. Make sure the executable is in the path """
print('downloading %s -> %s...' % (src, dst))
if "windows.net" in src:
cmd = f"azcopy copy {src} {dst}"
if sas_string == "":
cmd = f"azcopy copy {src} {dst}"
else:
cmd = f"azcopy copy '{src}?{sas_string}' '{dst}'"
else:
cmd = f"axel --alternate -n 10 {src} -o {dst}"
if quiet:
@@ -66,6 +69,17 @@
ret = os.system(cmd)
assert ret == 0

def upload_accelerated(local_dir, blob_prefix, component, sas_string, quiet=False):
""" Upload index component to Azure blob using SAS string"""
src = os.path.join(local_dir, component)
dst = blob_prefix + '/' + component + '?' + sas_string
print('Uploading %s -> %s...' % (src, dst))

cmd = f"azcopy copy '{src}' '{dst}'"
print("running", cmd)
ret = os.system(cmd)
assert ret == 0
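A usage sketch for the two SAS-aware helpers above; the account URL, token, directory, and file name are placeholders, not real values:

```
from benchmark.datasets import download_accelerated, upload_accelerated

# Placeholder values -- never commit a real SAS token.
blob_prefix = "https://myaccount.blob.core.windows.net/indices/diskann-t2"
sas_string = "sv=2020-08-04&ss=b&sig=REDACTED"
component = "R100_L50_B0.3_M64_disk.index"

# After a build: push one index component to blob storage.
upload_accelerated("data/indices/T2/DiskANN", blob_prefix, component, sas_string)

# On the search machine: pull the same component back down.
download_accelerated(blob_prefix + "/" + component,
                     "data/indices/T2/DiskANN/" + component,
                     sas_string=sas_string)
```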


def bvecs_mmap(fname):
x = numpy.memmap(fname, dtype='uint8', mode='r')
27 changes: 24 additions & 3 deletions benchmark/main.py
Expand Up @@ -43,13 +43,17 @@ def run_worker(args, queue):

if args.nodocker:
run_no_docker(definition, args.dataset, args.count,
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture)
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture,
args.upload_index, args.download_index,
args.blob_prefix, args.sas_string)

else:
run_docker(definition, args.dataset, args.count,
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture )
args.t3, args.power_capture,
args.upload_index, args.download_index,
args.blob_prefix, args.sas_string)


def main():
@@ -124,7 +128,24 @@ def main():
'--nodocker',
help='Override default of invoking algorithm in docker container.',
action='store_true')
parser.add_argument(
'--upload-index',
help='Upload index to Azure blob storage and avoid local queries.',
action='store_true')
parser.add_argument(
'--download-index',
help='Download index uploaded to Azure blob storage and run local queries.',
action='store_true')
parser.add_argument(
'--blob-prefix',
help='Azure blob prefix to upload indices to and download indices from.'
)
parser.add_argument(
'--sas-string',
help='SAS string to authenticate to Azure blob storage.'
)


args = parser.parse_args()
if args.timeout == -1:
args.timeout = None
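A self-contained sketch of how the four new flags parse, with placeholders standing in for a real container URL and SAS token:

```
import argparse

# Mirrors the four additions above.
parser = argparse.ArgumentParser()
parser.add_argument('--upload-index', action='store_true')
parser.add_argument('--download-index', action='store_true')
parser.add_argument('--blob-prefix')
parser.add_argument('--sas-string')

args = parser.parse_args([
    '--download-index',
    '--blob-prefix', 'https://myaccount.blob.core.windows.net/indices',
    '--sas-string', 'sv=2020-08-04&sig=REDACTED',
])
assert args.download_index and not args.upload_index
print(args.blob_prefix)
```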
