Commit

Merge branch 'harsha-simhadri:main' into main
DmitryKey authored Oct 1, 2021
2 parents 920e6f6 + 2dda69f commit adc134f
Showing 22 changed files with 494 additions and 90 deletions.
28 changes: 3 additions & 25 deletions README.md
@@ -43,26 +43,6 @@ and `ALGO` is the name of the algorithm. (Use `python run.py --list-algorithms`)

The parameters used by the implementation to build and query the index can be found in `algos.yaml`.
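As a quick way to inspect those parameters, the file can be loaded directly. A minimal sketch, assuming PyYAML is installed and the script is run from the repository root (the exact schema is whatever `algos.yaml` defines):

```
import yaml  # assumes PyYAML is available

# Print each top-level entry of algos.yaml with its parameter blocks.
with open("algos.yaml") as f:
    config = yaml.safe_load(f)

for name, entry in config.items():
    print(name, "->", entry)
```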


## Evaluating the Results
Run `sudo python plot.py --dataset ...` to plot results, or `sudo python data_export.py --output res.csv` to dump them all to CSV for further post-processing.
To avoid sudo, run `sudo chmod -R 777 results/` before invoking these scripts.

To get a table overview of the best recall/AP achieved above a certain threshold, run `python3 eval/show_operating_points.py --algorithm $ALGO --threshold $THRESHOLD res.csv`, where `res.csv` is the file produced by running `data_export.py` above.

For the track 1 baseline, running `python3 eval/show_operating_points.py --algorithm faiss-t1 --threshold 10000 res.csv` produced:

```
recall/ap
algorithm dataset
faiss-t1 bigann-1B 0.634510
deep-1B 0.650280
msspacev-1B 0.728861
msturing-1B 0.703611
ssnpp-1B 0.753780
text2image-1B 0.069275
```

## Running the track 1 baseline
After running the installation, we can evaluate the baseline as follows.

@@ -83,10 +63,8 @@ python data_export.py --output res.csv
python3.8 eval/show_operating_points.py --algorithm faiss-t1 --threshold 10000 res.csv
```

## Including your algorithm

1. Add your algorithm into `benchmark/algorithms` by providing a small Python wrapper inheriting from `BaseANN`, which is defined in `benchmark/algorithms/base.py`. See `benchmark/algorithms/faiss_t1.py` for an example.
2. Add a Dockerfile in `install/`.
3. Edit `algos.yaml` with the parameter choices you would like to test.
4. Optionally, add an option to download pre-built indexes, as seen in `faiss_t1.py`.
## Including your algorithm and Evaluating the Results
See [Track T1/T2](t1_t2/README.md) for more details on evaluation for Tracks T1 and T2.

See [Track T3](t3/README.md) for more details on evaluation for Track T3.
33 changes: 29 additions & 4 deletions benchmark/algorithms/base.py
@@ -3,14 +3,26 @@

class BaseANN(object):
def done(self):
"""
This is called after results have been processed.
Use it for cleaning up if necessary.
"""
pass

def track(self):
"""
return "T1" if submitting an entry for track 1
return "T2" if submitting an entry for track 2
return "T3" if submitting an entry for track 3
"""
raise NotImplementedError()

def fit(self, dataset):
"""
Build the index for the data points of the named dataset.
Assumes that the index is loaded into memory after fitting.
"""
pass
raise NotImplementedError()

def load_index(self, dataset):
"""
@@ -20,18 +32,31 @@ def load_index(self, dataset):
Checking the index usually involves the dataset name
and the index build parameters passed during construction.
"""
pass
raise NotImplementedError()

def index_files_to_store(self, dataset):
"""
Specify a triplet with the local directory path of index files,
the common prefix name of index component(s) and a list of
index components that need to be uploaded to (after build)
or downloaded from (for search) cloud storage.
For the local directory path in the Docker environment, please
use a directory under
data/indices/track(T1 or T2)/algo.__str__()/DATASETS[dataset]().short_name()
"""
raise NotImplementedError()

def query(self, X, k):
"""Carry out a batch query for k-NN of query set X."""
pass
raise NotImplementedError()

def range_query(self, X, radius):
"""
Carry out a batch query for range search with
radius.
"""
pass
raise NotImplementedError()

def get_results(self):
"""
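To make the contract above concrete, here is a minimal sketch of a wrapper that implements it. Everything here is illustrative: the class, its brute-force search, and the `get_dataset()` accessor are assumptions for the sketch, not code from the repository.

```
import numpy as np

from benchmark.algorithms.base import BaseANN
from benchmark.datasets import DATASETS


class ExactScan(BaseANN):
    """Illustrative brute-force wrapper; not a real submission."""

    def __init__(self, metric, index_params):
        self.name = "exact-scan"
        self._metric = metric
        self._data = None
        self._res = None

    def track(self):
        return "T1"

    def fit(self, dataset):
        # A real entry would build an index; this sketch keeps the raw
        # base vectors in memory (get_dataset() is an assumed accessor).
        self._data = DATASETS[dataset]().get_dataset().astype(np.float32)

    def load_index(self, dataset):
        # Nothing pre-built to load; returning False forces fit().
        return False

    def query(self, X, k):
        # Exhaustive squared-L2 scan: fine for a sketch, far too slow
        # for the billion-scale datasets this benchmark targets.
        d = ((X[:, None, :].astype(np.float32) - self._data[None]) ** 2).sum(-1)
        self._res = np.argsort(d, axis=1)[:, :k]

    def get_results(self):
        return self._res
```

A `range_query()` or `index_files_to_store()` method would be added in the same way for range-search tracks and for entries that ship pre-built indexes.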
49 changes: 28 additions & 21 deletions benchmark/algorithms/diskann-t2.py
@@ -10,6 +10,7 @@

class Diskann(BaseANN):
def __init__(self, metric, index_params):
self.name = "DiskANN"
if index_params.get("R") is None:
print("Error: missing parameter R")
return
@@ -37,6 +38,9 @@ def __init__(self, metric, index_params):
self.cache_mechanism = 0
print(self.PQ)

def track(self):
return "T2"

def index_name(self):
if self.PQ == 0:
return f"R{self.R}_L{self.L}_B{self.B}_M{self.M}"
@@ -110,6 +114,27 @@ def fit(self, dataset):
print(f"Loading index and caching {num_nodes_to_cache} nodes..")
self.index.load_index(self.index_path, diskannpy.omp_get_max_threads(), num_nodes_to_cache, self.cache_mechanism)

def get_index_components(self, dataset):
index_components = [
'_pq_pivots.bin', '_pq_pivots.bin_centroid.bin', '_pq_pivots.bin_chunk_offsets.bin',
'_pq_pivots.bin_rearrangement_perm.bin', '_sample_data.bin', '_sample_ids.bin',
'_pq_compressed.bin', '_disk.index'
]
ds = DATASETS[dataset]()
if ds.distance() == "ip":
index_components = index_components + [
'_disk.index_centroids.bin', '_disk.index_max_base_norm.bin', '_disk.index_medoids.bin'
]
if self.PQ > 0:
index_components = index_components + [
'_disk.index_pq_pivots.bin', '_disk.index_pq_pivots.bin_centroid.bin',
'_disk.index_pq_pivots.bin_chunk_offsets.bin', '_disk.index_pq_pivots.bin_rearrangement_perm.bin'
]
return index_components

def index_files_to_store(self, dataset):
return [self.create_index_dir(DATASETS[dataset]()), self.index_name(), self.get_index_components(dataset)]

def load_index(self, dataset):
"""
Load the index for dataset. Returns False if index
@@ -142,27 +167,13 @@ def load_index(self, dataset):
return False

index_path = os.path.join(index_dir, self.index_name())
index_components = [
'pq_pivots.bin', 'pq_pivots.bin_centroid.bin', 'pq_pivots.bin_chunk_offsets.bin',
'pq_pivots.bin_rearrangement_perm.bin', 'sample_data.bin', 'sample_ids.bin',
'pq_compressed.bin', 'disk.index'
]
if ds.distance() == "ip":
index_components = index_components + [
'disk.index_centroids.bin', 'disk.index_max_base_norm.bin', 'disk.index_medoids.bin'
]
if self.PQ > 0:
index_components = index_components + [
'disk.index_pq_pivots.bin', 'disk.index_pq_pivots.bin_centroid.bin',
'disk.index_pq_pivots.bin_chunk_offsets.bin', 'disk.index_pq_pivots.bin_rearrangement_perm.bin'
]

index_components = self.get_index_components(dataset)

for component in index_components:
index_file = index_path + '_' + component
index_file = index_path + component
if not (os.path.exists(index_file)):
if 'url' in self._index_params:
index_file_source = self._index_params['url'] + '/' + self.index_name() + '_' + component
index_file_source = self._index_params['url'] + '/' + self.index_name() + component
print(f"Downloading index in background. This can take a while.")
download_accelerated(index_file_source, index_file, quiet=True)
else:
@@ -208,7 +219,3 @@ def set_query_arguments(self, query_args):
self.Ls = self._query_args.get("Ls")
self.BW = self._query_args.get("BW")
self.threads = self._query_args.get("T")


def __str__(self):
return "DiskANN"
3 changes: 3 additions & 0 deletions benchmark/algorithms/faiss_t1.py
@@ -84,6 +84,9 @@ def __init__(self, metric, index_params):
if 'query_bs' in index_params:
self._query_bs = index_params['query_bs']

def track(self):
return "T1"

def index_name(self, name):
return f"data/{name}.{self.indexkey}.faissindex"

3 changes: 3 additions & 0 deletions benchmark/algorithms/faiss_t3.py
@@ -305,6 +305,9 @@ class FaissT3(BaseANN):
def __init__(self, metric, index_params):
self._index_params = index_params
self._metric = metric

def track(self):
return "T3"

def index_name(self, name):
return f"data/{name}.{self._index_params['indexkey']}.faissindex"
18 changes: 16 additions & 2 deletions benchmark/datasets.py
@@ -52,11 +52,14 @@ def download(src, dst=None, max_size=None):
))


def download_accelerated(src, dst, quiet=False):
def download_accelerated(src, dst, quiet=False, sas_string=""):
""" dowload using an accelerator. Make sure the executable is in the path """
print('downloading %s -> %s...' % (src, dst))
if "windows.net" in src:
cmd = f"azcopy copy {src} {dst}"
if sas_string == "":
cmd = f"azcopy copy {src} {dst}"
else:
cmd = f"azcopy copy '{src}?{sas_string}' '{dst}'"
else:
cmd = f"axel --alternate -n 10 {src} -o {dst}"
if quiet:
@@ -66,6 +69,17 @@
ret = os.system(cmd)
assert ret == 0

def upload_accelerated(local_dir, blob_prefix, component, sas_string, quiet=False):
""" Upload index component to Azure blob using SAS string"""
src = os.path.join(local_dir, component)
dst = blob_prefix + '/' + component + '?' + sas_string
print('Uploading %s -> %s...' % (src, dst))

cmd = f"azcopy copy '{src}' '{dst}'"
print("running", cmd)
ret = os.system(cmd)
assert ret == 0
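A usage sketch for the two SAS-aware helpers above; the account URL, token, directory, and file name are placeholders, not real values:

```
from benchmark.datasets import download_accelerated, upload_accelerated

# Placeholder values -- never commit a real SAS token.
blob_prefix = "https://myaccount.blob.core.windows.net/indices/diskann-t2"
sas_string = "sv=2020-08-04&ss=b&sig=REDACTED"
component = "R100_L50_B0.3_M64_disk.index"

# After a build: push one index component to blob storage.
upload_accelerated("data/indices/T2/DiskANN", blob_prefix, component, sas_string)

# On the search machine: pull the same component back down.
download_accelerated(blob_prefix + "/" + component,
                     "data/indices/T2/DiskANN/" + component,
                     sas_string=sas_string)
```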


def bvecs_mmap(fname):
x = numpy.memmap(fname, dtype='uint8', mode='r')
27 changes: 24 additions & 3 deletions benchmark/main.py
Expand Up @@ -43,13 +43,17 @@ def run_worker(args, queue):

if args.nodocker:
run_no_docker(definition, args.dataset, args.count,
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture)
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture,
args.upload_index, args.download_index,
args.blob_prefix, args.sas_string)

else:
run_docker(definition, args.dataset, args.count,
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture )
args.t3, args.power_capture,
args.upload_index, args.download_index,
args.blob_prefix, args.sas_string)


def main():
@@ -124,7 +128,24 @@ def main():
'--nodocker',
help='Override default of invoking algorithm in docker container.',
action='store_true')
parser.add_argument(
'--upload-index',
help='Upload index to Azure blob storage and avoid local queries.',
action='store_true')
parser.add_argument(
'--download-index',
help='Download index uploaded to Azure blob storage and run local queries.',
action='store_true')
parser.add_argument(
'--blob-prefix',
help='Azure blob prefix to upload indices to and download indices from.'
)
parser.add_argument(
'--sas-string',
help='SAS string to authenticate to Azure blob storage.'
)


args = parser.parse_args()
if args.timeout == -1:
args.timeout = None
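A self-contained sketch of how the four new flags parse, with placeholders standing in for a real container URL and SAS token:

```
import argparse

# Mirrors the four additions above.
parser = argparse.ArgumentParser()
parser.add_argument('--upload-index', action='store_true')
parser.add_argument('--download-index', action='store_true')
parser.add_argument('--blob-prefix')
parser.add_argument('--sas-string')

args = parser.parse_args([
    '--download-index',
    '--blob-prefix', 'https://myaccount.blob.core.windows.net/indices',
    '--sas-string', 'sv=2020-08-04&sig=REDACTED',
])
assert args.download_index and not args.upload_index
print(args.blob_prefix)
```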
