diff --git a/.gitignore b/.gitignore index 0d327a6..4ae1e6c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info /*.egg + +# PyCharm-Files +/.idea diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..5f1b715 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,12 @@ +services: + - redis-server +language: python +python: + - "2.6" + - "2.7" + - "3.3" + - "3.4" +# command to install dependencies +install: "pip install -r requirements.txt" +# command to run tests +script: nosetests diff --git a/lshash/__init__.py b/lshash/__init__.py index e805896..e4424c7 100644 --- a/lshash/__init__.py +++ b/lshash/__init__.py @@ -9,4 +9,4 @@ __license__ = 'MIT' __version__ = '0.0.4dev' -from lshash import LSHash +from .lshash import LSHash diff --git a/lshash/lshash.py b/lshash/lshash.py index 5c895a6..9f375dd 100644 --- a/lshash/lshash.py +++ b/lshash/lshash.py @@ -4,11 +4,18 @@ # This module is part of lshash and is released under # the MIT License: http://www.opensource.org/licenses/mit-license.php +import sys + +if sys.version_info[0] >= 3: + basestring = str +else: + range = xrange + import os import json import numpy as np -from storage import storage +from .storage import storage try: from bitarray import bitarray @@ -92,7 +99,7 @@ def _init_uniform_planes(self): self.uniform_planes = [t[1] for t in npzfiles] else: self.uniform_planes = [self._generate_uniform_planes() - for _ in xrange(self.num_hashtables)] + for _ in range(self.num_hashtables)] try: np.savez_compressed(self.matrices_filename, *self.uniform_planes) @@ -101,14 +108,14 @@ def _init_uniform_planes(self): raise else: self.uniform_planes = [self._generate_uniform_planes() - for _ in xrange(self.num_hashtables)] + for _ in range(self.num_hashtables)] def _init_hashtables(self): """ Initialize the hash tables such that each record will be in the form of "[storage1, storage2, ...]" """ self.hash_tables = [storage(self.storage_config, i) - for i in xrange(self.num_hashtables)] + for i in range(self.num_hashtables)] def _generate_uniform_planes(self): """ Generate uniformly distributed hyperplanes and return it as a 2D diff --git a/lshash/storage.py b/lshash/storage.py index 85e4c4a..6352f9b 100644 --- a/lshash/storage.py +++ b/lshash/storage.py @@ -4,6 +4,8 @@ # This module is part of lshash and is released under # the MIT License: http://www.opensource.org/licenses/mit-license.php +from __future__ import unicode_literals + import json try: @@ -27,10 +29,6 @@ def storage(storage_config, index): class BaseStorage(object): - def __init__(self, config): - """ An abstract class used as an adapter printfor storages. """ - raise NotImplementedError - def keys(self): """ Returns a list of binary hashes that are used as dict keys. """ raise NotImplementedError @@ -81,14 +79,14 @@ def _list(self, key): def keys(self, pattern='*'): # return the keys BUT be agnostic with reference to the hash table - return [k.split('.')[1] for k in self.storage.keys(self.h_index + pattern)] + return [k.decode('ascii').split('.')[1] for k in self.storage.keys(self.h_index + pattern)] def append_val(self, key, val): self.storage.sadd(self._list(key), json.dumps(val)) def get_list(self, key): _list = list(self.storage.smembers(self._list(key))) # list elements are plain strings here - _list = [json.loads(el) for el in _list] # transform strings into python tuples + _list = [json.loads(el.decode('ascii')) for el in _list] # transform strings into python tuples for el in _list: # if len(el) is 2, then el[1] is the extra value associated to the element if len(el) == 2 and type(el[0]) == list: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..576fa0e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy>=1.9.1 +redis==2.10.3 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_lsh.py b/tests/test_lsh.py index a283bee..5a7f830 100644 --- a/tests/test_lsh.py +++ b/tests/test_lsh.py @@ -1,5 +1,6 @@ import random import string +from unittest import TestCase from redis import StrictRedis from pprint import pprint import sys @@ -10,112 +11,117 @@ # now we can use our lshash package and not the standard one from lshash import LSHash -num_elements = 100 -els = [] -el_names = [] -for i in range(num_elements): - el = [random.randint(0, 100) for _ in xrange(8)] - elname = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) - els.append(tuple(el)) - el_names.append(elname) +class TestLSHash(TestCase): + num_elements = 100 + def setUp(self): + self.els = [] + self.el_names = [] + for i in range(self.num_elements): + el = [random.randint(0, 100) for _ in range(8)] + elname = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) + self.els.append(tuple(el)) + self.el_names.append(elname) -def test_lshash(): - lsh = LSHash(6, 8, 1) - for i in xrange(num_elements): - lsh.index(list(els[i])) - lsh.index(list(els[i])) # multiple insertions - hasht = lsh.hash_tables[0] - itms = [hasht.get_list(k) for k in hasht.keys()] - for itm in itms: - assert itms.count(itm) == 1 - for el in itm: - assert el in els - for el in els: - res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] - # res is a tuple containing the vector and the distance - el_v, el_dist = res - assert el_v in els - assert el_dist == 0 - del lsh + def test_lshash(self): + lsh = LSHash(6, 8, 1) + for i in range(self.num_elements): + lsh.index(list(self.els[i])) + lsh.index(list(self.els[i])) # multiple insertions + hasht = lsh.hash_tables[0] + itms = [hasht.get_list(k) for k in hasht.keys()] + for itm in itms: + assert itms.count(itm) == 1 + for el in itm: + assert el in self.els + for el in self.els: + res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] + # res is a tuple containing the vector and the distance + el_v, el_dist = res + assert el_v in self.els + assert el_dist == 0 + del lsh -def test_lshash_extra_val(): - lsh = LSHash(6, 8, 1) - for i in xrange(num_elements): - lsh.index(list(els[i]), el_names[i]) - hasht = lsh.hash_tables[0] - itms = [hasht.get_list(k) for k in hasht.keys()] - for itm in itms: - for el in itm: - assert el[0] in els - assert el[1] in el_names - for el in els: - # res is a list, so we need to select the first entry only - res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] - # vector an name are in the first element of the tuple res[0] - el_v, el_name = res[0] - # the distance is in the second element of the tuple - el_dist = res[1] - assert el_v in els - assert el_name in el_names - assert el_dist == 0 - del lsh + def test_lshash_extra_val(self): + lsh = LSHash(6, 8, 1) + for i in range(self.num_elements): + lsh.index(list(self.els[i]), self.el_names[i]) + hasht = lsh.hash_tables[0] + itms = [hasht.get_list(k) for k in hasht.keys()] + for itm in itms: + for el in itm: + assert el[0] in self.els + assert el[1] in self.el_names + for el in self.els: + # res is a list, so we need to select the first entry only + res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] + # vector an name are in the first element of the tuple res[0] + el_v, el_name = res[0] + # the distance is in the second element of the tuple + el_dist = res[1] + assert el_v in self.els + assert el_name in self.el_names + assert el_dist == 0 + del lsh -def test_lshash_redis(): - """ - Test external lshash module - """ - config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}} - sr = StrictRedis(**config['redis']) - sr.flushdb() + def test_lshash_redis(self): + """ + Test external lshash module + """ + config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}} + sr = StrictRedis(**config['redis']) + sr.flushdb() - lsh = LSHash(6, 8, 1, config) - for i in xrange(num_elements): - lsh.index(list(els[i])) - lsh.index(list(els[i])) # multiple insertions should be prevented by the library - hasht = lsh.hash_tables[0] - itms = [hasht.get_list(k) for k in hasht.keys()] - for itm in itms: - for el in itm: - assert itms.count(itm) == 1 # have multiple insertions been prevented? - assert el in els - for el in els: - res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] - el_v, el_dist = res - assert el_v in els - assert el_dist == 0 - del lsh - sr.flushdb() + lsh = LSHash(6, 8, 1, config) + for i in range(self.num_elements): + lsh.index(list(self.els[i])) + lsh.index(list(self.els[i])) # multiple insertions should be prevented by the library -def test_lshash_redis_extra_val(): - """ - Test external lshash module - """ - config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}} - sr = StrictRedis(**config['redis']) - sr.flushdb() + hasht = lsh.hash_tables[0] + itms = [hasht.get_list(k) for k in hasht.keys()] - lsh = LSHash(6, 8, 1, config) - for i in xrange(num_elements): - lsh.index(list(els[i]), el_names[i]) - lsh.index(list(els[i]), el_names[i]) # multiple insertions - hasht = lsh.hash_tables[0] - itms = [hasht.get_list(k) for k in hasht.keys()] - for itm in itms: - assert itms.count(itm) == 1 - for el in itm: - assert el[0] in els - assert el[1] in el_names - for el in els: - res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] - # vector an name are in the first element of the tuple res[0] - el_v, el_name = res[0] - # the distance is in the second element of the tuple - el_dist = res[1] - assert el_v in els - assert el_name in el_names - assert el_dist == 0 - del lsh - sr.flushdb() + for itm in itms: + for el in itm: + assert itms.count(itm) == 1 # have multiple insertions been prevented? + assert el in self.els + + for el in self.els: + res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] + el_v, el_dist = res + assert el_v in self.els + assert el_dist == 0 + del lsh + sr.flushdb() + + def test_lshash_redis_extra_val(self): + """ + Test external lshash module + """ + config = {"redis": {"host": 'localhost', "port": 6379, "db": 15}} + sr = StrictRedis(**config['redis']) + sr.flushdb() + + lsh = LSHash(6, 8, 1, config) + for i in range(self.num_elements): + lsh.index(list(self.els[i]), self.el_names[i]) + lsh.index(list(self.els[i]), self.el_names[i]) # multiple insertions + hasht = lsh.hash_tables[0] + itms = [hasht.get_list(k) for k in hasht.keys()] + for itm in itms: + assert itms.count(itm) == 1 + for el in itm: + assert el[0] in self.els + assert el[1] in self.el_names + for el in self.els: + res = lsh.query(list(el), num_results=1, distance_func='euclidean')[0] + # vector an name are in the first element of the tuple res[0] + el_v, el_name = res[0] + # the distance is in the second element of the tuple + el_dist = res[1] + assert el_v in self.els + assert el_name in self.el_names + assert el_dist == 0 + del lsh + sr.flushdb()