diff --git a/README.rst b/README.rst
index 194bb1f..8494930 100644
--- a/README.rst
+++ b/README.rst
@@ -20,6 +20,9 @@ or::

 Usage
 =====
+We use a pre-trained keras NN model. The weights will be downloaded *once* by
+keras automatically upon first use and placed into ``~/.keras/models/``.
+
 See ``imagecluster.main.main()`` for a usage example.

 If there is no fingerprints database, it will first run all images through the
@@ -28,50 +31,53 @@ the fingerprints and a similarity index (more details below).

 Example session::

-    >>> from imagecluster import main
-    >>> main.main('/path/to/testpics/', sim=0.5)
-    no fingerprints database /path/to/testpics/fingerprints.pk found
-    running all images thru NN model ...
-    /path/to/testpics/DSC_1061.JPG
-    /path/to/testpics/DSC_1080.JPG
-    ...
-    /path/to/testpics/DSC_1087.JPG
-    clustering ...
-    cluster dir: /path/to/testpics/clusters
-    items per cluster : number of such clusters
-    2 : 7
-    3 : 2
-    4 : 4
-    5 : 1
-    10 : 1
+    >>> from imagecluster import main
+    >>> main.main('/path/to/testpics/', sim=0.5)
+    no fingerprints database /path/to/testpics/fingerprints.pk found
+    running all images through NN model ...
+    /path/to/testpics/DSC_1061.JPG
+    /path/to/testpics/DSC_1080.JPG
+    ...
+    /path/to/testpics/DSC_1087.JPG
+    clustering ...
+    cluster dir: /path/to/testpics/clusters
+    items per cluster : number of such clusters
+    2 : 7
+    3 : 2
+    4 : 4
+    5 : 1
+    10 : 1

 Have a look at the clusters (as dirs with symlinks to the relevant files)::

-    $ tree /path/to/testpics
-    /path/to/testpics/clusters
-    ├── cluster_with_10
-    │   └── cluster_0
-    │       ├── DSC_1068.JPG -> /path/to/testpics/DSC_1068.JPG
-    │       ├── DSC_1070.JPG -> /path/to/testpics/DSC_1070.JPG
-    │       ├── DSC_1071.JPG -> /path/to/testpics/DSC_1071.JPG
-    │       ├── DSC_1072.JPG -> /path/to/testpics/DSC_1072.JPG
-    │       ├── DSC_1073.JPG -> /path/to/testpics/DSC_1073.JPG
-    │       ├── DSC_1074.JPG -> /path/to/testpics/DSC_1074.JPG
-    │       ├── DSC_1075.JPG -> /path/to/testpics/DSC_1075.JPG
-    │       ├── DSC_1076.JPG -> /path/to/testpics/DSC_1076.JPG
-    │       ├── DSC_1077.JPG -> /path/to/testpics/DSC_1077.JPG
-    │       └── DSC_1078.JPG -> /path/to/testpics/DSC_1078.JPG
-    ├── cluster_with_2
-    │   ├── cluster_0
-    │   │   ├── DSC_1037.JPG -> /path/to/testpics/DSC_1037.JPG
-    │   │   └── DSC_1038.JPG -> /path/to/testpics/DSC_1038.JPG
-    │   ├── cluster_1
-    │   │   ├── DSC_1053.JPG -> /path/to/testpics/DSC_1053.JPG
-    │   │   └── DSC_1054.JPG -> /path/to/testpics/DSC_1054.JPG
-    │   ├── cluster_2
-    │   │   ├── DSC_1046.JPG -> /path/to/testpics/DSC_1046.JPG
-    │   │   └── DSC_1047.JPG -> /path/to/testpics/DSC_1047.JPG
-    ...
+    $ tree /path/to/testpics
+    /path/to/testpics/clusters
+    ├── cluster_with_10
+    │   └── cluster_0
+    │       ├── DSC_1068.JPG -> /path/to/testpics/DSC_1068.JPG
+    │       ├── DSC_1070.JPG -> /path/to/testpics/DSC_1070.JPG
+    │       ├── DSC_1071.JPG -> /path/to/testpics/DSC_1071.JPG
+    │       ├── DSC_1072.JPG -> /path/to/testpics/DSC_1072.JPG
+    │       ├── DSC_1073.JPG -> /path/to/testpics/DSC_1073.JPG
+    │       ├── DSC_1074.JPG -> /path/to/testpics/DSC_1074.JPG
+    │       ├── DSC_1075.JPG -> /path/to/testpics/DSC_1075.JPG
+    │       ├── DSC_1076.JPG -> /path/to/testpics/DSC_1076.JPG
+    │       ├── DSC_1077.JPG -> /path/to/testpics/DSC_1077.JPG
+    │       └── DSC_1078.JPG -> /path/to/testpics/DSC_1078.JPG
+    ├── cluster_with_2
+    │   ├── cluster_0
+    │   │   ├── DSC_1037.JPG -> /path/to/testpics/DSC_1037.JPG
+    │   │   └── DSC_1038.JPG -> /path/to/testpics/DSC_1038.JPG
+    │   ├── cluster_1
+    │   │   ├── DSC_1053.JPG -> /path/to/testpics/DSC_1053.JPG
+    │   │   └── DSC_1054.JPG -> /path/to/testpics/DSC_1054.JPG
+    │   ├── cluster_2
+    │   │   ├── DSC_1046.JPG -> /path/to/testpics/DSC_1046.JPG
+    │   │   └── DSC_1047.JPG -> /path/to/testpics/DSC_1047.JPG
+    ...
+
+If you run this again on the same directory, only the clustering will be
+repeated.

 Methods
 =======
@@ -111,14 +117,14 @@ Now with NN-based fingerprints, we also cluster all sorts of images which have,
 e.g. mountains, tents, or beaches, so this is far better. However, if you run
 this on a large collection of images which contain images with tents or
 beaches, then the system won't recognize that certain images belong together
-because they were taken on the same trip. All tent images will be in one
-cluster, and so will all beaches images. This is probably b/c in this case, the
-classification of the image happens by looking at the background. A tent in the
-center of the image will always look the same, but it is the background which
-makes humans distinguish the context. The problem is: VGG16 and all the other
-popular networks have been trained on ridiculously small images of 224x224 size
-because of computational limitations, where it is impossible to recognize
-background details.
+because they were taken on the same trip, for instance. All tent images will be
+in one cluster, and so will all beach images. This is probably because in this
+case, the human classification of the image happens by looking at the
+background. A tent in the center of the image will always look the same, but it
+is the background which makes humans distinguish the context. The problem is:
+VGG16 and all the other popular networks have been trained on ridiculously
+small images of 224x224 size because of computational limitations, where it is
+impossible to recognize background details.

 Clustering
 ----------
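As a quick illustration of the method described above: the fingerprint of an
image is the activation of VGG16's second-to-last fully connected layer 'fc2'.
A minimal standalone sketch of that idea (the image path is a placeholder; it
mirrors get_model()/fingerprint() in the patch below, but uses keras' default
load_img() resizing for brevity)::

    import numpy as np
    from keras.applications.vgg16 import VGG16, preprocess_input
    from keras.preprocessing import image
    from keras.models import Model

    # VGG16 with the classification layer dropped: use the 4096-dim
    # 'fc2' activations as the image fingerprint
    base_model = VGG16(weights='imagenet', include_top=True)
    model = Model(inputs=base_model.input,
                  outputs=base_model.get_layer('fc2').output)

    # placeholder image path; any RGB image works
    img = image.load_img('/path/to/testpics/DSC_1061.JPG', target_size=(224, 224))
    arr = image.img_to_array(img)[None, ...]          # shape (1, 224, 224, 3)
    fp = model.predict(preprocess_input(arr))[0, :]   # fingerprint, shape (4096,)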
diff --git a/imagecluster/imagecluster.py b/imagecluster/imagecluster.py
index b5acc6e..dda818d 100644
--- a/imagecluster/imagecluster.py
+++ b/imagecluster/imagecluster.py
@@ -3,7 +3,7 @@
 import numpy as np
 from matplotlib import pyplot as plt

-import PIL.Image, os, multiprocessing, shutil
+import PIL.Image, os, multiprocessing, shutil, pickle
 from keras.applications.vgg16 import VGG16
 from keras.preprocessing import image
 from keras.applications.vgg16 import preprocess_input
@@ -12,6 +12,8 @@ pj = os.path.join


 def get_model():
+    """Keras Model of the VGG16 network, with the output layer set to the
+    second-to-last fully connected layer 'fc2' of shape (4096,)."""
     # base_model.summary():
     #     ....
     #     block5_conv4 (Conv2D)        (None, 15, 15, 512)       2359808
@@ -26,7 +28,6 @@ def get_model():
     #     _________________________________________________________________
     #     predictions (Dense)          (None, 1000)              4097000
     #
-    # model: get output from pre-last fully connected layer 'fc2'
     base_model = VGG16(weights='imagenet', include_top=True)
     model = Model(inputs=base_model.input,
                   outputs=base_model.get_layer('fc2').output)
@@ -34,6 +35,21 @@
 def fingerprint(fn, model, size):
+    """Load image from file `fn`, resize to `size` and run through `model`
+    (keras.models.Model).
+
+    Parameters
+    ----------
+    fn : str
+        filename
+    model : keras.models.Model instance
+    size : tuple
+        input image size (width, height), must match `model`, e.g. (224,224)
+
+    Returns
+    -------
+    fingerprint : 1d array
+    """
     # keras.preprocessing.image.load_img() uses img.rezize(shape) with the
     # default interpolation which is pretty bad (see
     # imagecluster/play/pil_resample_methods.py). Given that we are restricted
@@ -54,56 +70,45 @@ def fingerprint(fn, model, size):
     return model.predict(arr4d_pp)[0,:]


-def _worker(fn, model, size):
-    print(fn)
-    return fn, fingerprint(fn, model, size)
-
-
-def fingerprints(files, model, size=(224,224)):
-    # Cannot use multiprocessing:
-    # TypeError: can't pickle _thread.lock objects
-    # The error doesn't come from functools.partial since those objects are
-    # pickable since python3. The reason is the keras.model.Model, which is not
-    # pickable. However keras with tensorflow backend runs multithreaded
-    # (model.predict()), so we don't need that.
+# Cannot use multiprocessing:
+# TypeError: can't pickle _thread.lock objects
+# The error doesn't come from functools.partial since those objects are
+# picklable since python3. The reason is the keras.models.Model, which is not
+# picklable. However keras with tensorflow backend runs multi-threaded
+# (model.predict()), so we don't need that. I guess it will scale better if we
+# parallelize over images than to run a multi-threaded tensorflow on each
+# image, but OK. On low core counts (2-4), it won't matter.
+#
+##def _worker(fn, model, size):
+##    print(fn)
+##    return fn, fingerprint(fn, model, size)
+##
+##def fingerprints(files, model, size=(224,224)):
 ##    worker = functools.partial(_worker,
 ##                               model=model,
 ##                               size=size)
 ##    pool = multiprocessing.Pool(multiprocessing.cpu_count())
 ##    return dict(pool.map(worker, files))
-    return dict(_worker(fn, model, size) for fn in files)
-
-
-def make_links(clusters, cluster_dr):
-    # [[list_of_files], [list_of_files], ...]
-    clst_multi = [x for x in clusters.values() if len(x) > 1]
-
-    # {number_of_files1: [[list_of_files], [list_of_files],...],
-    #  number_of_files2: [[list_of_files],...],
-    # }
-    cdct_multi = {}
-    for x in clst_multi:
-        nn = len(x)
-        if not (nn in cdct_multi.keys()):
-            cdct_multi[nn] = [x]
-        else:
-            cdct_multi[nn].append(x)
-
-    print("cluster dir: {}".format(cluster_dr))
-    print("items per cluster : number of such clusters")
-    if os.path.exists(cluster_dr):
-        shutil.rmtree(cluster_dr)
-    for n_in_cluster in np.sort(list(cdct_multi.keys())):
-        cluster_list = cdct_multi[n_in_cluster]
-        print("{} : {}".format(n_in_cluster, len(cluster_list)))
-        for iclus, lst in enumerate(cluster_list):
-            dr = pj(cluster_dr,
-                    'cluster_with_{}'.format(n_in_cluster),
-                    'cluster_{}'.format(iclus))
-            for fn in lst:
-                link = pj(dr, os.path.basename(fn))
-                os.makedirs(os.path.dirname(link), exist_ok=True)
-                os.symlink(os.path.abspath(fn), link)
+def fingerprints(files, model, size=(224,224)):
+    """Calculate fingerprints for all `files`.
+
+    Parameters
+    ----------
+    files : sequence
+        image filenames
+    model, size : see :func:`fingerprint`
+
+    Returns
+    -------
+    fingerprint : dict
+        {filename1: array([...]),
+         filename2: array([...]),
+         ...
+        }
+    """
+    return dict((fn, fingerprint(fn, model, size)) for fn in files)


 def get_files(dr):
@@ -113,7 +118,9 @@ def cluster(files, fps, sim=0.5, method='average', metric='euclidean'):
     """Hierarchical clustering of images `files` based on image fingerprints
     `fps`.
-
+
+    Parameters
+    ----------
     files : list of file names
     sim : float 0..1
         similarity tolerance (1=max. allowed similarity tolerance, all images
@@ -126,6 +133,16 @@
         the same result
     metric : see scipy.hierarchy.linkage(), make sure to use 'euclidean' in
         case of method='centroid', 'median' or 'ward'
+
+    Returns
+    -------
+    clusters : dict
+        key = number of the cluster, value = list of filenames in the cluster
+        {1: [filename1, filename5],
+         2: [filename23],
+         3: [filename48, filename2, filename42, ...],
+         ...
+        }
     """
     dfps = distance.pdist(fps, metric)
     # hierarchical/agglomerative clustering (Z = linkage matrix, construct
@@ -139,8 +156,50 @@
     return clusters


+def make_links(clusters, cluster_dr):
+    # group all clusters (cluster = list_of_files) of equal size together
+    # {number_of_files1: [[list_of_files], [list_of_files],...],
+    #  number_of_files2: [[list_of_files],...],
+    # }
+    cdct_multi = {}
+    for x in (x for x in clusters.values() if len(x) > 1):
+        nn = len(x)
+        if not (nn in cdct_multi.keys()):
+            cdct_multi[nn] = [x]
+        else:
+            cdct_multi[nn].append(x)
+
+    print("cluster dir: {}".format(cluster_dr))
+    print("items per cluster : number of such clusters")
+    if os.path.exists(cluster_dr):
+        shutil.rmtree(cluster_dr)
+    for n_in_cluster in np.sort(list(cdct_multi.keys())):
+        cluster_list = cdct_multi[n_in_cluster]
+        print("{} : {}".format(n_in_cluster, len(cluster_list)))
+        for iclus, lst in enumerate(cluster_list):
+            dr = pj(cluster_dr,
+                    'cluster_with_{}'.format(n_in_cluster),
+                    'cluster_{}'.format(iclus))
+            for fn in lst:
+                link = pj(dr, os.path.basename(fn))
+                os.makedirs(os.path.dirname(link), exist_ok=True)
+                os.symlink(os.path.abspath(fn), link)
+
+
 def view_image_list(lst):
     for filename in lst:
         fig,ax = plt.subplots()
         ax.imshow(plt.imread(filename))
         plt.show()
+
+
+def read_pk(fn):
+    with open(fn, 'rb') as fd:
+        ret = pickle.load(fd)
+    return ret
+
+
+def write_pk(obj, fn):
+    with open(fn, 'wb') as fd:
+        pickle.dump(obj, fd)
+
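The `sim` argument of cluster() maps a similarity between 0 and 1 onto a
distance cutoff for scipy's flat clustering of the linkage matrix. The exact
cutoff formula is not visible in the hunks above, so the mapping below is only
an assumed illustration, using random stand-in fingerprints and made-up
filenames::

    import numpy as np
    from scipy.spatial import distance
    from scipy.cluster import hierarchy

    fps = np.random.rand(10, 4096)            # stand-in for real fingerprints
    files = ['img_{}.jpg'.format(i) for i in range(10)]   # made-up filenames

    dfps = distance.pdist(fps, 'euclidean')   # condensed pairwise distances
    Z = hierarchy.linkage(dfps, method='average')

    sim = 0.5
    cut = dfps.max() * (1.0 - sim)            # assumed similarity -> distance mapping
    labels = hierarchy.fcluster(Z, t=cut, criterion='distance')

    # group filenames by cluster label, like the dict cluster() returns
    clusters = {}
    for fn, lab in zip(files, labels):
        clusters.setdefault(int(lab), []).append(fn)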
diff --git a/imagecluster/main.py b/imagecluster/main.py
index f7bce90..11870aa 100644
--- a/imagecluster/main.py
+++ b/imagecluster/main.py
@@ -1,24 +1,30 @@
-import os, pickle
+import os
 import numpy as np
 from imagecluster import imagecluster as ic

 pj = os.path.join

+
 def main(imagedir, sim=0.5):
-    """Example main app using this library. """
+    """Example main app using this library.
+
+    Parameters
+    ----------
+    imagedir : str
+        path to directory with images
+    sim : float (0..1)
+        similarity index (see imagecluster.cluster())
+    """
     dbfn = pj(imagedir, 'fingerprints.pk')
     if not os.path.exists(dbfn):
         print("no fingerprints database {} found".format(dbfn))
         files = ic.get_files(imagedir)
         model = ic.get_model()
-        print("running all images thru NN model ...".format(dbfn))
+        print("running all images through NN model ...")
         fps = ic.fingerprints(files, model, size=(224,224))
-        with open(dbfn, 'wb') as fd:
-            pickle.dump(fps, fd)
-            fd.close()
+        ic.write_pk(fps, dbfn)
     else:
         print("loading fingerprints database {} ...".format(dbfn))
-        with open(dbfn, 'rb') as fd:
-            fps = pickle.load(fd)
+        fps = ic.read_pk(dbfn)
     print("clustering ...")
     clusters = ic.cluster(list(fps.keys()), np.array(list(fps.values())),
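With these changes, main.main() is a thin wrapper around the library functions
shown above. A hand-rolled session doing the same steps might look like this
sketch (the directory is a placeholder; sim=0.5 as in the README example)::

    import os
    import numpy as np
    from imagecluster import imagecluster as ic

    imagedir = '/path/to/testpics/'
    dbfn = os.path.join(imagedir, 'fingerprints.pk')

    files = ic.get_files(imagedir)      # collect image filenames
    model = ic.get_model()              # VGG16 truncated at the 'fc2' layer
    fps = ic.fingerprints(files, model, size=(224, 224))
    ic.write_pk(fps, dbfn)              # cache fingerprints for the next run

    # fps maps filename -> 1d fingerprint; cluster on the stacked array
    clusters = ic.cluster(list(fps.keys()), np.array(list(fps.values())),
                          sim=0.5)
    ic.make_links(clusters, os.path.join(imagedir, 'clusters'))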