diff --git a/README.rst b/README.rst index 6cbdb1b..936efda 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,10 @@ The package is designed as a library. Here is what you can do: # Feed images through the model and extract fingerprints (feature vectors). fps = ic.fingerprints(ias, model) + # Optionally run a PCA on the fingerprints to compress the dimensions. Use a + # cumulative explained variance ratio of 0.95. + fps = ic.pca(fps, n_components=0.95) + # Run clustering on the fingerprints. Select clusters with similarity index # sim=0.5 clusters = ic.cluster(fps, sim=0.5) @@ -183,8 +187,15 @@ connected layers as features, but instead the output of the last pooling layer (layer 'flatten' in Keras' VGG16). We tested that briefly (see ``get_model(... layer='fc2')`` or ``main(..., layer='fc2')`` and found our default 'fc2' to perform well enough. 'fc1' performs almost the same, while -'flatten' seems to do worse. But again, a quantitative analysis is in order. But -who has the time! +'flatten' seems to do worse. But again, a quantitative analysis is in order. + +PCA: Because of the `Curse of dimensionality <curse_>`_, it may be helpful to +perform a PCA on the fingerprints before clustering to reduce the feature +vector dimensions to, say, a few hundred, thus making the distance metrics used in +clustering more effective. However, our tests so far show no substantial change +in clustering results, in accordance with what `others have found +<gh_beleidy_>`_. See ``examples/example_api.py`` and ``calc.pca()``. + Tests ===== @@ -224,3 +235,5 @@ Related projects .. _hc: https://en.wikipedia.org/wiki/Hierarchical_clustering .. _dendro: https://en.wikipedia.org/wiki/Dendrogram .. _holiday: http://lear.inrialpes.fr/~jegou/data.php +.. _curse: https://en.wikipedia.org/wiki/Curse_of_dimensionality +.. 
_gh_beleidy: https://github.com/beleidy/unsupervised-image-clustering diff --git a/examples/example_api.py b/examples/example_api.py index c580a3b..babd9da 100644 --- a/examples/example_api.py +++ b/examples/example_api.py @@ -11,6 +11,10 @@ # Feed images through the model and extract fingerprints (feature vectors). fps = ic.fingerprints(ias, model) +# Optionally run a PCA on the fingerprints to compress the dimensions. Use a +# cumulative explained variance ratio of 0.95. +fps = ic.pca(fps, n_components=0.95) + # Run clustering on the fingerprints. Select clusters with similarity index # sim=0.5 clusters = ic.cluster(fps, sim=0.5) diff --git a/imagecluster/calc.py b/imagecluster/calc.py index caba77a..7997fc6 100644 --- a/imagecluster/calc.py +++ b/imagecluster/calc.py @@ -2,11 +2,13 @@ import multiprocessing as mp import functools +from collections import OrderedDict import PIL.Image from scipy.spatial import distance from scipy.cluster import hierarchy import numpy as np +from sklearn.decomposition import PCA from keras.applications.vgg16 import VGG16, preprocess_input from keras.preprocessing import image @@ -155,6 +157,23 @@ def fingerprints(ias, model): return fps +def pca(fps, n_components=0.9, **kwds): + """Project fingerprints onto their principal components. + + ``fps`` maps keys to fingerprints (feature vectors). ``n_components`` and + ``**kwds`` are passed to ``sklearn.decomposition.PCA``; a float + ``n_components`` such as 0.9 is used as cumulative explained variance + ratio. Returns a dict with the same keys as ``fps``, in the same order, + and the PCA-transformed vectors as values. + """ + # Yes in recent Pythons, dicts are ordered in CPython, but still. + _fps = OrderedDict(fps) + X = np.array(list(_fps.values())) + # n_components is a named parameter, so it can never end up in kwds. + Xp = PCA(n_components=n_components, **kwds).fit(X).transform(X) + return dict(zip(_fps.keys(), Xp)) + + def cluster(fps, sim=0.5, method='average', metric='euclidean', extra_out=False, print_stats=True, min_csize=2): """Hierarchical clustering of images based on image fingerprints. 
diff --git a/imagecluster/main.py b/imagecluster/main.py index 4a7fe67..8c5e9e9 100644 --- a/imagecluster/main.py +++ b/imagecluster/main.py @@ -1,8 +1,4 @@ import os -from collections import OrderedDict - -import numpy as np -from sklearn.decomposition import PCA from imagecluster import calc as ic from imagecluster import common as co @@ -76,12 +72,8 @@ def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False, print(f"loading fingerprints database {fps_fn} ...") fps = co.read_pk(fps_fn) if pca: - # Yes in recent Pythons, dicts are ordered in CPython, but still. - _fps = OrderedDict(fps) - X = np.array(list(_fps.values())) - Xp = PCA(**pca_params).fit(X).transform(X) - fps = {k:v for k,v in zip(_fps.keys(), Xp)} - print("pca dims:", Xp.shape[1]) + fps = ic.pca(fps, **pca_params) + print("pca dims:", list(fps.values())[0].shape[0]) print("clustering ...") clusters = ic.cluster(fps, sim) if links: