ENH: move main's PCA to calc.pca(), update docs and examples

elcorto · Feb 19, 2019 · 21091e6 · 21091e6
1 parent 9553929
commit 21091e6
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 12 deletions.
diff --git a/README.rst b/README.rst
@@ -25,6 +25,10 @@ The package is designed as a library. Here is what you can do:
     # Feed images through the model and extract fingerprints (feature vectors).
     fps = ic.fingerprints(ias, model)
 
+    # Optionally run a PCA on the fingerprints to compress the dimensions. Use a
+    # cumulative explained variance ratio of 0.95.
+    fps = ic.pca(fps, n_components=0.95)
+
     # Run clustering on the fingerprints.  Select clusters with similarity index
     # sim=0.5
     clusters = ic.cluster(fps, sim=0.5)
@@ -183,8 +187,15 @@ connected layers as features, but instead the output of the last pooling
 layer (layer 'flatten' in Keras' VGG16). We tested that briefly (see
 ``get_model(... layer='fc2')`` or ``main(..., layer='fc2')`` and found our
 default 'fc2' to perform well enough. 'fc1' performs almost the same, while
-'flatten' seems to do worse. But again, a quantitative analysis is in order. But
-who has the time!
+'flatten' seems to do worse. But again, a quantitative analysis is in order.
+
+PCA: Because of the `Curse of dimensionality <curse_>`_, it may be helpful to
+perform a PCA on the fingerprints before clustering to reduce the feature
+vector dimensions to, say, a few hundred, thus making the distance metrics used
+in clustering more effective. However, our tests so far show no substantial
+change in clustering results, in accordance with what `others have found
+<gh_beleidy_>`_. See ``examples/example_api.py`` and ``calc.pca()``.
+
 
 Tests
 =====
@@ -224,3 +235,5 @@ Related projects
 .. _hc: https://en.wikipedia.org/wiki/Hierarchical_clustering
 .. _dendro: https://en.wikipedia.org/wiki/Dendrogram
 .. _holiday: http://lear.inrialpes.fr/~jegou/data.php
+.. _curse: https://en.wikipedia.org/wiki/Curse_of_dimensionality
+.. _gh_beleidy: https://github.com/beleidy/unsupervised-image-clustering
diff --git a/examples/example_api.py b/examples/example_api.py
@@ -11,6 +11,10 @@
 # Feed images through the model and extract fingerprints (feature vectors).
 fps = ic.fingerprints(ias, model)
 
+# Optionally run a PCA on the fingerprints to compress the dimensions. Use a
+# cumulative explained variance ratio of 0.95.
+fps = ic.pca(fps, n_components=0.95)
+
 # Run clustering on the fingerprints.  Select clusters with similarity index
 # sim=0.5
 clusters = ic.cluster(fps, sim=0.5)

diff --git a/imagecluster/calc.py b/imagecluster/calc.py
@@ -2,11 +2,13 @@
 
 import multiprocessing as mp
 import functools
+from collections import OrderedDict
 
 import PIL.Image
 from scipy.spatial import distance
 from scipy.cluster import hierarchy
 import numpy as np
+from sklearn.decomposition import PCA
 
 from keras.applications.vgg16 import VGG16, preprocess_input
 from keras.preprocessing import image
@@ -155,6 +157,16 @@ def fingerprints(ias, model):
     return fps
 
 
+def pca(fps, n_components=0.9, **kwds):
+    """Run a PCA on image fingerprints and return the transformed fingerprints.
+
+    Parameters
+    ----------
+    fps : dict
+        Maps each key to its fingerprint (feature vector), e.g. the result of
+        :func:`fingerprints`.
+    n_components
+        Passed to ``sklearn.decomposition.PCA`` as ``n_components``. The
+        package examples use a float such as 0.95 to request a cumulative
+        explained variance ratio.
+    **kwds
+        Additional keyword arguments passed on to
+        ``sklearn.decomposition.PCA``.
+
+    Returns
+    -------
+    dict
+        Same keys as `fps`, in the same order; each value is the
+        PCA-transformed fingerprint of that key.
+    """
+    # NOTE(review): n_components is a named parameter, so it can never arrive
+    # via **kwds; this check is always true and simply forwards the value.
+    if 'n_components' not in kwds.keys():
+        kwds['n_components'] = n_components
+    # Plain dicts already keep insertion order in recent Pythons; OrderedDict
+    # only makes explicit that keys and values stay aligned below.
+    _fps = OrderedDict(fps)
+    # Stack the fingerprints into one array, one row per key (assumes all
+    # fingerprints have the same length -- TODO confirm).
+    X = np.array(list(_fps.values()))
+    # Fit the PCA on all fingerprints and project those same fingerprints.
+    Xp = PCA(**kwds).fit(X).transform(X)
+    # Re-attach each transformed row to its original key.
+    return {k:v for k,v in zip(_fps.keys(), Xp)}
+
+
 def cluster(fps, sim=0.5, method='average', metric='euclidean',
             extra_out=False, print_stats=True, min_csize=2):
     """Hierarchical clustering of images based on image fingerprints.

diff --git a/imagecluster/main.py b/imagecluster/main.py
@@ -1,8 +1,4 @@
 import os
-from collections import OrderedDict
-
-import numpy as np
-from sklearn.decomposition import PCA
 
 from imagecluster import calc as ic
 from imagecluster import common as co
@@ -76,12 +72,8 @@ def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
         print(f"loading fingerprints database {fps_fn} ...")
         fps = co.read_pk(fps_fn)
     if pca:
-        # Yes in recent Pythons, dicts are ordered in CPython, but still.
-        _fps = OrderedDict(fps)
-        X = np.array(list(_fps.values()))
-        Xp = PCA(**pca_params).fit(X).transform(X)
-        fps = {k:v for k,v in zip(_fps.keys(), Xp)}
-        print("pca dims:", Xp.shape[1])
+        fps = ic.pca(fps, **pca_params)
+        print("pca dims:", list(fps.values())[0].shape[0])
     print("clustering ...")
     clusters = ic.cluster(fps, sim)
     if links: