From 21091e652505f600b58843bdfc941adbb8cd369a Mon Sep 17 00:00:00 2001
From: Steve Schmerler <git@elcorto.com>
Date: Tue, 19 Feb 2019 02:41:41 +0100
Subject: [PATCH] ENH: move main's PCA to calc.pca(), update docs and examples

---
 README.rst              | 17 +++++++++++++++--
 examples/example_api.py |  4 ++++
 imagecluster/calc.py    | 12 ++++++++++++
 imagecluster/main.py    | 12 ++----------
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/README.rst b/README.rst
index 6cbdb1b..936efda 100644
--- a/README.rst
+++ b/README.rst
@@ -25,6 +25,10 @@ The package is designed as a library. Here is what you can do:
     # Feed images through the model and extract fingerprints (feature vectors).
     fps = ic.fingerprints(ias, model)
 
+    # Optionally run a PCA on the fingerprints to compress the dimensions. Use a
+    # cumulative explained variance ratio of 0.95.
+    fps = ic.pca(fps, n_components=0.95)
+
     # Run clustering on the fingerprints.  Select clusters with similarity index
     # sim=0.5
     clusters = ic.cluster(fps, sim=0.5)
@@ -183,8 +187,15 @@ connected layers as features, but instead the output of the last pooling
 layer (layer 'flatten' in Keras' VGG16). We tested that briefly (see
 ``get_model(... layer='fc2')`` or ``main(..., layer='fc2')`` and found our
 default 'fc2' to perform well enough. 'fc1' performs almost the same, while
-'flatten' seems to do worse. But again, a quantitative analysis is in order. But
-who has the time!
+'flatten' seems to do worse. But again, a quantitative analysis is in order.
+
+PCA: Because of the `Curse of dimensionality <curse_>`_, it may be helpful to
+perform a PCA on the fingerprints before clustering to reduce the feature
+vector dimensions to, say, a few hundred, thus making the distance metrics used
+in clustering more effective. However, our tests so far show no substantial
+change in clustering results, in accordance with what `others have found
+<gh_beleidy_>`_. See ``examples/example_api.py`` and ``calc.pca()``.
+
 
 Tests
 =====
@@ -224,3 +235,5 @@ Related projects
 .. _hc: https://en.wikipedia.org/wiki/Hierarchical_clustering
 .. _dendro: https://en.wikipedia.org/wiki/Dendrogram
 .. _holiday: http://lear.inrialpes.fr/~jegou/data.php
+.. _curse: https://en.wikipedia.org/wiki/Curse_of_dimensionality
+.. _gh_beleidy: https://github.com/beleidy/unsupervised-image-clustering
diff --git a/examples/example_api.py b/examples/example_api.py
index c580a3b..babd9da 100644
--- a/examples/example_api.py
+++ b/examples/example_api.py
@@ -11,6 +11,10 @@
 # Feed images through the model and extract fingerprints (feature vectors).
 fps = ic.fingerprints(ias, model)
 
+# Optionally run a PCA on the fingerprints to compress the dimensions. Use a
+# cumulative explained variance ratio of 0.95.
+fps = ic.pca(fps, n_components=0.95)
+
 # Run clustering on the fingerprints.  Select clusters with similarity index
 # sim=0.5
 clusters = ic.cluster(fps, sim=0.5)
diff --git a/imagecluster/calc.py b/imagecluster/calc.py
index caba77a..7997fc6 100644
--- a/imagecluster/calc.py
+++ b/imagecluster/calc.py
@@ -2,11 +2,13 @@
 
 import multiprocessing as mp
 import functools
+from collections import OrderedDict
 
 import PIL.Image
 from scipy.spatial import distance
 from scipy.cluster import hierarchy
 import numpy as np
+from sklearn.decomposition import PCA
 
 from keras.applications.vgg16 import VGG16, preprocess_input
 from keras.preprocessing import image
@@ -155,6 +157,16 @@ def fingerprints(ias, model):
     return fps
 
 
+def pca(fps, n_components=0.9, **kwds):
+    if 'n_components' not in kwds.keys():
+        kwds['n_components'] = n_components
+    # dicts are ordered since Python 3.7, but be explicit about relying on it.
+    _fps = OrderedDict(fps)
+    X = np.array(list(_fps.values()))
+    Xp = PCA(**kwds).fit(X).transform(X)
+    return {k:v for k,v in zip(_fps.keys(), Xp)}
+
+
 def cluster(fps, sim=0.5, method='average', metric='euclidean',
             extra_out=False, print_stats=True, min_csize=2):
     """Hierarchical clustering of images based on image fingerprints.
diff --git a/imagecluster/main.py b/imagecluster/main.py
index 4a7fe67..8c5e9e9 100644
--- a/imagecluster/main.py
+++ b/imagecluster/main.py
@@ -1,8 +1,4 @@
 import os
-from collections import OrderedDict
-
-import numpy as np
-from sklearn.decomposition import PCA
 
 from imagecluster import calc as ic
 from imagecluster import common as co
@@ -76,12 +72,8 @@ def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
         print(f"loading fingerprints database {fps_fn} ...")
         fps = co.read_pk(fps_fn)
     if pca:
-        # Yes in recent Pythons, dicts are ordered in CPython, but still.
-        _fps = OrderedDict(fps)
-        X = np.array(list(_fps.values()))
-        Xp = PCA(**pca_params).fit(X).transform(X)
-        fps = {k:v for k,v in zip(_fps.keys(), Xp)}
-        print("pca dims:", Xp.shape[1])
+        fps = ic.pca(fps, **pca_params)
+        print("pca dims:", list(fps.values())[0].shape[0])
     print("clustering ...")
     clusters = ic.cluster(fps, sim)
     if links: