From 1cafed32e87d95847d8d100f06124e7f312a7b9b Mon Sep 17 00:00:00 2001
From: Steve Schmerler
Date: Mon, 18 Feb 2019 01:04:27 +0100
Subject: [PATCH] ENH: Add optional PCA before clustering in main

Add `pca` and `pca_params` arguments to main(). With pca=True, the
fingerprints are projected with sklearn's PCA(**pca_params) before they
are passed to the clustering step. Enable it in the example and add
scikit-learn to requirements.txt.

---
 examples/example_main.py |  2 +-
 imagecluster/main.py     | 20 +++++++++++++++++++-
 requirements.txt         |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/examples/example_main.py b/examples/example_main.py
index 7a98755..7b0783c 100644
--- a/examples/example_main.py
+++ b/examples/example_main.py
@@ -1,3 +1,3 @@
 from imagecluster import main
 
-main.main('pics/', sim=0.65, vis=True, maxelem=30)
+main.main('pics/', sim=0.65, vis=True, maxelem=10, pca=True)
diff --git a/imagecluster/main.py b/imagecluster/main.py
index 3a7a72e..271fadb 100644
--- a/imagecluster/main.py
+++ b/imagecluster/main.py
@@ -1,4 +1,8 @@
 import os
+from collections import OrderedDict
+
+import numpy as np
+from sklearn.decomposition import PCA
 
 from imagecluster import calc as ic
 from imagecluster import common as co
@@ -11,7 +15,7 @@
 
 
 def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
-         maxelem=None):
+         maxelem=None, pca=False, pca_params=None):
     """Example main app using this library.
 
     Upon first invocation, the image and fingerprint databases are built and
@@ -40,6 +44,10 @@ def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
         plot images in clusters
     maxelem : max number of images per cluster for visualization (see
         :mod:`~postproc`)
+    pca : bool
+        Perform PCA on fingerprints before clustering, using `pca_params`.
+    pca_params : dict, optional
+        kwargs to sklearn's PCA. Default (None): ``dict(n_components=0.9)``.
 
     Notes
     -----
@@ -67,6 +75,16 @@ def main(imagedir, sim=0.5, layer='fc2', size=(224,224), links=True, vis=False,
     else:
         print(f"loading fingerprints database {fps_fn} ...")
         fps = co.read_pk(fps_fn)
+    if pca:
+        # Default is built per call, not shared via the signature.
+        if pca_params is None:
+            pca_params = dict(n_components=0.9)
+        # Recent CPython dicts are ordered; OrderedDict makes it explicit.
+        _fps = OrderedDict(fps)
+        X = np.array(list(_fps.values()))
+        Xp = PCA(**pca_params).fit_transform(X)
+        fps = dict(zip(_fps.keys(), Xp))
+        print("pca dims:", Xp.shape[1])
     print("clustering ...")
     clusters = ic.cluster(fps, sim)
     if links:
diff --git a/requirements.txt b/requirements.txt
index 0c2b6d4..613413b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ numpy
 tensorflow
 keras
 Pillow
+scikit-learn