From 5863c13c6ba6d69ef38a1269670bd33946c80062 Mon Sep 17 00:00:00 2001
From: Steve Schmerler <git@elcorto.com>
Date: Wed, 25 Oct 2017 23:23:08 +0200
Subject: [PATCH] BEH: invert `sim` parameter effect in cluster()

Now, sim = minimum required similarity (before: allowed dissimilarity). See the updated README file.
---
 README.rst                   | 16 +++++++++-------
 imagecluster/imagecluster.py | 10 ++++------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.rst b/README.rst
index 08d5dc2..e92f9bd 100644
--- a/README.rst
+++ b/README.rst
@@ -141,13 +141,15 @@ fingerprints (4096-dim vectors) are compared using a distance metric and
 similar images are put together in a cluster. The threshold for what counts as
 similar is defined by a similar index (again, see ``calc.cluster()``).
 
-The index can be thought of as the allowed *dissimilarity* or a similarity
-tolerance. A small index means to put only very similar images in one cluster.
-The extreme case 0.0 means to allow zero dissimilarity and thus to put each image
-in a cluster of size 1. In contrast, large values imply less strict clustering
-and will put more but less similar images in a cluster. A value of 1.0 is equal
-to putting all images in one single cluster (all images are treated as
-equal).
+The index (0...1) defines the minimum required similarity that images must have
+in order to be clustered together. A high index means that only very similar
+images are put in one cluster. The extreme case of similarity index 1 requires
+100% similarity and thus puts each image in a cluster of size 1 (unless there
+are completely identical images). In contrast, low values imply low required
+similarity. This results in less strict clustering, which will put more but less
+similar images in a cluster. A value of 0 (zero required similarity) is
+equivalent to putting all images in one single cluster, since all images are
+treated as equal.
 
 Tests
 =====
diff --git a/imagecluster/imagecluster.py b/imagecluster/imagecluster.py
index b606c8e..f94a79f 100644
--- a/imagecluster/imagecluster.py
+++ b/imagecluster/imagecluster.py
@@ -122,7 +122,7 @@ def fingerprints(files, model, size=(224,224)):
 
     Returns
     -------
-    fingerprint : dict
+    fingerprints : dict
         {filename1: array([...]),
          filename2: array([...]),
          ...
@@ -139,9 +139,7 @@ def cluster(fps, sim=0.5, method='average', metric='euclidean'):
     fps: dict
         output of :func:`fingerprints`
     sim : float 0..1
-        similarity tolerance (1=max. allowed similarity tolerance, all images
-        are considered similar and are in one cluster, 0=zero similarity
-        allowed, each image is it's own cluster of size 1)
+        similarity index = min. required similarity (1: only equal images, 0: one cluster)
     method : see scipy.hierarchy.linkage(), all except 'centroid' produce
         pretty much the same result
     metric : see scipy.hierarchy.linkage(), make sure to use 'euclidean' in
@@ -150,13 +148,13 @@ def cluster(fps, sim=0.5, method='average', metric='euclidean'):
     Returns
     -------
     clusters : nested list
-        key = number of the cluster, value = list of filenames in the cluster
         [[filename1, filename5],                    # cluster 1
          [filename23],                              # cluster 2
          [filename48, filename2, filename42, ...],  # cluster 3
          ...
          ]
     """
+    assert 0 <= sim <= 1, "sim not 0..1"
     # array(list(...)): 2d array
     #   [[... fingerprint of image1 (4096,) ...],
     #    [... fingerprint of image2 (4096,) ...],
@@ -168,7 +166,7 @@ def cluster(fps, sim=0.5, method='average', metric='euclidean'):
     # dendrogram)
     Z = hierarchy.linkage(dfps, method=method, metric=metric)
     # cut dendrogram, extract clusters
-    cut = hierarchy.fcluster(Z, t=dfps.max()*sim, criterion='distance')
+    cut = hierarchy.fcluster(Z, t=dfps.max()*(1.0-sim), criterion='distance')
     cluster_dct = dict((ii,[]) for ii in np.unique(cut))
     for iimg,iclus in enumerate(cut):
         cluster_dct[iclus].append(files[iimg])