Skip to content
This repository has been archived by the owner on Dec 13, 2024. It is now read-only.

Commit

Permalink
API: make cluster_stats() return a numpy array
Browse files Browse the repository at this point in the history
This is much more convenient than a dict.
  • Loading branch information
elcorto committed Feb 19, 2019
1 parent 43c8ec5 commit 9553929
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 25 deletions.
4 changes: 0 additions & 4 deletions TODO
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
* use logging instead of print

* Split imagecluster.make_links() into grouping clusters of same size (e.g.
group_clusters() and link creation (make_links()). Add test for
group_clusters().

* When calculating fingerprints: If fingerprints.pk is present, check for new
image files and calculate for those only.

Expand Down
28 changes: 19 additions & 9 deletions imagecluster/calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def cluster(fps, sim=0.5, method='average', metric='euclidean',
else:
clusters[csize].append(cluster)
if print_stats:
print_cluster_stats(clusters=clusters)
print_cluster_stats(clusters)
if extra_out:
extra = {'Z': Z, 'dfps': dfps, 'cluster_dct': cluster_dct, 'cut': cut}
return clusters, extra
Expand All @@ -230,19 +230,29 @@ def cluster(fps, sim=0.5, method='average', metric='euclidean',


def cluster_stats(clusters):
    """Count clusters of each size.

    Parameters
    ----------
    clusters : dict
        Mapping ``csize -> list of clusters``, where each cluster is a
        list of image file names.

    Returns
    -------
    ndarray
        Integer array of shape (N, 2), rows sorted by cluster size::

            [[csize, cnum],
             [...],
            ]

        column 0 = csize (number of images in the cluster),
        column 1 = cnum (number of clusters with that size).
        Shape is (0, 2) when `clusters` is empty, so callers can always
        index columns (e.g. ``stats[:,0]``) without a special case.
    """
    # sorted() iterates the dict's keys directly -- no need for
    # np.sort(list(clusters.keys())). reshape(-1, 2) preserves the
    # 2-column shape for empty input, where np.array([]) would be (0,).
    pairs = [[csize, len(lst)] for csize, lst in sorted(clusters.items())]
    return np.array(pairs, dtype=int).reshape(-1, 2)


def print_cluster_stats(clusters):
    """Print a per-size summary of `clusters` to stdout.

    One line per cluster size ("csize : cnum"), followed by the total
    number of images contained in all clusters.
    """
    print("#images : #clusters")
    stats = cluster_stats(clusters)
    for row in stats:
        print(f"{row[0]} : {row[1]}")
    # total images = sum over rows of csize * cnum; guard the empty case
    nimg = stats.prod(axis=1).sum() if stats.shape[0] > 0 else 0
    print("#images in clusters total: ", nimg)



18 changes: 6 additions & 12 deletions imagecluster/postproc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import shutil
from collections import OrderedDict

from matplotlib import pyplot as plt
import numpy as np
Expand All @@ -26,18 +25,13 @@ def plot_clusters(clusters, ias, max_csize=None, mem_limit=1024**3):
have (i) enough memory, (ii) many clusters and/or (iii) large
max(csize) and (iv) max_csize is large or None
"""
_stats = ic.cluster_stats(clusters)
csize_sorted = np.sort(list(_stats.keys()))
if max_csize is None:
max_csize = csize_sorted.max() + 1
# stats sorted by csize and truncated to max_csize
stats = {csize : _stats[csize] for csize in csize_sorted
if csize <= max_csize}
stats = OrderedDict(stats)
stats = ic.cluster_stats(clusters)
if max_csize is not None:
stats = stats[stats[:,0] <= max_csize, :]
# number of clusters
ncols = sum(list(stats.values()))
ncols = stats[:,1].sum()
# csize (number of images per cluster)
nrows = max(stats.keys())
nrows = stats[:,0].max()
shape = ias[list(ias.keys())[0]].shape[:2]
mem = nrows * shape[0] * ncols * shape[1] * 3
if mem > mem_limit:
Expand All @@ -47,7 +41,7 @@ def plot_clusters(clusters, ias, max_csize=None, mem_limit=1024**3):
# rather big arrays possible
arr = np.ones((nrows*shape[0], ncols*shape[1], 3), dtype=np.uint8) * 255
icol = -1
for csize in list(stats.keys()):
for csize in stats[:,0]:
for cluster in clusters[csize]:
icol += 1
for irow, filename in enumerate(cluster):
Expand Down

0 comments on commit 9553929

Please sign in to comment.