Skip to content
This repository has been archived by the owner on Dec 13, 2024. It is now read-only.

Commit

Permalink
API: make cluster_stats() return a numpy array
Browse files Browse the repository at this point in the history
This is much more convenient than a dict.
  • Loading branch information
elcorto committed Feb 19, 2019
1 parent 43c8ec5 commit 9553929
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 25 deletions.
4 changes: 0 additions & 4 deletions TODO
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
* use logging instead of print

* Split imagecluster.make_links() into grouping clusters of same size (e.g.
group_clusters() and link creation (make_links()). Add test for
group_clusters().

* When calculating fingerprints: If fingerprints.pk is present, check for new
image files and calculate for those only.

Expand Down
28 changes: 19 additions & 9 deletions imagecluster/calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def cluster(fps, sim=0.5, method='average', metric='euclidean',
else:
clusters[csize].append(cluster)
if print_stats:
print_cluster_stats(clusters=clusters)
print_cluster_stats(clusters)
if extra_out:
extra = {'Z': Z, 'dfps': dfps, 'cluster_dct': cluster_dct, 'cut': cut}
return clusters, extra
Expand All @@ -230,19 +230,29 @@ def cluster(fps, sim=0.5, method='average', metric='euclidean',


def cluster_stats(clusters):
    """Count clusters of each size.

    Parameters
    ----------
    clusters : dict
        Mapping ``csize -> list of clusters``, where each cluster is a
        list of image file names.

    Returns
    -------
    ndarray
        Integer array of shape (N, 2), rows sorted by cluster size::

            [[csize, cnum],
             [...],
            ]

        column 0 = csize (number of images in the cluster),
        column 1 = cnum (number of clusters with that size).
        Shape is (0, 2) when `clusters` is empty, so callers can always
        index columns (e.g. ``stats[:,0]``) without a special case.
    """
    # sorted() iterates the dict's keys directly -- no need for
    # np.sort(list(clusters.keys())). reshape(-1, 2) preserves the
    # 2-column shape for empty input, where np.array([]) would be (0,).
    pairs = [[csize, len(lst)] for csize, lst in sorted(clusters.items())]
    return np.array(pairs, dtype=int).reshape(-1, 2)


def print_cluster_stats(clusters):
    """Print a per-size summary of `clusters` to stdout.

    One line per cluster size ("csize : cnum"), followed by the total
    number of images contained in all clusters.
    """
    print("#images : #clusters")
    stats = cluster_stats(clusters)
    for row in stats:
        print(f"{row[0]} : {row[1]}")
    # total images = sum over rows of csize * cnum; guard the empty case
    nimg = stats.prod(axis=1).sum() if stats.shape[0] > 0 else 0
    print("#images in clusters total: ", nimg)



18 changes: 6 additions & 12 deletions imagecluster/postproc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import shutil
from collections import OrderedDict

from matplotlib import pyplot as plt
import numpy as np
Expand All @@ -26,18 +25,13 @@ def plot_clusters(clusters, ias, max_csize=None, mem_limit=1024**3):
have (i) enough memory, (ii) many clusters and/or (iii) large
max(csize) and (iv) max_csize is large or None
"""
_stats = ic.cluster_stats(clusters)
csize_sorted = np.sort(list(_stats.keys()))
if max_csize is None:
max_csize = csize_sorted.max() + 1
# stats sorted by csize and truncated to max_csize
stats = {csize : _stats[csize] for csize in csize_sorted
if csize <= max_csize}
stats = OrderedDict(stats)
stats = ic.cluster_stats(clusters)
if max_csize is not None:
stats = stats[stats[:,0] <= max_csize, :]
# number of clusters
ncols = sum(list(stats.values()))
ncols = stats[:,1].sum()
# csize (number of images per cluster)
nrows = max(stats.keys())
nrows = stats[:,0].max()
shape = ias[list(ias.keys())[0]].shape[:2]
mem = nrows * shape[0] * ncols * shape[1] * 3
if mem > mem_limit:
Expand All @@ -47,7 +41,7 @@ def plot_clusters(clusters, ias, max_csize=None, mem_limit=1024**3):
# rather big arrays possible
arr = np.ones((nrows*shape[0], ncols*shape[1], 3), dtype=np.uint8) * 255
icol = -1
for csize in list(stats.keys()):
for csize in stats[:,0]:
for cluster in clusters[csize]:
icol += 1
for irow, filename in enumerate(cluster):
Expand Down

0 comments on commit 9553929

Please sign in to comment.