From 237e17d91b8d0afe5c4a0867af7475e137788c74 Mon Sep 17 00:00:00 2001 From: Steve Schmerler Date: Tue, 27 Dec 2016 02:35:54 +0100 Subject: [PATCH] ENH: use resized files by default in bin/10fingerprints.py --- README.rst | 25 ++++++++++++++++--------- bin/10fingerprints.py | 11 ++++++++--- bin/20cluster.py | 1 + 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index cb0cc53..87a09ef 100644 --- a/README.rst +++ b/README.rst @@ -28,23 +28,30 @@ This will resize all images to 20% (factor 0.2) of their original size. Resized files are written to ``~/.imgcmp/convert/`` by default. Now, calculate the fingerprint database:: - $ ./bin/10fingerprints.py ~/.imgcmp/convert/* + $ ./bin/10fingerprints.py -This creates a file ``~/.imgcmp/fingerprints.hdf`` (HDF5 file format). Last, -cluster images by using a similarity index (``0.3`` below). A small index means -to put only very similar images in one cluster. The extreme case 0.0 means to -allow zero dissimilarity and thus put each image in a cluster of size 1. In -contrast, large values imply less strict clustering and will put more but less -similar images in a cluster. A value of 1.0 is equal to putting all images in -one single cluster (e.g. all are treated as equal):: +This uses all files in ``~/.imgcmp/convert/*`` by default. Alternatively, +specify all images on the command line. It creates a file +``~/.imgcmp/fingerprints.hdf`` (HDF5 file format, fingerprint database). + +Last, we will cluster the images (actually their fingerprints) by using a +similarity index ( ``0..1``, ``0.3`` below). The index can be thought of as the +allowed *dissimilarity*. A small index means to put only very similar images in +one cluster. The extreme case 0.0 means to allow zero dissimilarity and thus +put each image in a cluster of size 1. In contrast, large values imply less +strict clustering and will put more but less similar images in a cluster. 
A +value of 1.0 is equal to putting all images in one single cluster (i.e. all +images are treated as equal):: $ ./bin/20cluster.py 0.3 + cluster dir: /home/user/.imgcmp/cluster items per cluster : number of such clusters 2 : 41 3 : 2 4 : 2 -Have a look at the clusters (as dirs with symlinks to the relevant files):: +By default, the database ``~/.imgcmp/fingerprints.hdf`` is used. Have a look at +the clusters (as dirs with symlinks to the relevant files):: $ ls ~/.imgcmp/cluster/ cluster_with_2 cluster_with_3 cluster_with_4 diff --git a/bin/10fingerprints.py b/bin/10fingerprints.py index 09ff525..66d738f 100755 --- a/bin/10fingerprints.py +++ b/bin/10fingerprints.py @@ -1,9 +1,11 @@ #!/usr/bin/python3 -import sys, multiprocessing, functools, argparse +import sys, multiprocessing, functools, argparse, os from PIL import Image from imgcmp import calc, io, cli, env import numpy as np +pj = os.path.join + def _worker(tup, size_x=None, fpsdct=None): ii, name = tup @@ -19,8 +21,11 @@ def _worker(tup, size_x=None, fpsdct=None): Calculate fingerprint database. """ parser = argparse.ArgumentParser(description=desc) - parser.add_argument('files', metavar='FILE', nargs='+', - help='image file names') + parser.add_argument('files', metavar='FILE', nargs='*', + default=[pj(cli.convert_dr, x) for x in \ + os.listdir(cli.convert_dr)], + help='image file names, [default: ' + '{}/*]'.format(cli.convert_dr)) parser.add_argument('-x', dest='size_x', default=8, type=int, help='resize images to (size_x, size_x), fingerprints ' diff --git a/bin/20cluster.py b/bin/20cluster.py index 6988636..a016780 100755 --- a/bin/20cluster.py +++ b/bin/20cluster.py @@ -48,6 +48,7 @@ else: cdct_multi[nn].append(x) + print("cluster dir: {}".format(cli.cluster_dr)) print("items per cluster : number of such clusters") shutil.rmtree(cli.cluster_dr) for n_in_cluster in np.sort(list(cdct_multi.keys())):