initial commit

elcorto · Aug 1, 2016 · 9d66810 · 9d66810
commit 9d66810
Show file tree

Hide file tree

Showing 12 changed files with 466 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README.rst b/README.rst
@@ -0,0 +1,40 @@
+Package for comaring images by content.
+
+image fingerprints: simple and fast
+-----------------------------------
+These methods basically squash down the image to something like 16x16,
+transform to gray scale and store that as a feature vector of length 16x16, for
+example -> fast. But the method s not invariant against rotation, only scaling
+along x and/or y. 
+
+The idea is always to calculate a database of image fingerprints ("hashes",
+feature vectors) and then do searches in feature space (all fingerprints) using
+some form of KD-tree / nearest neighbor search.
+
+* google: calculate image fingerprint
+* [a|p|d]hash: https://realpython.com/blog/python/fingerprinting-images-for-near-duplicate-detection/ 
+* espcially: phash.org
+* older Perl implemention of a ahash(?)-like method: http://www.jhnc.org/findimagedupes/manpage.html, also as Debian
+  package
+
+more scientific "feature extraction"
+------------------------------------
+
+* classical CV (computer vision): SIFT (good but slow, old-school
+  hand-engineered feature detector), SURF (faster version of
+  SIFT)
+* http://opencv-python-tutroals.readthedocs.org/en/latest/index.html
+    * SIFT and SURF are patented, so fuck them and use ORB
+      http://opencv-python-tutroals.readthedocs.org/en/latest/py_tutorials/py_feature2d/py_orb/py_orb.html#orb
+* opencv Bag Of Words: http://stackoverflow.com/questions/7205489/opencv-fingerprint-image-and-compare-against-database
+
+Python image processing
+-----------------------
+* google: python image processing :)
+* http://scikit-image.org/
+* PIL vs. Pillow: http://docs.python-guide.org/en/latest/scenarios/imaging/
+* http://www.scipy-lectures.org/advanced/image_processing
+
+better methods
+--------------
+read about: Content-based image classification
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1 @@
+__all__ = ['calc', 'misc', 'io']
diff --git a/bin/00resize.py b/bin/00resize.py
@@ -0,0 +1,87 @@
+#!/usr/bin/python3
+
+# python3: Only python3 has (finally!) a decent multiprocessing module, which
+# handles KeyboardInterrupt (CTRL-C) *at all* and does this without need for
+# any extra hassle and awkward try..except stunts. Thank you!
+#
+# verbose: use -v know which image is processed. With multiprocessing, the order
+# is messed up a bit but you will still get a rough estimate of how many images
+# are left, assuming all images are equal in size and thus all processes are
+# approximately equally fast.
+# 
+# multiprocessing: We use Python's multiprocessing instead of the default
+# imagemagick OpenMP parallelization since the former is a little faster -- and
+# well .. coding and benchmarking is fun!.
+# 
+# wall clock times, dual-core box
+# 
+#     multiprocessing, ncore = 1,2,4, max OpenMP threads = 1
+#
+#     $ for x in 1 2 4; do time ./00resize.py -n $x 20 files/*; done
+#     
+#     real    1m15.663s     # 1
+#     real    0m38.577s     # 2  ***
+#     real    0m39.365s     # 4
+#     
+#     multiprocessing, ncore = 1,2,4, max OpenMP threads = 2
+#     
+#     $ for x in 1 2 4; do time ./00resize.py -n $x 20 files/*; done
+#     
+#     real    0m46.304s     # 1  ***
+#     real    0m38.766s     # 2
+#     real    0m38.984s     # 4
+# 
+# The figures to compare are marked with ***, i.e. 2 threads vs. 2 processes
+# with 1 thread / core. With the latter, we are about factor 1.2 faster.
+#
+# Note: We know that even setting OMP_NUM_THREADS=1 (which is probably
+# equivalent to "-limit threads 1" in the imagemagick case) is not a good idea
+# since there is still overhead caused by the creation of an OpenMP thread. The
+# only way to get rid of OpenMP completely is to re-compile imagemagick with
+# ./configure --disable-openmp .
+
+import os, multiprocessing, subprocess, functools, argparse
+from imgcmp import cli
+
+def _worker(tup, percent=None, tgtdir=None, nfiles=None, verbose=False):
+    idx, _src = tup
+    src = os.path.abspath(_src)
+    # /home/foo -> _home_foo -> home_foo
+    tgt = os.path.join(tgtdir, src.replace('/','_')[1:])
+    cmd = "convert -limit thread 1 -resize {}% -auto-orient {} {}".format(
+            percent, src, tgt)
+    if verbose >= 1:   
+        print("{} of {}".format(idx+1, nfiles))
+    if verbose >= 2:
+        print(cmd)
+    subprocess.call(cmd, shell=True)
+
+if __name__ == '__main__':
+
+    desc = """
+Resize images to PERCENT % with imagemagick's convert. Store them in dir TGTDIR
+with their full name with / replaced by _, such that /path/to/file.png becomes
+TGTDIR/path_to_file.png
+"""
+    parser = argparse.ArgumentParser(description=desc) 
+    parser.add_argument('percent', metavar='PERCENT', type=float,
+                        help='percent value for resizing')
+    parser.add_argument('files', metavar='FILE', nargs='+',
+                        help='image file names')
+    parser.add_argument('-t', '--tgtdir',
+                        default=cli.convert_dr,
+                        help='store resized files here [%(default)s]')
+    parser.add_argument('-n', '--ncore',
+                        default=multiprocessing.cpu_count(), type=int,
+                        help='number of cores for parallel work [%(default)s]')
+    parser.add_argument('-v', '--verbose', default=0, action='count',
+                        help='increase verbosity level, -vv prints convert commands')
+    args = parser.parse_args()
+    worker = functools.partial(_worker, 
+                               percent=args.percent,
+                               tgtdir=args.tgtdir,
+                               nfiles=len(args.files),
+                               verbose=args.verbose)
+    pool = multiprocessing.Pool(args.ncore)
+    pool.map(worker, enumerate(args.files))
+
diff --git a/bin/10fingerprints.py b/bin/10fingerprints.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import sys, multiprocessing, functools, argparse
+from PIL import Image
+import numpy as np
+from imgcmp import calc, io, cli
+
+def _worker(tup, size_x=None, fpsdct=None):
+    print(tup)
+    ii, name = tup     
+    img = Image.open(name)
+    fpsdct[ii] = calc.phash(img, 
+                            size=(size_x, size_x), 
+                            highfreq_factor=4, 
+                            backtransform=False).flatten()
+
+if __name__ == '__main__':
+
+    desc = """
+Calculate fingerprint database.    
+"""
+    parser = argparse.ArgumentParser(description=desc) 
+    parser.add_argument('files', metavar='FILE', nargs='+',
+                        help='image file names')
+    parser.add_argument('-x', dest='size_x',
+                        default=8, type=int,
+                        help='resize images to (size_x, size_x), fingerprints '
+                             'are then (size_x**2,) 1d arrays [%(default)s]')
+    parser.add_argument('-f', dest='dbfile',
+                        default=cli.dbfile,
+                        help='database HDF file [%(default)s]')
+    args = parser.parse_args()
+
+    # "parallel" dict for sharing between procs
+    manager = multiprocessing.Manager()
+    fpsdct = manager.dict() 
+
+    pool = multiprocessing.Pool(multiprocessing.cpu_count())
+    worker = functools.partial(_worker, 
+                               size_x=args.size_x,
+                               fpsdct=fpsdct)
+    pool.map(worker, enumerate(args.files))
+
+    # sort: order array to match file names in list `files`
+    fps = np.array([fpsdct[ii] for ii in np.sort(fpsdct.keys())])
+    io.write_h5(args.dbfile, dict(files=args.files, fps=fps))    
+
diff --git a/bin/20cluster.py b/bin/20cluster.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+import os, shutil, argparse
+import numpy as np
+from imgcmp import misc, io, calc, cli
+
+pj = os.path.join
+
+if __name__ == '__main__':
+
+    desc = """
+Read fingerprints database, perform clustering.
+"""
+    parser = argparse.ArgumentParser(description=desc) 
+    parser.add_argument('frac', type=float,
+                        help='similarity fraction')
+    parser.add_argument('-f', dest='dbfile',
+                        default=cli.dbfile,
+                        help='database HDF file [%(default)s]')
+    args = parser.parse_args()
+
+    db = io.read_h5(args.dbfile)
+    files = db['/files'] 
+    fps = db['/fps']
+
+    # {1: [list_of_files], 2: [list_of_files], ...}
+    cluster_dct = calc.cluster(files, fps, args.frac, 'hamming')
+
+    # [[list_of_files], [list_of_files], ...]
+    clst_multi = [x for x in cluster_dct.values() if len(x) > 1]
+
+    # {number_of_files1: [[list_of_files], [list_of_files],...],
+    #  number_of_files2: [[list_of_files],...],
+    # }
+    cdct_multi = {}
+    for x in clst_multi:
+        nn = len(x)
+        if not cdct_multi.has_key(nn):
+            cdct_multi[nn] = [x]
+        else:    
+            cdct_multi[nn].append(x)
+
+    print("items per cluster : number of such clusters")
+    shutil.rmtree(cli.view_dr)
+    for n_in_cluster in np.sort(cdct_multi.keys()):
+        cluster_list = cdct_multi[n_in_cluster]
+        print("{} : {}".format(n_in_cluster, len(cluster_list)))
+        for iclus, lst in enumerate(cluster_list):
+            dr = pj(cli.view_dr, 
+                    'cluster_with_{}'.format(n_in_cluster),
+                    'cluster_{}'.format(iclus))
+            for fn in lst:
+                link = pj(dr, os.path.basename(fn))
+                misc.makedirs(os.path.dirname(link))
+                os.symlink(fn, link)
+
+    ##key = raw_input("View? [N,y] ")
+    ##if key.lower() == 'y':
+
diff --git a/calc.py b/calc.py
@@ -0,0 +1,108 @@
+# https://github.com/JohannesBuchner/imagehash
+# http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html
+
+# help for PIL.Image.Image.resize
+# -------------------------------
+# 
+# PIL.Image.Image.resize(self, size, resample=0)
+#
+# :param size: The requested size in pixels, as a 2-tuple:
+#    (width, height).
+# :param resample: An optional resampling filter.  This can be
+#    one of :py:attr:`PIL.Image.NEAREST` (use nearest neighbour),
+#    :py:attr:`PIL.Image.BILINEAR` (linear interpolation),
+#    :py:attr:`PIL.Image.BICUBIC` (cubic spline interpolation), or
+#    :py:attr:`PIL.Image.LANCZOS` (a high-quality downsampling filter).
+#    If omitted, or if the image has mode "1" or "P", it is
+#    set :py:attr:`PIL.Image.NEAREST`.
+# :returns: An :py:class:`~PIL.Image.Image` object.
+# 
+# Each PIL.Image.<method> variable is actually an integer (e.g. Image.NEAREST
+# is 0).
+#
+# We tried the resample interpolation methods and measured the speed measured
+# (ipython's timeit) for resizing an image
+# 3840x2160 -> 8x8
+#
+#                                      speed [ms]     
+# Image.NEAREST                   = 0  29.9e-3
+# Image.LANCZOS = Image.ANTIALIAS = 1  123
+# Image.BILINEAR                  = 2  47
+# Image.BICUBIC                   = 3  87
+#
+# resample quality (see pil_resample_methods.py)
+# method = 0, diff to ref(1) = 1.0
+# method = 1, diff to ref(1) = 0.0
+# method = 2, diff to ref(1) = 0.135679761399
+# method = 3, diff to ref(1) = 0.0549413095836
+#
+# -> method=2 is probably best
+
+
+import numpy as np
+import scipy.fftpack as fftpack
+from scipy.spatial import distance
+from scipy.cluster import hierarchy
+
+INT = np.int32
+FLOAT = np.float64
+
+
+def img2arr(img, size=(8,8), dtype=INT, resample=2):
+    """
+    Convert PIL Image to gray scale and resample to numpy array of shape
+    ``(size,size)`` and `dtype`.
+
+    Parameters
+    ----------
+    img : PIL Image
+    resample : int
+        interpolation method, see help of ``PIL.Image.Image.resize``
+    """
+    # convert('L'): to 1D grey scale array
+    return np.array(img.convert("L").resize(size, resample), dtype=dtype)
+
+
+def ahash(img, size=(8,8)):
+    """
+    Parameters
+    ----------
+    img : PIL image
+    size : (int, int)
+        size of fingerprint array
+    """
+    pixels = img2arr(img, size=size)
+    return (pixels > pixels.mean()).astype(INT)
+
+
+def phash(img, size=(8,8), highfreq_factor=4, backtransform=False):
+    img_size = (size[0]*highfreq_factor, 
+                size[1]*highfreq_factor)
+    pixels = img2arr(img, size=img_size, dtype=np.float64)
+    fpixels = fftpack.dct(fftpack.dct(pixels, axis=0), axis=1)
+    # XXX we had fpixels[1:size[0], 1:size[1]] before, find out why
+    fpixels_lowfreq = fpixels[:size[0], :size[1]]
+    if backtransform:
+        tmp = fftpack.idct(fftpack.idct(fpixels_lowfreq, axis=0), axis=1)
+    else:
+        tmp = fpixels_lowfreq
+    return (tmp > np.median(tmp)).astype(INT)
+
+
+def dhash(img, size=(8,8)):
+    pixels = img2arr(img, size=(size[0] + 1, size[1]))
+    return (pixels[1:, :] > pixels[:-1, :]).astype(INT)
+
+
+def cluster(files, fps, frac=0.2, metric='hamming'):
+    """
+    files : list of file names
+    fps : 
+    """
+    dfps = distance.pdist(fps.astype(bool), metric)
+    Z = hierarchy.linkage(dfps, method='average', metric=metric)
+    cut = hierarchy.fcluster(Z, t=dfps.max()*frac, criterion='distance')
+    clusters = dict((ii,[]) for ii in np.unique(cut))
+    for iimg,iclus in enumerate(cut):
+        clusters[iclus].append(files[iimg])
+    return clusters 
diff --git a/cli.py b/cli.py
@@ -0,0 +1,11 @@
+import os
+from imgcmp import misc
+pj = os.path.join
+
+base_dir = pj(os.environ['HOME'], '.imgcmp')
+convert_dr = pj(base_dir, 'convert')
+view_dr = pj(base_dir, 'view')
+dbfile = './fingerprints.hdf'
+
+for pp in base_dir, convert_dr, view_dr:
+    misc.makedirs(pp)
diff --git a/io.py b/io.py
@@ -0,0 +1,24 @@
+import h5py
+
+def write_h5(fn, dct, mode='w', **kwds):
+    fh = h5py.File(fn, mode=mode, **kwds)
+    for key,val in dct.iteritems():
+        _key = key if key.startswith('/') else '/'+key
+        fh[_key] = val
+    fh.close()
+
+
+def read_h5(fn):
+    fh = h5py.File(fn, mode='r')
+    dct = {}
+    def get(key, obj, dct=dct):
+        if isinstance(obj, h5py.Dataset):
+            _key = key if key.startswith('/') else '/'+key
+            dct[_key] = obj.value
+    fh.visititems(get)
+    fh.close()
+    return dct
+
+def read_db(dbfile):
+    db = read_h5(dbfile)
+    return db['/files'], db['/fps']