diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..567b4c1
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,40 @@
+Package for comparing images by content.
+
+image fingerprints: simple and fast
+-----------------------------------
+These methods basically squash down the image to something like 16x16,
+transform to gray scale and store that as a feature vector of length 16x16, for
+example -> fast. But the method is not invariant against rotation, only scaling
+along x and/or y.
+
+The idea is always to calculate a database of image fingerprints ("hashes",
+feature vectors) and then do searches in feature space (all fingerprints) using
+some form of KD-tree / nearest neighbor search.
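+
+A minimal sketch of that idea (illustration only -- the real code lives in
+``calc.py`` and the scripts in ``bin/``; ``filenames`` is a made-up list of
+image files)::
+
+    import numpy as np
+    from PIL import Image
+    from scipy.spatial import cKDTree
+
+    def fingerprint(fn, size=(16, 16)):
+        # squash to 16x16 gray scale -> feature vector of length 256
+        return np.asarray(Image.open(fn).convert("L").resize(size),
+                          dtype=float).flatten()
+
+    fps = np.array([fingerprint(fn) for fn in filenames])
+    tree = cKDTree(fps)
+    # distances and indices of the 5 nearest images to the first one
+    dists, idx = tree.query(fps[0], k=5)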
+
+* google: calculate image fingerprint
+* [a|p|d]hash: https://realpython.com/blog/python/fingerprinting-images-for-near-duplicate-detection/
+* especially: phash.org
+* older Perl implementation of an ahash(?)-like method: http://www.jhnc.org/findimagedupes/manpage.html, also as Debian
+  package
+
+more scientific "feature extraction"
+------------------------------------
+
+* classical CV (computer vision): SIFT (good but slow, old-school
+  hand-engineered feature detector), SURF (faster version of
+  SIFT)
+* http://opencv-python-tutroals.readthedocs.org/en/latest/index.html
+  * SIFT and SURF are patented, so fuck them and use ORB
+    http://opencv-python-tutroals.readthedocs.org/en/latest/py_tutorials/py_feature2d/py_orb/py_orb.html#orb
+* opencv Bag Of Words: http://stackoverflow.com/questions/7205489/opencv-fingerprint-image-and-compare-against-database
+
+Python image processing
+-----------------------
+* google: python image processing :)
+* http://scikit-image.org/
+* PIL vs. Pillow: http://docs.python-guide.org/en/latest/scenarios/imaging/
+* http://www.scipy-lectures.org/advanced/image_processing
+
+better methods
+--------------
+read about: Content-based image classification
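+
+Example session with the scripts in ``bin/`` (illustration only; paths and the
+similarity fraction are just plausible values, see ``--help`` of each script)::
+
+    ./bin/00resize.py 20 /path/to/photos/*.jpg    # resized copies in ~/.imgcmp/convert
+    ./bin/10fingerprints.py ~/.imgcmp/convert/*   # writes ./fingerprints.hdf
+    ./bin/20cluster.py 0.2                        # symlinks similar images below ~/.imgcmp/view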
+""" + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('files', metavar='FILE', nargs='+', + help='image file names') + parser.add_argument('-x', dest='size_x', + default=8, type=int, + help='resize images to (size_x, size_x), fingerprints ' + 'are then (size_x**2,) 1d arrays [%(default)s]') + parser.add_argument('-f', dest='dbfile', + default=cli.dbfile, + help='database HDF file [%(default)s]') + args = parser.parse_args() + + # "parallel" dict for sharing between procs + manager = multiprocessing.Manager() + fpsdct = manager.dict() + + pool = multiprocessing.Pool(multiprocessing.cpu_count()) + worker = functools.partial(_worker, + size_x=args.size_x, + fpsdct=fpsdct) + pool.map(worker, enumerate(args.files)) + + # sort: order array to match file names in list `files` + fps = np.array([fpsdct[ii] for ii in np.sort(fpsdct.keys())]) + io.write_h5(args.dbfile, dict(files=args.files, fps=fps)) + diff --git a/bin/20cluster.py b/bin/20cluster.py new file mode 100755 index 0000000..da45a70 --- /dev/null +++ b/bin/20cluster.py @@ -0,0 +1,59 @@ +#!/usr/bin/python + +import os, shutil, argparse +import numpy as np +from imgcmp import misc, io, calc, cli + +pj = os.path.join + +if __name__ == '__main__': + + desc = """ +Read fingerprints database, perform clustering. +""" + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('frac', type=float, + help='similarity fraction') + parser.add_argument('-f', dest='dbfile', + default=cli.dbfile, + help='database HDF file [%(default)s]') + args = parser.parse_args() + + db = io.read_h5(args.dbfile) + files = db['/files'] + fps = db['/fps'] + + # {1: [list_of_files], 2: [list_of_files], ...} + cluster_dct = calc.cluster(files, fps, args.frac, 'hamming') + + # [[list_of_files], [list_of_files], ...] + clst_multi = [x for x in cluster_dct.values() if len(x) > 1] + + # {number_of_files1: [[list_of_files], [list_of_files],...], + # number_of_files2: [[list_of_files],...], + # } + cdct_multi = {} + for x in clst_multi: + nn = len(x) + if not cdct_multi.has_key(nn): + cdct_multi[nn] = [x] + else: + cdct_multi[nn].append(x) + + print("items per cluster : number of such clusters") + shutil.rmtree(cli.view_dr) + for n_in_cluster in np.sort(cdct_multi.keys()): + cluster_list = cdct_multi[n_in_cluster] + print("{} : {}".format(n_in_cluster, len(cluster_list))) + for iclus, lst in enumerate(cluster_list): + dr = pj(cli.view_dr, + 'cluster_with_{}'.format(n_in_cluster), + 'cluster_{}'.format(iclus)) + for fn in lst: + link = pj(dr, os.path.basename(fn)) + misc.makedirs(os.path.dirname(link)) + os.symlink(fn, link) + + ##key = raw_input("View? [N,y] ") + ##if key.lower() == 'y': + diff --git a/calc.py b/calc.py new file mode 100644 index 0000000..9a9f9b2 --- /dev/null +++ b/calc.py @@ -0,0 +1,108 @@ +# https://github.com/JohannesBuchner/imagehash +# http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html + +# help for PIL.Image.Image.resize +# ------------------------------- +# +# PIL.Image.Image.resize(self, size, resample=0) +# +# :param size: The requested size in pixels, as a 2-tuple: +# (width, height). +# :param resample: An optional resampling filter. This can be +# one of :py:attr:`PIL.Image.NEAREST` (use nearest neighbour), +# :py:attr:`PIL.Image.BILINEAR` (linear interpolation), +# :py:attr:`PIL.Image.BICUBIC` (cubic spline interpolation), or +# :py:attr:`PIL.Image.LANCZOS` (a high-quality downsampling filter). 
diff --git a/bin/20cluster.py b/bin/20cluster.py
new file mode 100755
index 0000000..da45a70
--- /dev/null
+++ b/bin/20cluster.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+import os, shutil, argparse
+import numpy as np
+from imgcmp import misc, io, calc, cli
+
+pj = os.path.join
+
+if __name__ == '__main__':
+
+    desc = """
+Read fingerprints database, perform clustering.
+"""
+    parser = argparse.ArgumentParser(description=desc)
+    parser.add_argument('frac', type=float,
+                        help='similarity fraction')
+    parser.add_argument('-f', dest='dbfile',
+                        default=cli.dbfile,
+                        help='database HDF file [%(default)s]')
+    args = parser.parse_args()
+
+    db = io.read_h5(args.dbfile)
+    files = db['/files']
+    fps = db['/fps']
+
+    # {1: [list_of_files], 2: [list_of_files], ...}
+    cluster_dct = calc.cluster(files, fps, args.frac, 'hamming')
+
+    # [[list_of_files], [list_of_files], ...]
+    clst_multi = [x for x in cluster_dct.values() if len(x) > 1]
+
+    # {number_of_files1: [[list_of_files], [list_of_files],...],
+    #  number_of_files2: [[list_of_files],...],
+    # }
+    cdct_multi = {}
+    for x in clst_multi:
+        nn = len(x)
+        if not cdct_multi.has_key(nn):
+            cdct_multi[nn] = [x]
+        else:
+            cdct_multi[nn].append(x)
+
+    print("items per cluster : number of such clusters")
+    shutil.rmtree(cli.view_dr)
+    for n_in_cluster in np.sort(cdct_multi.keys()):
+        cluster_list = cdct_multi[n_in_cluster]
+        print("{} : {}".format(n_in_cluster, len(cluster_list)))
+        for iclus, lst in enumerate(cluster_list):
+            dr = pj(cli.view_dr,
+                    'cluster_with_{}'.format(n_in_cluster),
+                    'cluster_{}'.format(iclus))
+            for fn in lst:
+                link = pj(dr, os.path.basename(fn))
+                misc.makedirs(os.path.dirname(link))
+                os.symlink(fn, link)
+
+    ##key = raw_input("View? [N,y] ")
+    ##if key.lower() == 'y':
+
diff --git a/calc.py b/calc.py
new file mode 100644
index 0000000..9a9f9b2
--- /dev/null
+++ b/calc.py
@@ -0,0 +1,108 @@
+# https://github.com/JohannesBuchner/imagehash
+# http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html
+
+# help for PIL.Image.Image.resize
+# -------------------------------
+#
+# PIL.Image.Image.resize(self, size, resample=0)
+#
+# :param size: The requested size in pixels, as a 2-tuple:
+#    (width, height).
+# :param resample: An optional resampling filter. This can be
+#    one of :py:attr:`PIL.Image.NEAREST` (use nearest neighbour),
+#    :py:attr:`PIL.Image.BILINEAR` (linear interpolation),
+#    :py:attr:`PIL.Image.BICUBIC` (cubic spline interpolation), or
+#    :py:attr:`PIL.Image.LANCZOS` (a high-quality downsampling filter).
+#    If omitted, or if the image has mode "1" or "P", it is
+#    set :py:attr:`PIL.Image.NEAREST`.
+# :returns: An :py:class:`~PIL.Image.Image` object.
+#
+# Each of these PIL.Image attributes is actually an integer (e.g. Image.NEAREST
+# is 0).
+#
+# We tried the resample interpolation methods and measured the speed (ipython's
+# timeit) for resizing an image
+#   3840x2160 -> 8x8
+#
+#                                        speed [ms]
+# Image.NEAREST = 0                      29.9e-3
+# Image.LANCZOS = Image.ANTIALIAS = 1    123
+# Image.BILINEAR = 2                     47
+# Image.BICUBIC = 3                      87
+#
+# resample quality (see pil_resample_methods.py)
+#   method = 0, diff to ref(1) = 1.0
+#   method = 1, diff to ref(1) = 0.0
+#   method = 2, diff to ref(1) = 0.135679761399
+#   method = 3, diff to ref(1) = 0.0549413095836
+#
+# -> method=2 is probably best
+
+
+import numpy as np
+import scipy.fftpack as fftpack
+from scipy.spatial import distance
+from scipy.cluster import hierarchy
+
+INT = np.int32
+FLOAT = np.float64
+
+
+def img2arr(img, size=(8,8), dtype=INT, resample=2):
+    """
+    Convert PIL Image to gray scale and resample to a numpy array of shape
+    ``size`` and `dtype`.
+
+    Parameters
+    ----------
+    img : PIL Image
+    resample : int
+        interpolation method, see help of ``PIL.Image.Image.resize``
+    """
+    # convert('L'): convert to grey scale
+    return np.array(img.convert("L").resize(size, resample), dtype=dtype)
+
+
+def ahash(img, size=(8,8)):
+    """
+    Parameters
+    ----------
+    img : PIL image
+    size : (int, int)
+        size of fingerprint array
+    """
+    pixels = img2arr(img, size=size)
+    return (pixels > pixels.mean()).astype(INT)
+
+
+def phash(img, size=(8,8), highfreq_factor=4, backtransform=False):
+    img_size = (size[0]*highfreq_factor,
+                size[1]*highfreq_factor)
+    pixels = img2arr(img, size=img_size, dtype=np.float64)
+    fpixels = fftpack.dct(fftpack.dct(pixels, axis=0), axis=1)
+    # XXX we had fpixels[1:size[0], 1:size[1]] before, find out why
+    fpixels_lowfreq = fpixels[:size[0], :size[1]]
+    if backtransform:
+        tmp = fftpack.idct(fftpack.idct(fpixels_lowfreq, axis=0), axis=1)
+    else:
+        tmp = fpixels_lowfreq
+    return (tmp > np.median(tmp)).astype(INT)
+
+
+def dhash(img, size=(8,8)):
+    pixels = img2arr(img, size=(size[0] + 1, size[1]))
+    return (pixels[1:, :] > pixels[:-1, :]).astype(INT)
+
+
+def cluster(files, fps, frac=0.2, metric='hamming'):
+    """
+    files : list of file names
+    fps : 2d array of fingerprints, one row per file in `files`
+    frac : fraction of the maximal pairwise distance used as the distance
+        cutoff for cluster formation
+    metric : distance metric for scipy.spatial.distance.pdist
+    """
+    dfps = distance.pdist(fps.astype(bool), metric)
+    Z = hierarchy.linkage(dfps, method='average', metric=metric)
+    cut = hierarchy.fcluster(Z, t=dfps.max()*frac, criterion='distance')
+    clusters = dict((ii,[]) for ii in np.unique(cut))
+    for iimg,iclus in enumerate(cut):
+        clusters[iclus].append(files[iimg])
+    return clusters
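+
+## Usage sketch (illustration only, file names made up): fingerprint two images
+## and compare them by Hamming distance, i.e. the number of differing bits.
+##
+##   from PIL import Image
+##   fp1 = phash(Image.open('a.jpg')).flatten()
+##   fp2 = phash(Image.open('b.jpg')).flatten()
+##   nbits_different = np.count_nonzero(fp1 != fp2)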
diff --git a/cli.py b/cli.py
new file mode 100644
index 0000000..ee22094
--- /dev/null
+++ b/cli.py
@@ -0,0 +1,11 @@
+import os
+from imgcmp import misc
+pj = os.path.join
+
+base_dir = pj(os.environ['HOME'], '.imgcmp')
+convert_dr = pj(base_dir, 'convert')
+view_dr = pj(base_dir, 'view')
+dbfile = './fingerprints.hdf'
+
+for pp in base_dir, convert_dr, view_dr:
+    misc.makedirs(pp)
diff --git a/io.py b/io.py
new file mode 100644
index 0000000..bc066b5
--- /dev/null
+++ b/io.py
@@ -0,0 +1,24 @@
+import h5py
+
+def write_h5(fn, dct, mode='w', **kwds):
+    fh = h5py.File(fn, mode=mode, **kwds)
+    for key,val in dct.iteritems():
+        _key = key if key.startswith('/') else '/'+key
+        fh[_key] = val
+    fh.close()
+
+
+def read_h5(fn):
+    fh = h5py.File(fn, mode='r')
+    dct = {}
+    def get(key, obj, dct=dct):
+        if isinstance(obj, h5py.Dataset):
+            _key = key if key.startswith('/') else '/'+key
+            dct[_key] = obj.value
+    fh.visititems(get)
+    fh.close()
+    return dct
+
+def read_db(dbfile):
+    db = read_h5(dbfile)
+    return db['/files'], db['/fps']
diff --git a/misc.py b/misc.py
new file mode 100644
index 0000000..3ff9d21
--- /dev/null
+++ b/misc.py
@@ -0,0 +1,32 @@
+import itertools, math, os
+
+def makedirs(path):
+    """Same as os.makedirs() but silently skips existing dirs."""
+    pp = './' if path.strip() == '' else path
+    if not os.path.exists(pp):
+        os.makedirs(pp)
+
+
+# http://stackoverflow.com/a/25649669
+def idx_square2cond(ii, jj, nn):
+    return ii*nn + jj - ii*(ii+1)/2 - ii - 1
+
+# http://stackoverflow.com/a/14839010
+def idx_cond2square(kk, nn):
+    b = 1 - 2*nn
+    x = int(math.floor((-b - math.sqrt(b**2 - 8*kk))/2))
+    y = kk + x*(b + x + 2)/2 + 1
+    return (x,y)
+
+
+def view_lst(lst):
+    from pwtools.common import system
+    cmd = 'qiv -fm ' + ' '.join(lst)
+    system(cmd, wait=True)
+
+##def view_collection(
+
+##def hamming(arr1, arr2):
+##    a1 = arr1.flatten()
+##    a2 = arr2.flatten()
+##    return np.count_nonzero(a1 != a2)
diff --git a/play/pil_resample_methods.py b/play/pil_resample_methods.py
new file mode 100644
index 0000000..1c97583
--- /dev/null
+++ b/play/pil_resample_methods.py
@@ -0,0 +1,34 @@
+import PIL
+import scipy.misc
+from matplotlib import cm, use
+use('Agg')
+from matplotlib import pyplot as plt
+import numpy as np
+
+# face() returns a (x,y,3) RGB array
+im = PIL.Image.fromarray(scipy.misc.face())
+
+fig = plt.figure()
+ax = fig.add_subplot(111)
+nn = 4
+img1d = []
+for method in range(4):
+    ##arr = np.array(im.convert('L').resize((nn,nn),method).getdata(), dtype=float).reshape(nn,nn)
+    # much faster
+    arr = np.array(im.convert('L').resize((nn,nn), method), dtype=float)
+    ax.matshow(arr, cmap=cm.gray)
+    fig.savefig('method_{}.png'.format(method))
+    ax.cla()
+    img1d.append(arr.flatten())
+
+refidx = 1
+ref = img1d[refidx]
+diffs = []
+for m,arr1d in enumerate(img1d):
+    diffs.append(np.sqrt(((ref - arr1d)**2.0).sum()))
+
+diffs = np.array(diffs)
+diffs = diffs / diffs.max()
+for m,d in enumerate(diffs):
+    msg = "method = {m}, diff to ref({refidx}) = {d}"
+    print(msg.format(**dict(m=m, refidx=refidx,d=d)))
diff --git a/test/test_idx.py b/test/test_idx.py
new file mode 100644
index 0000000..6ce45eb
--- /dev/null
+++ b/test/test_idx.py
@@ -0,0 +1,22 @@
+import numpy as np
+from scipy.spatial import distance
+import itertools
+from imgcmp import misc
+
+def test_pdist():
+    # play with pdist, which returns the upper triangle of the full squareform
+    # distance matrix, test our indexing routines (thank you
+    # http://stackoverflow.com)
+    nvec = 100
+    dim = 10
+    triu_list = zip(*np.triu_indices(nvec, k=1))
+    vecs = np.random.rand(nvec, dim)
+    dist1d = distance.pdist(vecs)
+    assert len(triu_list) == len(dist1d)
+    dist2d = distance.squareform(dist1d)
+    for ii,jj in itertools.product(range(nvec), repeat=2):
+        if ii < jj:
+            idx_1d = misc.idx_square2cond(ii, jj, nvec)
+            assert dist1d[idx_1d] == dist2d[ii,jj]
+            assert misc.idx_cond2square(idx_1d, nvec) == (ii,jj)
+            assert triu_list[idx_1d] == (ii,jj)
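+
+# Worked example (illustration): for nvec = 4 vectors, pdist stores the
+# distances in the order (0,1), (0,2), (0,3), (1,2), (1,3), (2,3), so
+#   misc.idx_square2cond(1, 3, 4) == 4  and  misc.idx_cond2square(4, 4) == (1, 3)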