initial commit
elcorto committed Aug 1, 2016
0 parents commit 9d66810
Showing 12 changed files with 466 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
40 changes: 40 additions & 0 deletions README.rst
@@ -0,0 +1,40 @@
Package for comparing images by content.

image fingerprints: simple and fast
-----------------------------------
These methods basically squash the image down to something like 16x16 pixels,
transform it to gray scale and store that as a feature vector of length
16x16 (= 256), which makes them fast. However, the methods are not invariant
against rotation, only against scaling along x and/or y.
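
A minimal sketch of such a fingerprint (illustrative only, assuming PIL and
numpy; the names are made up, the package's own versions live in calc.py)::

    import numpy as np
    from PIL import Image

    def tiny_fingerprint(path, size=(16, 16)):
        # squash to 16x16 gray scale, threshold at the mean -> 0/1 feature vector
        pixels = np.array(Image.open(path).convert("L").resize(size), dtype=float)
        return (pixels > pixels.mean()).astype(int).flatten()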

The idea is always to calculate a database of image fingerprints ("hashes",
feature vectors) and then do searches in feature space (all fingerprints) using
some form of KD-tree / nearest neighbor search.
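
Continuing the sketch above, the search step could look like this (illustrative
only; the scripts in bin/ actually use hierarchical clustering on Hamming
distances instead of a KD-tree)::

    from scipy.spatial import cKDTree

    files = ["a.png", "b.png", "c.png"]                    # made-up file names
    fps = np.array([tiny_fingerprint(fn) for fn in files])
    # Euclidean distance on 0/1 vectors ranks neighbors like the Hamming distance
    dist, idx = cKDTree(fps).query(fps[0], k=2)
    print([files[i] for i in idx])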

* google: calculate image fingerprint
* [a|p|d]hash: https://realpython.com/blog/python/fingerprinting-images-for-near-duplicate-detection/
* especially: phash.org
* older Perl implementation of an ahash(?)-like method:
  http://www.jhnc.org/findimagedupes/manpage.html, also available as a Debian
  package

more scientific "feature extraction"
------------------------------------

* classical CV (computer vision): SIFT (good but slow, old-school
hand-engineered feature detector), SURF (faster version of
SIFT)
* http://opencv-python-tutroals.readthedocs.org/en/latest/index.html
* SIFT and SURF are patented, so fuck them and use ORB (see the sketch after this list)
http://opencv-python-tutroals.readthedocs.org/en/latest/py_tutorials/py_feature2d/py_orb/py_orb.html#orb
* opencv Bag Of Words: http://stackoverflow.com/questions/7205489/opencv-fingerprint-image-and-compare-against-database
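
A minimal ORB matching sketch (illustrative only, assuming opencv-python >= 3;
not part of this package)::

    import cv2

    img1 = cv2.imread("a.png", cv2.IMREAD_GRAYSCALE)
    img2 = cv2.imread("b.png", cv2.IMREAD_GRAYSCALE)
    orb = cv2.ORB_create()
    kp1, des1 = orb.detectAndCompute(img1, None)
    kp2, des2 = orb.detectAndCompute(img2, None)
    # ORB descriptors are binary -> match with Hamming distance
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    print(len(matcher.match(des1, des2)))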

Python image processing
-----------------------
* google: python image processing :)
* http://scikit-image.org/
* PIL vs. Pillow: http://docs.python-guide.org/en/latest/scenarios/imaging/
* http://www.scipy-lectures.org/advanced/image_processing

better methods
--------------
read about: Content-based image classification
1 change: 1 addition & 0 deletions __init__.py
@@ -0,0 +1 @@
__all__ = ['calc', 'misc', 'io']
87 changes: 87 additions & 0 deletions bin/00resize.py
@@ -0,0 +1,87 @@
#!/usr/bin/python3

# python3: Only python3 has (finally!) a decent multiprocessing module, which
# handles KeyboardInterrupt (CTRL-C) *at all* and does so without the need for
# any extra hassle and awkward try..except stunts. Thank you!
#
# verbose: use -v to know which image is being processed. With multiprocessing,
# the order is shuffled a bit, but you still get a rough estimate of how many
# images are left, assuming all images are roughly equal in size and thus all
# processes are approximately equally fast.
#
# multiprocessing: We use Python's multiprocessing instead of imagemagick's
# default OpenMP parallelization since the former is a little faster -- and
# well .. coding and benchmarking is fun!
#
# wall clock times, dual-core box
#
# multiprocessing, ncore = 1,2,4, max OpenMP threads = 1
#
# $ for x in 1 2 4; do time ./00resize.py -n $x 20 files/*; done
#
# real 1m15.663s # 1
# real 0m38.577s # 2 ***
# real 0m39.365s # 4
#
# multiprocessing, ncore = 1,2,4, max OpenMP threads = 2
#
# $ for x in 1 2 4; do time ./00resize.py -n $x 20 files/*; done
#
# real 0m46.304s # 1 ***
# real 0m38.766s # 2
# real 0m38.984s # 4
#
# The figures to compare are marked with ***, i.e. 1 process with 2 OpenMP
# threads vs. 2 processes with 1 thread each. With the latter, we are about a
# factor of 1.2 faster.
#
# Note: Even setting OMP_NUM_THREADS=1 (which is probably equivalent to
# "-limit thread 1" in the imagemagick case) does not remove OpenMP entirely,
# since there is still overhead from the creation of an OpenMP thread. The
# only way to get rid of OpenMP completely is to re-compile imagemagick with
# ./configure --disable-openmp .

import os, multiprocessing, subprocess, functools, argparse
from imgcmp import cli

def _worker(tup, percent=None, tgtdir=None, nfiles=None, verbose=False):
idx, _src = tup
src = os.path.abspath(_src)
# /home/foo -> _home_foo -> home_foo
tgt = os.path.join(tgtdir, src.replace('/','_')[1:])
cmd = "convert -limit thread 1 -resize {}% -auto-orient {} {}".format(
percent, src, tgt)
if verbose >= 1:
print("{} of {}".format(idx+1, nfiles))
if verbose >= 2:
print(cmd)
subprocess.call(cmd, shell=True)

if __name__ == '__main__':

desc = """
Resize images to PERCENT % with imagemagick's convert. Store them in dir TGTDIR
with their full name with / replaced by _, such that /path/to/file.png becomes
TGTDIR/path_to_file.png
"""
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('percent', metavar='PERCENT', type=float,
help='percent value for resizing')
parser.add_argument('files', metavar='FILE', nargs='+',
help='image file names')
parser.add_argument('-t', '--tgtdir',
default=cli.convert_dr,
help='store resized files here [%(default)s]')
parser.add_argument('-n', '--ncore',
default=multiprocessing.cpu_count(), type=int,
help='number of cores for parallel work [%(default)s]')
parser.add_argument('-v', '--verbose', default=0, action='count',
help='increase verbosity level, -vv prints convert commands')
args = parser.parse_args()
worker = functools.partial(_worker,
percent=args.percent,
tgtdir=args.tgtdir,
nfiles=len(args.files),
verbose=args.verbose)
pool = multiprocessing.Pool(args.ncore)
pool.map(worker, enumerate(args.files))

47 changes: 47 additions & 0 deletions bin/10fingerprints.py
@@ -0,0 +1,47 @@
#!/usr/bin/python

import sys, multiprocessing, functools, argparse
from PIL import Image
import numpy as np
from imgcmp import calc, io, cli

def _worker(tup, size_x=None, fpsdct=None):
print(tup)
ii, name = tup
img = Image.open(name)
fpsdct[ii] = calc.phash(img,
size=(size_x, size_x),
highfreq_factor=4,
backtransform=False).flatten()

if __name__ == '__main__':

desc = """
Calculate fingerprint database.
"""
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('files', metavar='FILE', nargs='+',
help='image file names')
parser.add_argument('-x', dest='size_x',
default=8, type=int,
help='resize images to (size_x, size_x), fingerprints '
'are then (size_x**2,) 1d arrays [%(default)s]')
parser.add_argument('-f', dest='dbfile',
default=cli.dbfile,
help='database HDF file [%(default)s]')
args = parser.parse_args()

# "parallel" dict for sharing between procs
manager = multiprocessing.Manager()
fpsdct = manager.dict()

pool = multiprocessing.Pool(multiprocessing.cpu_count())
worker = functools.partial(_worker,
size_x=args.size_x,
fpsdct=fpsdct)
pool.map(worker, enumerate(args.files))

# sort: order array to match file names in list `files`
    fps = np.array([fpsdct[ii] for ii in np.sort(list(fpsdct.keys()))])
io.write_h5(args.dbfile, dict(files=args.files, fps=fps))

59 changes: 59 additions & 0 deletions bin/20cluster.py
@@ -0,0 +1,59 @@
#!/usr/bin/python

import os, shutil, argparse
import numpy as np
from imgcmp import misc, io, calc, cli

pj = os.path.join

if __name__ == '__main__':

desc = """
Read fingerprints database, perform clustering.
"""
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('frac', type=float,
help='similarity fraction')
parser.add_argument('-f', dest='dbfile',
default=cli.dbfile,
help='database HDF file [%(default)s]')
args = parser.parse_args()

db = io.read_h5(args.dbfile)
files = db['/files']
fps = db['/fps']

# {1: [list_of_files], 2: [list_of_files], ...}
cluster_dct = calc.cluster(files, fps, args.frac, 'hamming')

# [[list_of_files], [list_of_files], ...]
clst_multi = [x for x in cluster_dct.values() if len(x) > 1]

# {number_of_files1: [[list_of_files], [list_of_files],...],
# number_of_files2: [[list_of_files],...],
# }
cdct_multi = {}
for x in clst_multi:
nn = len(x)
        if nn not in cdct_multi:
            cdct_multi[nn] = [x]
        else:
            cdct_multi[nn].append(x)

print("items per cluster : number of such clusters")
shutil.rmtree(cli.view_dr)
    for n_in_cluster in np.sort(list(cdct_multi.keys())):
cluster_list = cdct_multi[n_in_cluster]
print("{} : {}".format(n_in_cluster, len(cluster_list)))
for iclus, lst in enumerate(cluster_list):
dr = pj(cli.view_dr,
'cluster_with_{}'.format(n_in_cluster),
'cluster_{}'.format(iclus))
for fn in lst:
link = pj(dr, os.path.basename(fn))
misc.makedirs(os.path.dirname(link))
os.symlink(fn, link)

##key = raw_input("View? [N,y] ")
##if key.lower() == 'y':

108 changes: 108 additions & 0 deletions calc.py
@@ -0,0 +1,108 @@
# https://github.com/JohannesBuchner/imagehash
# http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html

# help for PIL.Image.Image.resize
# -------------------------------
#
# PIL.Image.Image.resize(self, size, resample=0)
#
# :param size: The requested size in pixels, as a 2-tuple:
# (width, height).
# :param resample: An optional resampling filter. This can be
# one of :py:attr:`PIL.Image.NEAREST` (use nearest neighbour),
# :py:attr:`PIL.Image.BILINEAR` (linear interpolation),
# :py:attr:`PIL.Image.BICUBIC` (cubic spline interpolation), or
# :py:attr:`PIL.Image.LANCZOS` (a high-quality downsampling filter).
# If omitted, or if the image has mode "1" or "P", it is
# set :py:attr:`PIL.Image.NEAREST`.
# :returns: An :py:class:`~PIL.Image.Image` object.
#
# Each PIL.Image.<method> variable is actually an integer (e.g. Image.NEAREST
# is 0).
#
# We tried the resample interpolation methods and measured the speed
# (ipython's timeit) for resizing an image
# 3840x2160 -> 8x8
#
# speed [ms]
# Image.NEAREST = 0 29.9e-3
# Image.LANCZOS = Image.ANTIALIAS = 1 123
# Image.BILINEAR = 2 47
# Image.BICUBIC = 3 87
#
# resample quality (see pil_resample_methods.py)
# method = 0, diff to ref(1) = 1.0
# method = 1, diff to ref(1) = 0.0
# method = 2, diff to ref(1) = 0.135679761399
# method = 3, diff to ref(1) = 0.0549413095836
#
# -> method=2 is probably best
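#
# An illustrative sketch of how such a comparison could be run (hypothetical,
# cf. the pil_resample_methods.py script mentioned above):
#
#   from PIL import Image
#   import numpy as np
#   img = Image.open("test.png")
#   ref = np.array(img.convert("L").resize((8, 8), resample=1), dtype=float)
#   for method in (0, 1, 2, 3):
#       arr = np.array(img.convert("L").resize((8, 8), resample=method),
#                      dtype=float)
#       print(method, np.abs(arr - ref).sum() / np.abs(ref).sum())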


import numpy as np
import scipy.fftpack as fftpack
from scipy.spatial import distance
from scipy.cluster import hierarchy

INT = np.int32
FLOAT = np.float64


def img2arr(img, size=(8,8), dtype=INT, resample=2):
"""
Convert PIL Image to gray scale and resample to numpy array of shape
``(size,size)`` and `dtype`.
Parameters
----------
img : PIL Image
resample : int
interpolation method, see help of ``PIL.Image.Image.resize``
"""
# convert('L'): to 1D grey scale array
return np.array(img.convert("L").resize(size, resample), dtype=dtype)


def ahash(img, size=(8,8)):
"""
Parameters
----------
img : PIL image
size : (int, int)
size of fingerprint array
"""
pixels = img2arr(img, size=size)
return (pixels > pixels.mean()).astype(INT)


def phash(img, size=(8,8), highfreq_factor=4, backtransform=False):
img_size = (size[0]*highfreq_factor,
size[1]*highfreq_factor)
pixels = img2arr(img, size=img_size, dtype=np.float64)
fpixels = fftpack.dct(fftpack.dct(pixels, axis=0), axis=1)
# XXX we had fpixels[1:size[0], 1:size[1]] before, find out why
fpixels_lowfreq = fpixels[:size[0], :size[1]]
if backtransform:
tmp = fftpack.idct(fftpack.idct(fpixels_lowfreq, axis=0), axis=1)
else:
tmp = fpixels_lowfreq
return (tmp > np.median(tmp)).astype(INT)


def dhash(img, size=(8,8)):
    # resize to (width+1, height) -> array of shape (size[1], size[0]+1);
    # compare neighboring columns (gradient hash) -> (size[1], size[0]) result
    pixels = img2arr(img, size=(size[0] + 1, size[1]))
    return (pixels[:, 1:] > pixels[:, :-1]).astype(INT)


def cluster(files, fps, frac=0.2, metric='hamming'):
"""
files : list of file names
fps :
"""
dfps = distance.pdist(fps.astype(bool), metric)
Z = hierarchy.linkage(dfps, method='average', metric=metric)
cut = hierarchy.fcluster(Z, t=dfps.max()*frac, criterion='distance')
clusters = dict((ii,[]) for ii in np.unique(cut))
for iimg,iclus in enumerate(cut):
clusters[iclus].append(files[iimg])
return clusters
11 changes: 11 additions & 0 deletions cli.py
@@ -0,0 +1,11 @@
import os
from imgcmp import misc
pj = os.path.join

base_dir = pj(os.environ['HOME'], '.imgcmp')
convert_dr = pj(base_dir, 'convert')
view_dr = pj(base_dir, 'view')
dbfile = './fingerprints.hdf'

for pp in base_dir, convert_dr, view_dr:
misc.makedirs(pp)
24 changes: 24 additions & 0 deletions io.py
@@ -0,0 +1,24 @@
import h5py

def write_h5(fn, dct, mode='w', **kwds):
fh = h5py.File(fn, mode=mode, **kwds)
    for key,val in dct.items():
_key = key if key.startswith('/') else '/'+key
fh[_key] = val
fh.close()


def read_h5(fn):
fh = h5py.File(fn, mode='r')
dct = {}
def get(key, obj, dct=dct):
if isinstance(obj, h5py.Dataset):
_key = key if key.startswith('/') else '/'+key
dct[_key] = obj.value
fh.visititems(get)
fh.close()
return dct

def read_db(dbfile):
db = read_h5(dbfile)
return db['/files'], db['/fps']