ocropus-gpageseg

#!/usr/bin/env python

# TODO:
# ! add option for padding
# - fix occasionally missing page numbers
# - treat large h-whitespace as separator
# - handle overlapping candidates
# - use cc distance statistics instead of character scale
# - page frame detection
# - read and use text image segmentation mask
# - pick up stragglers
# ? laplacian as well

from __future__ import print_function
from pylab import *
import argparse,glob,os,os.path
import traceback
from scipy.ndimage import measurements
from scipy.misc import imsave
from scipy.ndimage.filters import gaussian_filter,uniform_filter,maximum_filter
from multiprocessing import Pool
import ocrolib
from ocrolib import psegutils,morph,sl
from ocrolib.toplevel import *

parser = argparse.ArgumentParser()
# error checking
parser.add_argument('-n','--nocheck',action="store_true",
                    help="disable error checking on inputs")

parser.add_argument('-z','--zoom',type=float,default=0.5,
                    help='zoom for page background estimation, smaller=faster, default: %(default)s')

parser.add_argument('--gray',action='store_true',
                    help='output grayscale lines as well, default: %(default)s')
parser.add_argument('-q','--quiet',action='store_true',
                    help='be less verbose, default: %(default)s')

# limits
parser.add_argument('--minscale',type=float,default=12.0,
                    help='minimum scale permitted, default: %(default)s')
parser.add_argument('--maxlines',type=float,default=300,
                    help='maximum # lines permitted, default: %(default)s')

# scale parameters
parser.add_argument('--scale',type=float,default=0.0,
                    help='the basic scale of the document (roughly, xheight) 0=automatic, default: %(default)s')
parser.add_argument('--hscale',type=float,default=1.0,
                    help='non-standard scaling of horizontal parameters, default: %(default)s')
parser.add_argument('--vscale',type=float,default=1.0,
                    help='non-standard scaling of vertical parameters, default: %(default)s')

# line parameters
parser.add_argument('--threshold',type=float,default=0.2,
                    help='baseline threshold, default: %(default)s')
parser.add_argument('--noise',type=int,default=8,
                    help="noise threshold for removing small components from lines, default: %(default)s")
parser.add_argument('--usegauss',action='store_true',
                    help='use gaussian instead of uniform, default: %(default)s')

# column parameters
parser.add_argument('--maxseps',type=int,default=2,
                    help='maximum black column separators, default: %(default)s')
parser.add_argument('--sepwiden',type=int,default=10,
                    help='widen black separators (to account for warping), default: %(default)s')
parser.add_argument('-b','--blackseps',action="store_true",
                    help="also check for black column separators")

# whitespace column separators
parser.add_argument('--maxcolseps',type=int,default=2,
                    help='maximum # whitespace column separators, default: %(default)s')
parser.add_argument('--csminaspect',type=float,default=1.1,
                    help='minimum aspect ratio for column separators')
parser.add_argument('--csminheight',type=float,default=10,
                    help='minimum column height (units=scale), default: %(default)s')

# wait for input after everything is done

parser.add_argument('-p','--pad',type=int,default=3,
                    help='padding for extracted lines, default: %(default)s')
parser.add_argument('-e','--expand',type=int,default=3,
                    help='expand mask for grayscale extraction, default: %(default)s')
parser.add_argument('-Q','--parallel',type=int,default=0,
                    help="number of CPUs to use")
parser.add_argument('-d','--debug',action="store_true")
parser.add_argument('files',nargs='+')
args = parser.parse_args()
args.files = ocrolib.glob_all(args.files)

def norm_max(v):
    return v/amax(v)
def check_page(image):
    if len(image.shape)==3: return "input image is color image %s"%(image.shape,)
    if mean(image)<median(image): return "image may be inverted"
    h,w = image.shape
    if h<600: return "image not tall enough for a page image %s"%(image.shape,)
    if h>10000: return "image too tall for a page image %s"%(image.shape,)
    if w<600: return "image too narrow for a page image %s"%(image.shape,)
    if w>10000: return "line too wide for a page image %s"%(image.shape,)
    slots = int(w*h*1.0/(30*30))
    _,ncomps = measurements.label(image>mean(image))
    if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,)
    if ncomps>slots: return "too many connnected components for a page image (%d > %d)"%(ncomps,slots)
    return None


def print_info(*objs):
    print("INFO: ", *objs, file=sys.stdout)

def print_error(*objs):
    print("ERROR: ", *objs, file=sys.stderr)


if len(args.files)<1:
    parser.print_help()
    sys.exit(0)

print_info("")
print_info("#"*10,(" ".join(sys.argv))[:60])
print_info("")

if args.parallel>1:
    args.quiet = 1

def B(a):
    if a.dtype==dtype('B'): return a
    return array(a,'B')

def DSAVE(title,image):
    if not args.debug: return
    if type(image)==list:
        assert len(image)==3
        image = transpose(array(image),[1,2,0])
    fname = "_"+title+".png"
    print_info("debug " + fname)
    imsave(fname,image)


################################################################
### Column finding.
###
### This attempts to find column separators, either as extended
### vertical black lines or extended vertical whitespace.
### It will work fairly well in simple cases, but for unusual
### documents, you need to tune the parameters.
################################################################

def compute_separators_morph(binary,scale):
    """Finds vertical black lines corresponding to column separators."""
    d0 = int(max(5,scale/4))
    d1 = int(max(5,scale))+args.sepwiden
    thick = morph.r_dilation(binary,(d0,d1))
    vert = morph.rb_opening(thick,(10*scale,1))
    vert = morph.r_erosion(vert,(d0//2,args.sepwiden))
    vert = morph.select_regions(vert,sl.dim1,min=3,nbest=2*args.maxseps)
    vert = morph.select_regions(vert,sl.dim0,min=20*scale,nbest=args.maxseps)
    return vert

def compute_colseps_morph(binary,scale,maxseps=3,minheight=20,maxwidth=5):
    """Finds extended vertical whitespace corresponding to column separators
    using morphological operations."""
    boxmap = psegutils.compute_boxmap(binary,scale,dtype='B')
    bounds = morph.rb_closing(B(boxmap),(int(5*scale),int(5*scale)))
    bounds = maximum(B(1-bounds),B(boxmap))
    cols = 1-morph.rb_closing(boxmap,(int(20*scale),int(scale)))
    cols = morph.select_regions(cols,sl.aspect,min=args.csminaspect)
    cols = morph.select_regions(cols,sl.dim0,min=args.csminheight*scale,nbest=args.maxcolseps)
    cols = morph.r_erosion(cols,(int(0.5+scale),0))
    cols = morph.r_dilation(cols,(int(0.5+scale),0),origin=(int(scale/2)-1,0))
    return cols

def compute_colseps_mconv(binary,scale=1.0):
    """Find column separators using a combination of morphological
    operations and convolution."""
    h,w = binary.shape
    smoothed = gaussian_filter(1.0*binary,(scale,scale*0.5))
    smoothed = uniform_filter(smoothed,(5.0*scale,1))
    thresh = (smoothed<amax(smoothed)*0.1)
    DSAVE("1thresh",thresh)
    blocks = morph.rb_closing(binary,(int(4*scale),int(4*scale)))
    DSAVE("2blocks",blocks)
    seps = minimum(blocks,thresh)
    seps = morph.select_regions(seps,sl.dim0,min=args.csminheight*scale,nbest=args.maxcolseps)
    DSAVE("3seps",seps)
    blocks = morph.r_dilation(blocks,(5,5))
    DSAVE("4blocks",blocks)
    seps = maximum(seps,1-blocks)
    DSAVE("5combo",seps)
    return seps

def compute_colseps_conv(binary,scale=1.0):
    """Find column separators by convoluation and
    thresholding."""
    h,w = binary.shape
    # find vertical whitespace by thresholding
    smoothed = gaussian_filter(1.0*binary,(scale,scale*0.5))
    smoothed = uniform_filter(smoothed,(5.0*scale,1))
    thresh = (smoothed<amax(smoothed)*0.1)
    DSAVE("1thresh",thresh)
    # find column edges by filtering
    grad = gaussian_filter(1.0*binary,(scale,scale*0.5),order=(0,1))
    grad = uniform_filter(grad,(10.0*scale,1))
    # grad = abs(grad) # use this for finding both edges
    grad = (grad>0.5*amax(grad))
    DSAVE("2grad",grad)
    # combine edges and whitespace
    seps = minimum(thresh,maximum_filter(grad,(int(scale),int(5*scale))))
    seps = maximum_filter(seps,(int(2*scale),1))
    DSAVE("3seps",seps)
    # select only the biggest column separators
    seps = morph.select_regions(seps,sl.dim0,min=args.csminheight*scale,nbest=args.maxcolseps+1)
    DSAVE("4seps",seps)
    return seps

def compute_colseps(binary,scale):
    """Computes column separators either from vertical black lines or whitespace."""
    colseps = compute_colseps_conv(binary,scale)
    DSAVE("colwsseps",0.7*colseps+0.3*binary)
    if args.blackseps:
        seps = compute_separators_morph(binary,scale)
        DSAVE("colseps",0.7*seps+0.3*binary)
        #colseps = compute_colseps_morph(binary,scale)
        colseps = maximum(colseps,seps)
        binary = minimum(binary,1-seps)
    return colseps,binary


################################################################
### Text Line Finding.
###
### This identifies the tops and bottoms of text lines by
### computing gradients and performing some adaptive thresholding.
### Those components are then used as seeds for the text lines.
################################################################

def compute_gradmaps(binary,scale):
    # use gradient filtering to find baselines
    boxmap = psegutils.compute_boxmap(binary,scale)
    cleaned = boxmap*binary
    DSAVE("cleaned",cleaned)
    if args.usegauss:
        # this uses Gaussians
        grad = gaussian_filter(1.0*cleaned,(args.vscale*0.3*scale,
                                            args.hscale*6*scale),order=(1,0))
    else:
        # this uses non-Gaussian oriented filters
        grad = gaussian_filter(1.0*cleaned,(max(4,args.vscale*0.3*scale),
                                            args.hscale*scale),order=(1,0))
        grad = uniform_filter(grad,(args.vscale,args.hscale*6*scale))
    bottom = ocrolib.norm_max((grad<0)*(-grad))
    top = ocrolib.norm_max((grad>0)*grad)
    return bottom,top,boxmap

def compute_line_seeds(binary,bottom,top,colseps,scale):
    """Base on gradient maps, computes candidates for baselines
    and xheights.  Then, it marks the regions between the two
    as a line seed."""
    t = args.threshold
    vrange = int(args.vscale*scale)
    bmarked = maximum_filter(bottom==maximum_filter(bottom,(vrange,0)),(2,2))
    bmarked = bmarked*(bottom>t*amax(bottom)*t)*(1-colseps)
    tmarked = maximum_filter(top==maximum_filter(top,(vrange,0)),(2,2))
    tmarked = tmarked*(top>t*amax(top)*t/2)*(1-colseps)
    tmarked = maximum_filter(tmarked,(1,20))
    seeds = zeros(binary.shape,'i')
    delta = max(3,int(scale/2))
    for x in range(bmarked.shape[1]):
        transitions = sorted([(y,1) for y in find(bmarked[:,x])]+[(y,0) for y in find(tmarked[:,x])])[::-1]
        transitions += [(0,0)]
        for l in range(len(transitions)-1):
            y0,s0 = transitions[l]
            if s0==0: continue
            seeds[y0-delta:y0,x] = 1
            y1,s1 = transitions[l+1]
            if s1==0 and (y0-y1)<5*scale: seeds[y1:y0,x] = 1
    seeds = maximum_filter(seeds,(1,int(1+scale)))
    seeds = seeds*(1-colseps)
    DSAVE("lineseeds",[seeds,0.3*tmarked+0.7*bmarked,binary])
    seeds,_ = morph.label(seeds)
    return seeds


################################################################
### The complete line segmentation process.
################################################################

def remove_hlines(binary,scale,maxsize=10):
    labels,_ = morph.label(binary)
    objects = morph.find_objects(labels)
    for i,b in enumerate(objects):
        if sl.width(b)>maxsize*scale:
            labels[b][labels[b]==i+1] = 0
    return array(labels!=0,'B')

def compute_segmentation(binary,scale):
    """Given a binary image, compute a complete segmentation into
    lines, computing both columns and text lines."""
    binary = array(binary,'B')

    # start by removing horizontal black lines, which only
    # interfere with the rest of the page segmentation
    binary = remove_hlines(binary,scale)

    # do the column finding
    if not args.quiet: print_info("computing column separators")
    colseps,binary = compute_colseps(binary,scale)

    # now compute the text line seeds
    if not args.quiet: print_info("computing lines")
    bottom,top,boxmap = compute_gradmaps(binary,scale)
    seeds = compute_line_seeds(binary,bottom,top,colseps,scale)
    DSAVE("seeds",[bottom,top,boxmap])

    # spread the text line seeds to all the remaining
    # components
    if not args.quiet: print_info("propagating labels")
    llabels = morph.propagate_labels(boxmap,seeds,conflict=0)
    if not args.quiet: print_info("spreading labels")
    spread = morph.spread_labels(seeds,maxdist=scale)
    llabels = where(llabels>0,llabels,spread*binary)
    segmentation = llabels*binary
    return segmentation


################################################################
### Processing each file.
################################################################

def process1(job):
    fname,i = job
    global base
    base,_ = ocrolib.allsplitext(fname)
    outputdir = base

    try:
        binary = ocrolib.read_image_binary(base+".bin.png")
    except IOError:
        try:
            binary = ocrolib.read_image_binary(fname)
        except IOError:
            if ocrolib.trace: traceback.print_exc()
            print_error("cannot open either " + base+".bin.png" + " or " + fname)
            return

    checktype(binary,ABINARY2)

    if not args.nocheck:
        check = check_page(amax(binary)-binary)
        if check is not None:
            print_error(fname + " SKIPPED " + check + " (use -n to disable this check)")
            return

    if args.gray:
        if os.path.exists(base+".nrm.png"):
            gray = ocrolib.read_image_gray(base+".nrm.png")
        checktype(gray,GRAYSCALE)

    binary = 1-binary # invert

    if args.scale==0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = args.scale
    print_info("scale " + str(scale))
    if isnan(scale) or scale>1000.0:
        print_error("%s: bad scale (%g); skipping\n"%(fname,str(scale)))
        return
    if scale<args.minscale:
        print_error("%s: scale (%g) less than --minscale; skipping\n"%(fname,str(scale)))
        return

    # find columns and text lines

    if not args.quiet: print_info("computing segmentation")
    segmentation = compute_segmentation(binary,scale)
    if amax(segmentation)>args.maxlines:
        print_error(fname + ": too many lines" + str(amax(segmentation)))
        return
    if not args.quiet: print_info("number of lines"+ str(amax(segmentation)))

    # compute the reading order

    if not args.quiet: print_info("finding reading order")
    lines = psegutils.compute_lines(segmentation,scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs

    nlabels = amax(segmentation)+1
    renumber = zeros(nlabels,'i')
    for i,v in enumerate(lsort): renumber[lines[v].label] = 0x010000+(i+1)
    segmentation = renumber[segmentation]

    # finally, output everything

    if not args.quiet: print_info("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png"%outputdir,segmentation)
    cleaned = ocrolib.remove_noise(binary,args.noise)
    for i,l in enumerate(lines):
        binline = psegutils.extract_masked(1-cleaned,l,pad=args.pad,expand=args.expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png"%(outputdir,i+1),binline)
        if args.gray:
            grayline = psegutils.extract_masked(gray,l,pad=args.pad,expand=args.expand)
            ocrolib.write_image_gray("%s/01%04x.nrm.png"%(outputdir,i+1),grayline)
    print_info("%6d "%i + fname + " %4.1f "%  + scale + " " + str(len(lines)))

if len(args.files)==1 and os.path.isdir(args.files[0]):
    files = glob.glob(args.files[0]+"/????.png")
else:
    files = args.files

def safe_process1(job):
    fname,i = job
    try:
        process1(job)
    except ocrolib.OcropusException as e:
        if e.trace:
            traceback.print_exc()
        else:
            print_info(fname+":"+e)
    except Exception as e:
        traceback.print_exc()

if args.parallel<2:
    count = 0
    for i,f in enumerate(files):
        if args.parallel==0: print_info(f)
        count += 1
        safe_process1((f,i+1))
else:
    pool = Pool(processes=args.parallel)
    jobs = []
    for i,f in enumerate(files): jobs += [(f,i+1)]
    result = pool.map(process1,jobs)