forked from ocropus-archive/DUP-ocropy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocropus-econf
executable file
·116 lines (97 loc) · 3.82 KB
/
ocropus-econf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import warnings
import argparse
import sys
import os.path
import multiprocessing
import codecs
from collections import Counter
import numpy as np
import ocrolib
from ocrolib import edist
# Disable rank warnings from polyfit.
# NumPy 2.0 removed the top-level `np.RankWarning` alias; the class now lives
# in `np.exceptions` (available since NumPy 1.25).  Look there first and fall
# back to the legacy location so that older NumPy installs keep working.
try:
    RankWarning = np.exceptions.RankWarning
except AttributeError:
    RankWarning = np.RankWarning
warnings.simplefilter('ignore', RankWarning)
# Command-line interface: positional arguments are the ground-truth files,
# the options select the recognizer-output extension, the text projection
# used before comparing, and the optional report files.
parser = argparse.ArgumentParser(description = """
Compute the edit distances between ground truth and recognizer output.
Run with the ground truth files as arguments, and it will find the
corresponnding recognizer output files using the given extension (-x).
Missing output files are handled as empty strings, unless the -s
option is given.
""")
parser.add_argument("files", default=[], nargs='*', help="input lines")
parser.add_argument("-x", "--extension", default=".txt",
                    help="extension for recognizer output, default: %(default)s")
parser.add_argument("-k", "--kind", default="exact",
                    help="kind of comparison (exact, nospace, letdig, letters, digits, lnc), default: %(default)s")
parser.add_argument("-s", "--skipmissing", action="store_true",
                    help="don't use missing or empty output files in the calculation")
parser.add_argument("-c", "--confusion", default=10, type=int,
                    help="output this many top confusion, default: %(default)s")
parser.add_argument("-a", "--allconf", default=None,
                    help="output all confusions to this file")
parser.add_argument("-e", "--perfile", default=None,
                    help="output per-file errors to this file")
parser.add_argument("-C", "--context", type=int, default=0,
                    help="context for confusion matrix, default: %(default)s")
parser.add_argument("-Q", "--parallel", type=int, default=multiprocessing.cpu_count())
args = parser.parse_args()
# Expand the file arguments through ocrolib's globbing helper.
args.files = ocrolib.glob_all(args.files)

# Only the first file name is inspected for the ".gt." marker.  The list may
# legitimately be empty (nargs='*'), so guard the lookup instead of crashing
# with an IndexError; the empty case is reported later as "Nothing to compare".
if args.files and ".gt." not in args.files[0]:
    sys.stderr.write("warning: compare on .gt.txt files, not .txt files\n")
def process1(fname):
    """Score one ground-truth file against its recognizer output.

    The recognizer output path is the first component returned by
    `ocrolib.allsplitext(fname)` with `args.extension` appended.  Both texts
    are passed through `ocrolib.project_text` with `kind=args.kind` before
    they are compared.

    Returns the tuple `(fname, err, len(gt), missing, counts)` where

    * `err` is the error count from `edist.xlevenshtein` (or the length of
      the output text when the projected ground truth is empty),
    * `missing` is the ground-truth length if the output file does not
      exist, and 0 otherwise,
    * `counts` is a `Counter` of confusion pairs; it stays empty unless
      confusions were requested via `args.confusion` or `args.allconf`.

    NOTE(review): `args.skipmissing` is not consulted here; a missing output
    file is always scored as an empty string.
    """
    confusions = Counter()
    truth = ocrolib.project_text(ocrolib.read_text(fname), kind=args.kind)
    out_path = ocrolib.allsplitext(fname)[0] + args.extension

    if os.path.exists(out_path):
        missing = 0
        predicted = ocrolib.project_text(ocrolib.read_text(out_path), kind=args.kind)
    else:
        # No recognizer output: every ground-truth character counts as missing.
        missing = len(truth)
        predicted = ""

    if truth:
        err, pairs = edist.xlevenshtein(predicted, truth, context=args.context)
    else:
        # The ground truth can end up empty after projection (args.kind); the
        # aligner is bypassed and the whole output is recorded as a single
        # confusion against a run of '_' placeholders.
        err = len(predicted)
        pairs = [(predicted, '_' * len(predicted))] if predicted else []

    want_confusions = args.allconf is not None or args.confusion > 0
    if want_confusions:
        confusions.update((u, v) for u, v in pairs)

    return fname, err, len(truth), missing, confusions
# Score every ground-truth file (possibly in parallel) and order the results.
outputs = ocrolib.parallel_map(process1, args.files, parallel=args.parallel, chunksize=10)
# Sort on the scalar fields (fname, err, gt length, missing) only.  Comparing
# whole result tuples would fall through to the trailing Counter whenever two
# records tie on those fields, and Counter objects are not orderable on
# Python 3 before 3.10.
outputs = sorted(outputs, key=lambda rec: rec[:4])

perfile = None
allconf = None

# Running totals over all files.
errs = 0
total = 0
missing = 0
counts = Counter()

# The report files are opened inside the try block so that they are closed
# even if writing one of the records fails part-way through.
try:
    if args.perfile is not None:
        perfile = codecs.open(args.perfile, "w", "utf-8")
    if args.allconf is not None:
        allconf = codecs.open(args.allconf, "w", "utf-8")

    for fname, e, t, m, c in outputs:
        errs += e
        total += t
        missing += m
        counts += c
        if perfile is not None:
            perfile.write("%6d\t%6d\t%s\n"%(e,t,fname))
        if allconf is not None:
            # One line per distinct confusion pair of this file (at most
            # 1000); the per-pair count itself is not written.
            for (a, b), _ in c.most_common(1000):
                allconf.write("%s\t%s\t%s\n"%(a,b,fname))
finally:
    if perfile is not None:
        perfile.close()
    if allconf is not None:
        allconf.close()

# Summary report on stdout.
print("errors %8d"%errs)
print("missing %8d"%missing)
print("total %8d"%total)
if total > 0:
    print("err %8.3f %%"%(errs*100.0/total))
    # Error rate with the characters of missing output files discounted.
    print("errnomiss %8.3f %%"%((errs-missing)*100.0/total))

# Most frequent confusions across all files.
if args.confusion > 0:
    for (a, b), v in counts.most_common(args.confusion):
        print("%d\t%s\t%s" % (v, a, b))

# Final line: the raw error ratio, or a notice when there was no ground truth.
if total > 0:
    print(errs * 1.0 / total)
else:
    print("Nothing to compare")