Skip to content
This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit

Permalink
Add support for calculating confidence measure in libtesseract (Cunei…
Browse files Browse the repository at this point in the history
…form will default to confidence=0)
  • Loading branch information
a-pagano committed Nov 30, 2017
1 parent c2460ad commit 01b1b2c
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 6 deletions.
5 changes: 2 additions & 3 deletions src/pyocr/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Box(object):
was used.
"""

def __init__(self, content, position, confidence=None):
def __init__(self, content, position, confidence=0):
"""
Arguments:
content --- a single string
Expand All @@ -61,9 +61,8 @@ def get_unicode_string(self):
This string can be stored in a file as-is (see write_box_file())
and reread using read_box_file().
"""
return to_unicode("%s %s %d %d %d %d") % (
return to_unicode("%s %d %d %d %d") % (
self.content,
self.confidence,
self.position[0][0],
self.position[0][1],
self.position[1][0],
Expand Down
11 changes: 8 additions & 3 deletions src/pyocr/libtesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,11 @@ def get_available_builders():
]


def _tess_box_to_pyocr_box(box):
def _tess_box_to_pyocr_box(box, confidence=None):
return (
(box[0], box[1]),
(box[2], box[3]),
confidence
)


Expand Down Expand Up @@ -140,13 +141,17 @@ def image_to_string(image, lang=None, builder=None):
res_iterator, lvl_word
)

if word is not None and word != "":
confidence = tesseract_raw.result_iterator_get_confidence(
res_iterator, lvl_word
)

if word is not None and confidence is not None and word != "":
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_word
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
builder.add_word(word, box, confidence)

if last_word_in_line:
builder.end_line()
Expand Down
14 changes: 14 additions & 0 deletions src/pyocr/libtesseract/tesseract_raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,12 @@ class OSResults(ctypes.Structure):
g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
ctypes.c_void_p

# Declare the C signature of TessResultIteratorConfidence() for ctypes:
# it takes an opaque pointer and an int, and returns a C float.
g_libtesseract.TessResultIteratorConfidence.argtypes = [
ctypes.c_void_p,
ctypes.c_int,
]
g_libtesseract.TessResultIteratorConfidence.restype = ctypes.c_float

g_libtesseract.TessDeleteText.argtypes = [
ctypes.c_void_p
]
Expand Down Expand Up @@ -586,6 +592,14 @@ def result_iterator_get_utf8_text(iterator, level):
g_libtesseract.TessDeleteText(ptr)
return val

def result_iterator_get_confidence(iterator, level):
    """
    Query libtesseract for the confidence of the element the given
    result iterator is positioned on, at the requested level.

    Arguments:
        iterator --- result iterator handle (wrapped in a c_void_p)
        level --- level passed through to TessResultIteratorConfidence()

    Returns:
        The value returned by TessResultIteratorConfidence(), passed
        through ctypes.c_float, or None if the call returned None.
    """
    raw_confidence = g_libtesseract.TessResultIteratorConfidence(
        ctypes.c_void_p(iterator), level
    )
    # NOTE(review): presumably never None when the restype is c_float;
    # the check is kept so the behavior stays unchanged. Confirm.
    if raw_confidence is not None:
        return ctypes.c_float(raw_confidence).value
    return None

def detect_os(handle):
global g_libtesseract
Expand Down

0 comments on commit 01b1b2c

Please sign in to comment.