Skip to content
This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit

Permalink
Merge pull request #86 from Sqooba/enhancement/add-confidence-measure
Browse files Browse the repository at this point in the history
[Enhancement]: Propagate ocr confidence to output hocr file
  • Loading branch information
jflesch authored Nov 30, 2017
2 parents fb4be79 + ec5e511 commit 30182d1
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 12 deletions.
39 changes: 29 additions & 10 deletions src/pyocr/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
from html.parser import HTMLParser

import xml.dom.minidom
import logging

from .util import to_unicode

logger = logging.getLogger(__name__)

__all__ = [
'Box',
'TextBuilder',
Expand Down Expand Up @@ -42,7 +45,7 @@ class Box(object):
was used.
"""

def __init__(self, content, position):
def __init__(self, content, position, confidence=0):
"""
Arguments:
content --- a single string
Expand All @@ -53,6 +56,7 @@ def __init__(self, content, position):
content = to_unicode(content)
self.content = content
self.position = position
self.confidence = confidence

def get_unicode_string(self):
"""
Expand All @@ -71,9 +75,10 @@ def get_unicode_string(self):
def get_xml_tag(self, parent_doc):
span_tag = parent_doc.createElement("span")
span_tag.setAttribute("class", "ocrx_word")
span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
span_tag.setAttribute("title", ("bbox %d %d %d %d; x_wconf %d" % (
(self.position[0][0], self.position[0][1],
self.position[1][0], self.position[1][1]))))
self.position[1][0], self.position[1][1],
self.confidence))))
txt = xml.dom.minidom.Text()
txt.data = self.content
span_tag.appendChild(txt)
Expand Down Expand Up @@ -268,7 +273,7 @@ def start_line(self, box):
"""
raise NotImplementedError("Implement in subclasses")

def add_word(self, word, box):
def add_word(self, word, box, confidence=0):
"""
Add a word to output.
"""
Expand Down Expand Up @@ -329,7 +334,7 @@ def write_file(file_descriptor, text):
def start_line(self, box):
self.built_text.append(u"")

def add_word(self, word, box):
def add_word(self, word, box, confidence=0):
if self.built_text[-1] != u"":
self.built_text[-1] += u" "
self.built_text[-1] += word
Expand Down Expand Up @@ -381,12 +386,24 @@ def __init__(self):

self.__current_box_position = None
self.__current_box_text = None
self.__current_box_confidence = None
self.boxes = []

self.__current_line_position = None
self.__current_line_content = []
self.lines = []

@staticmethod
def __parse_confidence(title):
    """
    Extract the word confidence ("x_wconf") from an hOCR title attribute.

    Arguments:
        title --- content of the "title" attribute of an hOCR tag,
            e.g. "bbox 10 20 30 40; x_wconf 93"

    Returns:
        The confidence as an integer. 0 is returned when the title
        has no "x_wconf" property or when its value cannot be parsed.
    """
    # Split on a bare ";" and strip each piece, so both "a; b" and "a;b"
    # are accepted.
    for piece in title.split(";"):
        fields = piece.split()
        if not fields or fields[0] != "x_wconf":
            continue
        try:
            # Go through float() so that values such as "93.5" are
            # accepted; the result stays an integer.
            return int(float(fields[1]))
        except (IndexError, ValueError, OverflowError):
            # A broken confidence must not raise: the caller treats any
            # exception as an invalid tag and would drop the whole word.
            logger.warning("Invalid OCR confidence measure: %r", piece)
            return 0
    logger.info("OCR confidence measure not found")
    return 0

@staticmethod
def __parse_position(title):
for piece in title.split("; "):
Expand All @@ -413,7 +430,9 @@ def handle_starttag(self, tag, attrs):
return
if tag_type == 'ocr_word' or tag_type == 'ocrx_word':
try:
confidence = self.__parse_confidence(position)
position = self.__parse_position(position)
self.__current_box_confidence = confidence
self.__current_box_position = position
except Exception:
# invalid position --> old format --> we ignore this tag
Expand All @@ -439,7 +458,7 @@ def handle_endtag(self, tag):
if self.__current_box_text is None:
return
box_position = self.__current_box_position
box = Box(self.__current_box_text, box_position)
box = Box(self.__current_box_text, box_position, self.__current_box_confidence)
self.boxes.append(box)
self.__current_line_content.append(box)
self.__current_box_text = None
Expand Down Expand Up @@ -596,8 +615,8 @@ def write_file(file_descriptor, boxes):
def start_line(self, box):
pass

def add_word(self, word, box):
self.word_boxes.append(Box(word, box))
def add_word(self, word, box, confidence=0):
self.word_boxes.append(Box(word, box, confidence))

def end_line(self):
pass
Expand Down Expand Up @@ -680,8 +699,8 @@ def start_line(self, box):
return
self.lines.append(LineBox([], box))

def add_word(self, word, box):
self.lines[-1].word_boxes.append(Box(word, box))
def add_word(self, word, box, confidence=0):
self.lines[-1].word_boxes.append(Box(word, box, confidence))

def end_line(self):
pass
Expand Down
8 changes: 6 additions & 2 deletions src/pyocr/libtesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,17 @@ def image_to_string(image, lang=None, builder=None):
res_iterator, lvl_word
)

if word is not None and word != "":
confidence = tesseract_raw.result_iterator_get_confidence(
res_iterator, lvl_word
)

if word is not None and confidence is not None and word != "":
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_word
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
builder.add_word(word, box, confidence)

if last_word_in_line:
builder.end_line()
Expand Down
14 changes: 14 additions & 0 deletions src/pyocr/libtesseract/tesseract_raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,12 @@ class OSResults(ctypes.Structure):
g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
ctypes.c_void_p

# ctypes signature of TessResultIteratorConfidence(): it takes a pointer
# (passed as void*) and an int, and returns a C float. Declaring restype
# makes ctypes convert the result to a Python float instead of its default
# C int.
g_libtesseract.TessResultIteratorConfidence.argtypes = [
ctypes.c_void_p,
ctypes.c_int,
]
g_libtesseract.TessResultIteratorConfidence.restype = ctypes.c_float

g_libtesseract.TessDeleteText.argtypes = [
ctypes.c_void_p
]
Expand Down Expand Up @@ -586,6 +592,14 @@ def result_iterator_get_utf8_text(iterator, level):
g_libtesseract.TessDeleteText(ptr)
return val

def result_iterator_get_confidence(iterator, level):
    """
    Return the confidence reported by Tesseract for the current element.

    Arguments:
        iterator --- address of the result iterator, wrapped in a
            ctypes.c_void_p before being handed to the library
        level --- iterator level to query, passed through unchanged

    Returns:
        The value returned by TessResultIteratorConfidence(), as a
        Python float.
    """
    # The function's restype is declared as ctypes.c_float, so ctypes
    # already converts the result to a Python float: it can never be None
    # and needs no further ctypes.c_float(...) round-trip.
    return g_libtesseract.TessResultIteratorConfidence(
        ctypes.c_void_p(iterator), level
    )

def detect_os(handle):
global g_libtesseract
Expand Down

0 comments on commit 30182d1

Please sign in to comment.