Skip to content
This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

[Enhancement]: Propagate ocr confidence to output hocr file #86

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 29 additions & 10 deletions src/pyocr/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
from html.parser import HTMLParser

import xml.dom.minidom
import logging

from .util import to_unicode

logger = logging.getLogger(__name__)

__all__ = [
'Box',
'TextBuilder',
Expand Down Expand Up @@ -42,7 +45,7 @@ class Box(object):
was used.
"""

def __init__(self, content, position):
def __init__(self, content, position, confidence=0):
"""
Arguments:
content --- a single string
Expand All @@ -53,6 +56,7 @@ def __init__(self, content, position):
content = to_unicode(content)
self.content = content
self.position = position
self.confidence = confidence

def get_unicode_string(self):
"""
Expand All @@ -71,9 +75,10 @@ def get_unicode_string(self):
def get_xml_tag(self, parent_doc):
span_tag = parent_doc.createElement("span")
span_tag.setAttribute("class", "ocrx_word")
span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
span_tag.setAttribute("title", ("bbox %d %d %d %d; x_wconf %d" % (
(self.position[0][0], self.position[0][1],
self.position[1][0], self.position[1][1]))))
self.position[1][0], self.position[1][1],
self.confidence))))
txt = xml.dom.minidom.Text()
txt.data = self.content
span_tag.appendChild(txt)
Expand Down Expand Up @@ -268,7 +273,7 @@ def start_line(self, box):
"""
raise NotImplementedError("Implement in subclasses")

def add_word(self, word, box):
def add_word(self, word, box, confidence=0):
"""
Add a word to output.
"""
Expand Down Expand Up @@ -329,7 +334,7 @@ def write_file(file_descriptor, text):
def start_line(self, box):
self.built_text.append(u"")

def add_word(self, word, box):
def add_word(self, word, box, confidence=0):
if self.built_text[-1] != u"":
self.built_text[-1] += u" "
self.built_text[-1] += word
Expand Down Expand Up @@ -381,12 +386,24 @@ def __init__(self):

self.__current_box_position = None
self.__current_box_text = None
self.__current_box_confidence = None
self.boxes = []

self.__current_line_position = None
self.__current_line_content = []
self.lines = []

@staticmethod
def __parse_confidence(title):
    """
    Extract the word confidence from the "title" attribute of an hOCR
    tag.

    Arguments:
        title --- content of the "title" attribute: a list of
            ";"-separated properties, for instance
            "bbox 10 20 30 40; x_wconf 87"

    Returns:
        The value of the "x_wconf" property as an integer (decimal
        values are truncated). 0 if the property is missing or if its
        value cannot be parsed.
    """
    # Split on ";" alone and let str.split() deal with the whitespace,
    # so "a;b", "a; b" and repeated spaces are all accepted.
    for piece in title.split(";"):
        fields = piece.split()
        if not fields or fields[0] != "x_wconf":
            continue
        try:
            # Go through float() first so that a decimal confidence
            # ("x_wconf 87.5") is accepted as well.
            return int(float(fields[1]))
        except (IndexError, ValueError, OverflowError):
            # handle_starttag() treats any exception raised while
            # parsing the title as an invalid tag and ignores the
            # word; a broken confidence alone should not lose text,
            # so fall back to the same value as "not found".
            logger.warning(
                "Invalid OCR confidence measure: %r", piece.strip()
            )
            return 0
    logger.info("OCR confidence measure not found")
    return 0

@staticmethod
def __parse_position(title):
for piece in title.split("; "):
Expand All @@ -413,7 +430,9 @@ def handle_starttag(self, tag, attrs):
return
if tag_type == 'ocr_word' or tag_type == 'ocrx_word':
try:
confidence = self.__parse_confidence(position)
position = self.__parse_position(position)
self.__current_box_confidence = confidence
self.__current_box_position = position
except Exception:
# invalid position --> old format --> we ignore this tag
Expand All @@ -439,7 +458,7 @@ def handle_endtag(self, tag):
if self.__current_box_text is None:
return
box_position = self.__current_box_position
box = Box(self.__current_box_text, box_position)
box = Box(self.__current_box_text, box_position, self.__current_box_confidence)
self.boxes.append(box)
self.__current_line_content.append(box)
self.__current_box_text = None
Expand Down Expand Up @@ -596,8 +615,8 @@ def write_file(file_descriptor, boxes):
def start_line(self, box):
pass

def add_word(self, word, box):
self.word_boxes.append(Box(word, box))
def add_word(self, word, box, confidence=0):
self.word_boxes.append(Box(word, box, confidence))

def end_line(self):
pass
Expand Down Expand Up @@ -680,8 +699,8 @@ def start_line(self, box):
return
self.lines.append(LineBox([], box))

def add_word(self, word, box):
self.lines[-1].word_boxes.append(Box(word, box))
def add_word(self, word, box, confidence=0):
self.lines[-1].word_boxes.append(Box(word, box, confidence))

def end_line(self):
pass
Expand Down
8 changes: 6 additions & 2 deletions src/pyocr/libtesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,17 @@ def image_to_string(image, lang=None, builder=None):
res_iterator, lvl_word
)

if word is not None and word != "":
confidence = tesseract_raw.result_iterator_get_confidence(
res_iterator, lvl_word
)

if word is not None and confidence is not None and word != "":
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_word
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
builder.add_word(word, box, confidence)

if last_word_in_line:
builder.end_line()
Expand Down
14 changes: 14 additions & 0 deletions src/pyocr/libtesseract/tesseract_raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,12 @@ class OSResults(ctypes.Structure):
g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
ctypes.c_void_p

# Declare the C prototype of TessResultIteratorConfidence() for ctypes:
# it takes an opaque pointer and a C int, and returns a C float.
# With restype set to c_float, ctypes converts the result to a Python
# float instead of using its default int return type.
g_libtesseract.TessResultIteratorConfidence.argtypes = [
ctypes.c_void_p,
ctypes.c_int,
]
g_libtesseract.TessResultIteratorConfidence.restype = ctypes.c_float

g_libtesseract.TessDeleteText.argtypes = [
ctypes.c_void_p
]
Expand Down Expand Up @@ -586,6 +592,14 @@ def result_iterator_get_utf8_text(iterator, level):
g_libtesseract.TessDeleteText(ptr)
return val

def result_iterator_get_confidence(iterator, level):
    """
    Return the confidence libtesseract reports for the element the
    result iterator currently points at.

    Arguments:
        iterator --- address of the result iterator; it is wrapped in a
            ctypes.c_void_p before being handed to libtesseract
        level --- iterator level, passed unchanged to
            TessResultIteratorConfidence()

    Returns:
        The confidence as a Python float.
    """
    # TessResultIteratorConfidence.restype is declared as c_float, so
    # ctypes already returns a plain Python float here: it can never be
    # None and does not need to be wrapped in a c_float again.
    return g_libtesseract.TessResultIteratorConfidence(
        ctypes.c_void_p(iterator), level
    )

def detect_os(handle):
global g_libtesseract
Expand Down