diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
index 20f390c..1af8cf7 100644
--- a/src/pyocr/builders.py
+++ b/src/pyocr/builders.py
@@ -12,9 +12,12 @@
     from html.parser import HTMLParser
 
 import xml.dom.minidom
+import logging
 
 from .util import to_unicode
 
+logger = logging.getLogger(__name__)
+
 __all__ = [
     'Box',
     'TextBuilder',
@@ -42,7 +45,7 @@ class Box(object):
     was used.
     """
 
-    def __init__(self, content, position):
+    def __init__(self, content, position, confidence=0):
         """
         Arguments:
             content --- a single string
@@ -53,6 +56,7 @@ def __init__(self, content, position):
         content = to_unicode(content)
         self.content = content
         self.position = position
+        self.confidence = confidence
 
     def get_unicode_string(self):
         """
@@ -71,9 +75,10 @@ def get_unicode_string(self):
     def get_xml_tag(self, parent_doc):
         span_tag = parent_doc.createElement("span")
         span_tag.setAttribute("class", "ocrx_word")
-        span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
+        span_tag.setAttribute("title", ("bbox %d %d %d %d; x_wconf %d" % (
             (self.position[0][0], self.position[0][1],
-             self.position[1][0], self.position[1][1]))))
+             self.position[1][0], self.position[1][1],
+             self.confidence))))
         txt = xml.dom.minidom.Text()
         txt.data = self.content
         span_tag.appendChild(txt)
@@ -268,7 +273,7 @@ def start_line(self, box):
         """
         raise NotImplementedError("Implement in subclasses")
 
-    def add_word(self, word, box):
+    def add_word(self, word, box, confidence=0):
         """
         Add a word to output.
         """
@@ -329,7 +334,7 @@ def write_file(file_descriptor, text):
     def start_line(self, box):
         self.built_text.append(u"")
 
-    def add_word(self, word, box):
+    def add_word(self, word, box, confidence=0):
         if self.built_text[-1] != u"":
             self.built_text[-1] += u" "
         self.built_text[-1] += word
@@ -381,12 +386,24 @@ def __init__(self):
 
         self.__current_box_position = None
         self.__current_box_text = None
+        self.__current_box_confidence = None
         self.boxes = []
 
         self.__current_line_position = None
         self.__current_line_content = []
         self.lines = []
 
+    @staticmethod
+    def __parse_confidence(title):
+        for piece in title.split("; "):
+            piece = piece.strip()
+            if not piece.startswith("x_wconf"):
+                continue
+            confidence = piece.split(" ")[1]
+            return int(confidence)
+        logger.info("OCR confidence measure not found")
+        return 0
+
     @staticmethod
     def __parse_position(title):
         for piece in title.split("; "):
@@ -413,7 +430,9 @@ def handle_starttag(self, tag, attrs):
             return
         if tag_type == 'ocr_word' or tag_type == 'ocrx_word':
             try:
+                confidence = self.__parse_confidence(position)
                 position = self.__parse_position(position)
+                self.__current_box_confidence = confidence
                 self.__current_box_position = position
             except Exception:
                 # invalid position --> old format --> we ignore this tag
@@ -439,7 +458,7 @@ def handle_endtag(self, tag):
             if self.__current_box_text is None:
                 return
             box_position = self.__current_box_position
-            box = Box(self.__current_box_text, box_position)
+            box = Box(self.__current_box_text, box_position, self.__current_box_confidence)
             self.boxes.append(box)
             self.__current_line_content.append(box)
             self.__current_box_text = None
@@ -596,8 +615,8 @@ def write_file(file_descriptor, boxes):
     def start_line(self, box):
         pass
 
-    def add_word(self, word, box):
-        self.word_boxes.append(Box(word, box))
+    def add_word(self, word, box, confidence=0):
+        self.word_boxes.append(Box(word, box, confidence))
 
     def end_line(self):
         pass
@@ -680,8 +699,8 @@ def start_line(self, box):
             return
         self.lines.append(LineBox([], box))
 
-    def add_word(self, word, box):
-        self.lines[-1].word_boxes.append(Box(word, box))
+    def add_word(self, word, box, confidence=0):
+        self.lines[-1].word_boxes.append(Box(word, box, confidence))
 
     def end_line(self):
         pass
diff --git a/src/pyocr/libtesseract/__init__.py b/src/pyocr/libtesseract/__init__.py
index 6fb8e64..985aab0 100644
--- a/src/pyocr/libtesseract/__init__.py
+++ b/src/pyocr/libtesseract/__init__.py
@@ -140,13 +140,17 @@ def image_to_string(image, lang=None, builder=None):
                 res_iterator, lvl_word
             )
 
-            if word is not None and word != "":
+            confidence = tesseract_raw.result_iterator_get_confidence(
+                res_iterator, lvl_word
+            )
+
+            if word is not None and confidence is not None and word != "":
                 (r, box) = tesseract_raw.page_iterator_bounding_box(
                     page_iterator, lvl_word
                 )
                 assert(r)
                 box = _tess_box_to_pyocr_box(box)
-                builder.add_word(word, box)
+                builder.add_word(word, box, confidence)
 
             if last_word_in_line:
                 builder.end_line()
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
index 0264df9..30658cc 100644
--- a/src/pyocr/libtesseract/tesseract_raw.py
+++ b/src/pyocr/libtesseract/tesseract_raw.py
@@ -298,6 +298,12 @@ class OSResults(ctypes.Structure):
     g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
         ctypes.c_void_p
 
+    g_libtesseract.TessResultIteratorConfidence.argtypes = [
+        ctypes.c_void_p,
+        ctypes.c_int,
+    ]
+    g_libtesseract.TessResultIteratorConfidence.restype = ctypes.c_float
+
     g_libtesseract.TessDeleteText.argtypes = [
         ctypes.c_void_p
     ]
@@ -586,6 +592,14 @@ def result_iterator_get_utf8_text(iterator, level):
     g_libtesseract.TessDeleteText(ptr)
     return val
 
+def result_iterator_get_confidence(iterator, level):
+    ptr = g_libtesseract.TessResultIteratorConfidence(
+        ctypes.c_void_p(iterator), level
+    )
+    if ptr is None:
+        return None
+    val = ctypes.c_float(ptr).value
+    return val
 
 def detect_os(handle):
     global g_libtesseract