From c2460ade7f08e7278715573c6bb61ecc72be4682 Mon Sep 17 00:00:00 2001
From: Adriano Pagano
Date: Wed, 15 Nov 2017 15:35:56 +0100
Subject: [PATCH] Propagate ocr confidence to output hocr file when using tesseract

---
Review notes (free-form commentary, not part of the commit message):
 * The confidence is optional everywhere: Box() and every add_word()
   implementation default it to None, so callers that still pass only
   (word, box) keep working.
 * Box.get_xml_tag() writes "x_wconf" only when a confidence is known;
   formatting None with "%d" raised TypeError.
 * hOCR word spans without "x_wconf" are parsed with confidence None
   instead of being ignored.
 * Box.get_unicode_string() is left untouched: its docstring says the
   string is re-read by read_box_file(), and no reader is updated here.
 * The blob id after ".." on the index line below is the one from the
   original submission and predates these revisions.

 src/pyocr/builders.py | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
index 20f390c..b26b288 100644
--- a/src/pyocr/builders.py
+++ b/src/pyocr/builders.py
@@ -42,7 +42,7 @@ class Box(object):
     was used.
     """
 
-    def __init__(self, content, position):
+    def __init__(self, content, position, confidence=None):
         """
         Arguments:
             content --- a single string
@@ -53,6 +53,8 @@ def __init__(self, content, position):
             content = to_unicode(content)
         self.content = content
         self.position = position
+        # Confidence reported by the OCR tool; None when not available
+        self.confidence = confidence
 
     def get_unicode_string(self):
         """
@@ -71,9 +73,13 @@ def get_unicode_string(self):
     def get_xml_tag(self, parent_doc):
         span_tag = parent_doc.createElement("span")
         span_tag.setAttribute("class", "ocrx_word")
-        span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
-            (self.position[0][0], self.position[0][1],
-             self.position[1][0], self.position[1][1]))))
+        title = "bbox %d %d %d %d" % (
+            self.position[0][0], self.position[0][1],
+            self.position[1][0], self.position[1][1])
+        if self.confidence is not None:
+            # x_wconf is optional in hOCR: only emit it when it is known
+            title += "; x_wconf %d" % self.confidence
+        span_tag.setAttribute("title", title)
         txt = xml.dom.minidom.Text()
         txt.data = self.content
         span_tag.appendChild(txt)
@@ -268,7 +274,7 @@ def start_line(self, box):
         """
         raise NotImplementedError("Implement in subclasses")
 
-    def add_word(self, word, box):
+    def add_word(self, word, box, confidence=None):
         """
         Add a word to output.
         """
@@ -329,7 +335,7 @@ def write_file(file_descriptor, text):
     def start_line(self, box):
         self.built_text.append(u"")
 
-    def add_word(self, word, box):
+    def add_word(self, word, box, confidence=None):
         if self.built_text[-1] != u"":
             self.built_text[-1] += u" "
         self.built_text[-1] += word
@@ -381,12 +387,27 @@ def __init__(self):
 
         self.__current_box_position = None
         self.__current_box_text = None
+        self.__current_box_confidence = None
         self.boxes = []
 
         self.__current_line_position = None
         self.__current_line_content = []
         self.lines = []
 
+    @staticmethod
+    def __parse_confidence(title):
+        """
+        Return the x_wconf value found in an hOCR title attribute,
+        or None when the title has none (x_wconf is optional).
+        """
+        for piece in title.split("; "):
+            piece = piece.strip()
+            if not piece.startswith("x_wconf"):
+                continue
+            confidence = piece.split(" ")[1]
+            return int(confidence)
+        return None
+
     @staticmethod
     def __parse_position(title):
         for piece in title.split("; "):
@@ -413,7 +434,10 @@ def handle_starttag(self, tag, attrs):
             return
         if tag_type == 'ocr_word' or tag_type == 'ocrx_word':
             try:
+                # 'position' still holds the raw title attribute here
+                confidence = self.__parse_confidence(position)
                 position = self.__parse_position(position)
+                self.__current_box_confidence = confidence
                 self.__current_box_position = position
             except Exception:
                 # invalid position --> old format --> we ignore this tag
@@ -439,7 +463,8 @@ def handle_endtag(self, tag):
             if self.__current_box_text is None:
                 return
             box_position = self.__current_box_position
-            box = Box(self.__current_box_text, box_position)
+            box = Box(self.__current_box_text, box_position,
+                      self.__current_box_confidence)
             self.boxes.append(box)
             self.__current_line_content.append(box)
             self.__current_box_text = None
@@ -596,8 +621,8 @@ def write_file(file_descriptor, boxes):
     def start_line(self, box):
         pass
 
-    def add_word(self, word, box):
-        self.word_boxes.append(Box(word, box))
+    def add_word(self, word, box, confidence=None):
+        self.word_boxes.append(Box(word, box, confidence))
 
     def end_line(self):
         pass
@@ -680,8 +705,8 @@ def start_line(self, box):
             return
         self.lines.append(LineBox([], box))
 
-    def add_word(self, word, box):
-        self.lines[-1].word_boxes.append(Box(word, box))
+    def add_word(self, word, box, confidence=None):
+        self.lines[-1].word_boxes.append(Box(word, box, confidence))
 
     def end_line(self):
         pass