Merge pull request #86 from Sqooba/enhancement/add-confidence-measure

[Enhancement]: Propagate OCR confidence to the output hOCR file
openpaperwork · Nov 30, 2017 · 30182d1 · 30182d1
2 parents fb4be79 + ec5e511
commit 30182d1
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 12 deletions.
diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
@@ -12,9 +12,12 @@
     from html.parser import HTMLParser
 
 import xml.dom.minidom
+import logging
 
 from .util import to_unicode
 
+logger = logging.getLogger(__name__)
+
 __all__ = [
     'Box',
     'TextBuilder',
@@ -42,7 +45,7 @@ class Box(object):
     was used.
     """
 
-    def __init__(self, content, position):
+    def __init__(self, content, position, confidence=0):
         """
         Arguments:
             content --- a single string
@@ -53,6 +56,7 @@ def __init__(self, content, position):
         content = to_unicode(content)
         self.content = content
         self.position = position
+        self.confidence = confidence
 
     def get_unicode_string(self):
         """
@@ -71,9 +75,10 @@ def get_unicode_string(self):
     def get_xml_tag(self, parent_doc):
         span_tag = parent_doc.createElement("span")
         span_tag.setAttribute("class", "ocrx_word")
-        span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
+        span_tag.setAttribute("title", ("bbox %d %d %d %d; x_wconf %d" % (
             (self.position[0][0], self.position[0][1],
-             self.position[1][0], self.position[1][1]))))
+             self.position[1][0], self.position[1][1],
+             self.confidence))))
         txt = xml.dom.minidom.Text()
         txt.data = self.content
         span_tag.appendChild(txt)
@@ -268,7 +273,7 @@ def start_line(self, box):
         """
         raise NotImplementedError("Implement in subclasses")
 
-    def add_word(self, word, box):
+    def add_word(self, word, box, confidence=0):
         """
         Add a word to output.
         """
@@ -329,7 +334,7 @@ def write_file(file_descriptor, text):
     def start_line(self, box):
         self.built_text.append(u"")
 
-    def add_word(self, word, box):
+    def add_word(self, word, box, confidence=0):
         if self.built_text[-1] != u"":
             self.built_text[-1] += u" "
         self.built_text[-1] += word
@@ -381,12 +386,24 @@ def __init__(self):
 
         self.__current_box_position = None
         self.__current_box_text = None
+        self.__current_box_confidence = None
         self.boxes = []
 
         self.__current_line_position = None
         self.__current_line_content = []
         self.lines = []
 
+    @staticmethod
+    def __parse_confidence(title):
+        for piece in title.split("; "):
+            piece = piece.strip()
+            if not piece.startswith("x_wconf"):
+                continue
+            confidence = piece.split(" ")[1]
+            return int(confidence)
+        logger.info("OCR confidence measure not found")
+        return 0
+
     @staticmethod
     def __parse_position(title):
         for piece in title.split("; "):
@@ -413,7 +430,9 @@ def handle_starttag(self, tag, attrs):
             return
         if tag_type == 'ocr_word' or tag_type == 'ocrx_word':
             try:
+                confidence = self.__parse_confidence(position)
                 position = self.__parse_position(position)
+                self.__current_box_confidence = confidence
                 self.__current_box_position = position
             except Exception:
                 # invalid position --> old format --> we ignore this tag
@@ -439,7 +458,7 @@ def handle_endtag(self, tag):
             if self.__current_box_text is None:
                 return
             box_position = self.__current_box_position
-            box = Box(self.__current_box_text, box_position)
+            box = Box(self.__current_box_text, box_position, self.__current_box_confidence)
             self.boxes.append(box)
             self.__current_line_content.append(box)
             self.__current_box_text = None
@@ -596,8 +615,8 @@ def write_file(file_descriptor, boxes):
     def start_line(self, box):
         pass
 
-    def add_word(self, word, box):
-        self.word_boxes.append(Box(word, box))
+    def add_word(self, word, box, confidence=0):
+        self.word_boxes.append(Box(word, box, confidence))
 
     def end_line(self):
         pass
@@ -680,8 +699,8 @@ def start_line(self, box):
             return
         self.lines.append(LineBox([], box))
 
-    def add_word(self, word, box):
-        self.lines[-1].word_boxes.append(Box(word, box))
+    def add_word(self, word, box, confidence=0):
+        self.lines[-1].word_boxes.append(Box(word, box, confidence))
 
     def end_line(self):
         pass

diff --git a/src/pyocr/libtesseract/__init__.py b/src/pyocr/libtesseract/__init__.py
@@ -140,13 +140,17 @@ def image_to_string(image, lang=None, builder=None):
                 res_iterator, lvl_word
             )
 
-            if word is not None and word != "":
+            confidence = tesseract_raw.result_iterator_get_confidence(
+                res_iterator, lvl_word
+            )
+
+            if word is not None and confidence is not None and word != "":
                 (r, box) = tesseract_raw.page_iterator_bounding_box(
                     page_iterator, lvl_word
                 )
                 assert(r)
                 box = _tess_box_to_pyocr_box(box)
-                builder.add_word(word, box)
+                builder.add_word(word, box, confidence)
 
                 if last_word_in_line:
                     builder.end_line()

diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
@@ -298,6 +298,12 @@ class OSResults(ctypes.Structure):
     g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
         ctypes.c_void_p
 
+    g_libtesseract.TessResultIteratorConfidence.argtypes = [
+        ctypes.c_void_p,
+        ctypes.c_int,
+    ]
+    g_libtesseract.TessResultIteratorConfidence.restype = ctypes.c_float
+
     g_libtesseract.TessDeleteText.argtypes = [
         ctypes.c_void_p
     ]
@@ -586,6 +592,14 @@ def result_iterator_get_utf8_text(iterator, level):
     g_libtesseract.TessDeleteText(ptr)
     return val
 
+def result_iterator_get_confidence(iterator, level):
+    ptr = g_libtesseract.TessResultIteratorConfidence(
+        ctypes.c_void_p(iterator), level
+    )
+    if ptr is None:
+        return None
+    val = ctypes.c_float(ptr).value
+    return val
 
 def detect_os(handle):
     global g_libtesseract