diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py
index b26b288..ca34afd 100644
--- a/src/pyocr/builders.py
+++ b/src/pyocr/builders.py
@@ -42,7 +42,7 @@ class Box(object):
     was used.
     """
 
-    def __init__(self, content, position, confidence=None):
+    def __init__(self, content, position, confidence=0):
         """
         Arguments:
             content --- a single string
@@ -61,9 +61,8 @@ def get_unicode_string(self):
         This string can be stored in a file as-is (see write_box_file())
         and reread using read_box_file().
         """
-        return to_unicode("%s %s %d %d %d %d") % (
+        return to_unicode("%s %d %d %d %d") % (
             self.content,
-            self.confidence,
             self.position[0][0],
             self.position[0][1],
             self.position[1][0],
diff --git a/src/pyocr/libtesseract/__init__.py b/src/pyocr/libtesseract/__init__.py
index c3e8e27..d338d53 100644
--- a/src/pyocr/libtesseract/__init__.py
+++ b/src/pyocr/libtesseract/__init__.py
@@ -140,13 +140,18 @@ def image_to_string(image, lang=None, builder=None):
                         res_iterator, lvl_word
                     )
 
+                    # Per-word confidence, handed to the builder below.
+                    confidence = tesseract_raw.result_iterator_get_confidence(
+                        res_iterator, lvl_word
+                    )
+
                     if word is not None and word != "":
                         (r, box) = tesseract_raw.page_iterator_bounding_box(
                             page_iterator, lvl_word
                         )
                         assert(r)
                         box = _tess_box_to_pyocr_box(box)
-                        builder.add_word(word, box)
+                        builder.add_word(word, box, confidence)
 
                     if last_word_in_line:
                         builder.end_line()
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
index 0264df9..30658cc 100644
--- a/src/pyocr/libtesseract/tesseract_raw.py
+++ b/src/pyocr/libtesseract/tesseract_raw.py
@@ -298,6 +298,12 @@ class OSResults(ctypes.Structure):
     g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
         ctypes.c_void_p
 
+    g_libtesseract.TessResultIteratorConfidence.argtypes = [
+        ctypes.c_void_p,
+        ctypes.c_int,
+    ]
+    g_libtesseract.TessResultIteratorConfidence.restype = ctypes.c_float
+
     g_libtesseract.TessDeleteText.argtypes = [
         ctypes.c_void_p
     ]
@@ -586,6 +592,24 @@ def result_iterator_get_utf8_text(iterator, level):
     g_libtesseract.TessDeleteText(ptr)
     return val
 
+
+def result_iterator_get_confidence(iterator, level):
+    """
+    Return the confidence Tesseract reports for the current element.
+
+    Arguments:
+        iterator --- result iterator handle (wrapped in ctypes.c_void_p)
+        level --- page iterator level passed through to Tesseract
+
+    Returns:
+        A Python float. TessResultIteratorConfidence is declared with a
+        ctypes.c_float restype, so ctypes already converts the result;
+        it is never None.
+    """
+    return g_libtesseract.TessResultIteratorConfidence(
+        ctypes.c_void_p(iterator), level
+    )
+
 
 def detect_os(handle):
     global g_libtesseract