Skip to content
This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit

Permalink
Merge pull request #88 from ZoranPavlovic/feature/minor_changes
Browse files Browse the repository at this point in the history
Feature/minor changes
  • Loading branch information
jflesch authored Jan 2, 2018
2 parents 1ba207c + 45361a8 commit ce23c24
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 31 deletions.
5 changes: 5 additions & 0 deletions src/pyocr/libtesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
from ..error import TesseractError
from ..util import digits_only

import logging
logger = logging.getLogger(__name__)


__all__ = [
'can_detect_orientation',
Expand Down Expand Up @@ -212,11 +215,13 @@ def is_available():
if not available:
return False
version = get_version()

# C-API with Tesseract <= 3.02 segfaults sometimes
# (seen with Debian stable + Paperwork)
# not tested with 3.03
if (version[0] < 3 or
(version[0] == 3 and version[1] < 4)):
logger.warning("Unsupported version [%s]" % ".".join([str(r) for r in version]))
return False
return True

Expand Down
38 changes: 10 additions & 28 deletions src/pyocr/libtesseract/tesseract_raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,14 @@

g_libtesseract = None

# Try each candidate library name in order and keep the first one that loads.
# lib_load_errors collects one (libname, error text) pair per failed attempt
# and is emptied again as soon as a candidate loads successfully, so it is
# only non-empty when no library could be loaded at all.
lib_load_errors = []
for libname in libnames:
    try:
        g_libtesseract = ctypes.cdll.LoadLibrary(libname)
        lib_load_errors = []
        break
    except OSError as ex:
        # str(ex) instead of ex.message: exceptions have no `.message`
        # attribute on Python 3, so reading it would raise AttributeError
        # here and abort the import of this module.
        lib_load_errors.append((libname, str(ex)))


class PageSegMode(object):
Expand Down Expand Up @@ -353,23 +355,20 @@ def init(lang=None):


def cleanup(handle):
    """Pass *handle* (wrapped as a void pointer) to TessBaseAPIDelete.

    Asserts that the libtesseract library object is set before calling it.
    """
    assert g_libtesseract
    delete_api = g_libtesseract.TessBaseAPIDelete
    delete_api(ctypes.c_void_p(handle))


def is_available():
    """Return True when the module-level g_libtesseract is not None."""
    if g_libtesseract is None:
        return False
    return True


def get_version():
    """Return the value of TessVersion() decoded from UTF-8.

    Asserts that the libtesseract library object is set before calling it.
    """
    assert g_libtesseract
    raw_version = g_libtesseract.TessVersion()
    return raw_version.decode("utf-8")


def get_available_languages(handle):
global g_libtesseract
assert(g_libtesseract)

langs = []
Expand All @@ -385,7 +384,6 @@ def get_available_languages(handle):


def set_is_numeric(handle, mode):
global g_libtesseract
assert(g_libtesseract)

if mode:
Expand All @@ -401,7 +399,6 @@ def set_is_numeric(handle, mode):


def set_debug_file(handle, filename):
global g_libtesseract
assert(g_libtesseract)

if not isinstance(filename, bytes):
Expand All @@ -415,7 +412,6 @@ def set_debug_file(handle, filename):


def set_page_seg_mode(handle, mode):
global g_libtesseract
assert(g_libtesseract)

g_libtesseract.TessBaseAPISetPageSegMode(
Expand All @@ -424,14 +420,12 @@ def set_page_seg_mode(handle, mode):


def init_for_analyse_page(handle):
    """Call TessBaseAPIInitForAnalysePage with *handle* as a void pointer.

    Asserts that the libtesseract library object is set; returns nothing.
    """
    assert g_libtesseract
    init_fn = g_libtesseract.TessBaseAPIInitForAnalysePage
    init_fn(ctypes.c_void_p(handle))


def set_image(handle, image):
global g_libtesseract
assert(g_libtesseract)

image = image.convert("RGB")
Expand All @@ -451,7 +445,6 @@ def set_image(handle, image):


def recognize(handle):
global g_libtesseract
assert(g_libtesseract)

return g_libtesseract.TessBaseAPIRecognize(
Expand All @@ -460,35 +453,32 @@ def recognize(handle):


def analyse_layout(handle):
    """Call TessBaseAPIAnalyseLayout on *handle* and return its result.

    The handle is wrapped as a void pointer; the library's return value is
    handed back to the caller unchanged.
    """
    assert g_libtesseract
    analyse_fn = g_libtesseract.TessBaseAPIAnalyseLayout
    return analyse_fn(ctypes.c_void_p(handle))


def get_utf8_text(handle):
    """Return the text from TessBaseAPIGetUTF8Text(handle) as a str.

    The pointer returned by the library is read as a C string, decoded from
    UTF-8, and then handed to TessDeleteText.

    Raises whatever reading or decoding the buffer raises (for example
    UnicodeDecodeError); the buffer is still passed to TessDeleteText in
    that case.
    """
    assert g_libtesseract
    ptr = g_libtesseract.TessBaseAPIGetUTF8Text(ctypes.c_void_p(handle))
    try:
        return ctypes.cast(ptr, ctypes.c_char_p).value.decode("utf-8")
    finally:
        # Hand the buffer back to the library even when decoding fails, so
        # an exception does not leak it. Skip the call for a null pointer:
        # there is nothing to release then.
        if ptr:
            g_libtesseract.TessDeleteText(ptr)


def page_iterator_delete(iterator):
    """Call TessPageIteratorDelete on *iterator* and return its result.

    The iterator is wrapped as a void pointer before the call.
    """
    assert g_libtesseract
    delete_fn = g_libtesseract.TessPageIteratorDelete
    return delete_fn(ctypes.c_void_p(iterator))


def page_iterator_next(iterator, level):
    """Call TessPageIteratorNext(iterator, level) and return its result.

    *iterator* is wrapped as a void pointer; *level* is passed through as
    given.
    """
    assert g_libtesseract
    next_fn = g_libtesseract.TessPageIteratorNext
    return next_fn(ctypes.c_void_p(iterator), level)


def page_iterator_is_at_beginning_of(iterator, level):
global g_libtesseract
assert(g_libtesseract)

return g_libtesseract.TessPageIteratorIsAtBeginningOf(
Expand All @@ -497,7 +487,6 @@ def page_iterator_is_at_beginning_of(iterator, level):


def page_iterator_is_at_final_element(iterator, level, element):
global g_libtesseract
assert(g_libtesseract)

return g_libtesseract.TessPageIteratorIsAtFinalElement(
Expand All @@ -506,7 +495,6 @@ def page_iterator_is_at_final_element(iterator, level, element):


def page_iterator_block_type(iterator):
global g_libtesseract
assert(g_libtesseract)

return g_libtesseract.TessPageIteratorBlockType(
Expand All @@ -515,7 +503,6 @@ def page_iterator_block_type(iterator):


def page_iterator_bounding_box(iterator, level):
global g_libtesseract
assert(g_libtesseract)

left = ctypes.c_int(0)
Expand All @@ -541,7 +528,6 @@ def page_iterator_bounding_box(iterator, level):


def page_iterator_orientation(iterator):
global g_libtesseract
assert(g_libtesseract)

orientation = ctypes.c_int(0)
Expand All @@ -566,15 +552,13 @@ def page_iterator_orientation(iterator):


def get_iterator(handle):
    """Return the result of TessBaseAPIGetIterator for *handle*.

    The handle is wrapped as a void pointer before the call.
    """
    assert g_libtesseract
    get_iter_fn = g_libtesseract.TessBaseAPIGetIterator
    return get_iter_fn(ctypes.c_void_p(handle))


def result_iterator_get_page_iterator(res_iterator):
global g_libtesseract
assert(g_libtesseract)

return g_libtesseract.TessResultIteratorGetPageIterator(
Expand All @@ -583,6 +567,7 @@ def result_iterator_get_page_iterator(res_iterator):


def result_iterator_get_utf8_text(iterator, level):
assert(g_libtesseract)
ptr = g_libtesseract.TessResultIteratorGetUTF8Text(
ctypes.c_void_p(iterator), level
)
Expand All @@ -592,7 +577,9 @@ def result_iterator_get_utf8_text(iterator, level):
g_libtesseract.TessDeleteText(ptr)
return val


def result_iterator_get_confidence(iterator, level):
assert(g_libtesseract)
ptr = g_libtesseract.TessResultIteratorConfidence(
ctypes.c_void_p(iterator), level
)
Expand All @@ -601,8 +588,8 @@ def result_iterator_get_confidence(iterator, level):
val = ctypes.c_float(ptr).value
return val


def detect_os(handle):
global g_libtesseract
assert(g_libtesseract)

# Use the new API function if it is available, because since Tesseract
Expand Down Expand Up @@ -642,7 +629,6 @@ def detect_os(handle):


def set_input_name(handle, input_file):
global g_libtesseract
assert(g_libtesseract)

g_libtesseract.TessBaseAPISetInputName(
Expand All @@ -652,7 +638,6 @@ def set_input_name(handle, input_file):


def init_pdf_renderer(handle, output_file, textonly):
global g_libtesseract
assert(g_libtesseract)

tessdata_dir = g_libtesseract.TessBaseAPIGetDatapath(handle)
Expand All @@ -667,7 +652,6 @@ def init_pdf_renderer(handle, output_file, textonly):


def begin_document(renderer, doc_name):
global g_libtesseract
assert(g_libtesseract)

g_libtesseract.TessResultRendererBeginDocument(
Expand All @@ -677,7 +661,6 @@ def begin_document(renderer, doc_name):


def add_renderer_image(handle, renderer):
global g_libtesseract
assert(g_libtesseract)

g_libtesseract.TessResultRendererAddImage(
Expand All @@ -687,7 +670,6 @@ def add_renderer_image(handle, renderer):


def end_document(renderer):
global g_libtesseract
assert(g_libtesseract)

g_libtesseract.TessResultRendererEndDocument(
Expand Down
10 changes: 7 additions & 3 deletions src/pyocr/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,10 @@ def detect_orientation(image, lang=None):

original_output = original_output.decode("utf-8")
original_output = original_output.strip()

if "Could not initialize tesseract" in original_output:
raise TesseractError(-1, "Error initializing tesseract: %s" % original_output)

try:
output = original_output.split("\n")
output = [line.split(": ", 1) for line in output if (": " in line)]
Expand All @@ -214,9 +218,9 @@ def detect_orientation(image, lang=None):
'angle': angle,
'confidence': float(output['Orientation confidence']),
}
except:
raise TesseractError(-1, "No script found in image (%s)"
% original_output)
except Exception as ex:
raise TesseractError(-1, "No script found in image (%s - %s)"
% (ex.message, original_output))


def get_name():
Expand Down

0 comments on commit ce23c24

Please sign in to comment.