From c4ae4b2ca4d3a0416c628efa958559f4419dd989 Mon Sep 17 00:00:00 2001 From: Nick White Date: Fri, 28 Jul 2017 13:09:34 +0100 Subject: [PATCH 1/2] Explicitly encode output as UTF-8 I think I've covered all the cases we need to, but there could be more. This is needed as if a locale isn't set to UTF-8, an error will result of this form: UnicodeEncodeError: 'ascii' codec can't encode character u'\u0113' in position 60: ordinal not in range(128) While the ideal solution is for the user to set their locale to UTF-8, it is better that we print debug output which may not be displayed correctly than that we output a fatal (and non-obvious) error, potentially some time into processing. This also fixes some cases of implicitly combining str and *obj together when printing debug output, which fails with some Python versions, by explicitly using str.join(obj). --- ocropus-gpageseg | 9 +++++++-- ocropus-nlbin | 9 +++++++-- ocropus-rpred | 8 ++++++-- ocropus-rtrain | 4 ++++ 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/ocropus-gpageseg b/ocropus-gpageseg index 88d069a8..9c379283 100755 --- a/ocropus-gpageseg +++ b/ocropus-gpageseg @@ -19,6 +19,7 @@ import os import os.path import sys import traceback +import codecs from multiprocessing import Pool import numpy as np @@ -31,6 +32,10 @@ from ocrolib import psegutils,morph,sl from ocrolib.exceptions import OcropusException from ocrolib.toplevel import * +utf8writer = codecs.getwriter('utf8') +sys.stdout = utf8writer(sys.stdout) +sys.stderr = utf8writer(sys.stderr) + parser = argparse.ArgumentParser(add_help=False) # error checking @@ -131,11 +136,11 @@ def check_page(image): def print_info(*objs): - print("INFO: ", *objs, file=sys.stdout) + print("INFO: ", " ".join(objs)) def print_error(*objs): - print("ERROR: ", *objs, file=sys.stderr) + print("ERROR: ", " ".join(objs), file=sys.stderr) if len(args.files)<1: diff --git a/ocropus-nlbin b/ocropus-nlbin index f5d55116..59905cbd 100755 --- a/ocropus-nlbin +++ b/ocropus-nlbin @@ -6,6 +6,7 @@ import argparse import os import multiprocessing import sys +import codecs import numpy as np import matplotlib.pyplot as plt @@ -14,6 +15,10 @@ from scipy import stats import ocrolib +utf8writer = codecs.getwriter('utf8') +sys.stdout = utf8writer(sys.stdout) +sys.stderr = utf8writer(sys.stderr) + parser = argparse.ArgumentParser(""" Image binarization using non-linear processing. @@ -51,11 +56,11 @@ if len(args.files)<1: def print_info(*objs): - print("INFO: ", *objs, file=sys.stdout) + print("INFO: ", " ".join(objs)) def print_error(*objs): - print("ERROR: ", *objs, file=sys.stderr) + print("ERROR: ", " ".join(objs), file=sys.stderr) def check_page(image): diff --git a/ocropus-rpred b/ocropus-rpred index 863947fa..8ebbfb20 100755 --- a/ocropus-rpred +++ b/ocropus-rpred @@ -19,6 +19,10 @@ from ocrolib import lstm from ocrolib import edist from ocrolib.exceptions import FileNotFound, OcropusException +utf8writer = codecs.getwriter('utf8') +sys.stdout = utf8writer(sys.stdout) +sys.stderr = utf8writer(sys.stderr) + parser = argparse.ArgumentParser("apply an RNN recognizer") # error checking @@ -72,11 +76,11 @@ args = parser.parse_args() def print_info(*objs): - print("INFO: ", *objs, file=sys.stdout) + print("INFO: ", " ".join(objs)) def print_error(*objs): - print("ERROR: ", *objs, file=sys.stderr) + print("ERROR: ", " ".join(objs), file=sys.stderr) def check_line(image): diff --git a/ocropus-rtrain b/ocropus-rtrain index 9d365ae3..8f52d43b 100755 --- a/ocropus-rtrain +++ b/ocropus-rtrain @@ -8,6 +8,7 @@ import os.path import traceback import argparse import sys +import codecs import numpy as np import matplotlib.pyplot as plt @@ -16,6 +17,9 @@ import ocrolib import ocrolib.lstm as lstm from ocrolib import lineest +utf8writer = codecs.getwriter('utf8') +sys.stdout = utf8writer(sys.stdout) + np.seterr(divide='raise',over='raise',invalid='raise',under='ignore') parser = argparse.ArgumentParser("train an RNN recognizer") From 6c5784edf6eedfd287d956c37988e082eca4fd3e Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 27 Nov 2017 14:01:33 +0000 Subject: [PATCH 2/2] Explicitly encode output as UTF-8 for ocropus-errs and ocropus-econf I accidentally missed these from the original commit (c4ae4b). --- ocropus-econf | 4 ++++ ocropus-errs | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/ocropus-econf b/ocropus-econf index 58b62909..cade65e7 100755 --- a/ocropus-econf +++ b/ocropus-econf @@ -16,6 +16,10 @@ import numpy as np import ocrolib from ocrolib import edist +utf8writer = codecs.getwriter('utf8') +sys.stdout = utf8writer(sys.stdout) +sys.stderr = utf8writer(sys.stderr) + # disable rank warnings from polyfit warnings.simplefilter('ignore',np.RankWarning) diff --git a/ocropus-errs b/ocropus-errs index 786a4ed8..8303a68b 100755 --- a/ocropus-errs +++ b/ocropus-errs @@ -8,10 +8,15 @@ import sys import os import os.path import multiprocessing +import codecs import ocrolib from ocrolib import edist +utf8writer = codecs.getwriter('utf8') +sys.stdout = utf8writer(sys.stdout) +sys.stderr = utf8writer(sys.stderr) + parser = argparse.ArgumentParser(description = """ Compute the edit distances between ground truth and recognizer output. Run with the ground truth files as arguments, and it will find the