Explicitly encode output as UTF-8

I think I've covered all the cases we need to, but there could be more. This change is needed because, if the user's locale isn't set to UTF-8, printing non-ASCII text fails with an error of this form: "UnicodeEncodeError: 'ascii' codec can't encode character u'\u0113' in position 60: ordinal not in range(128)". While the ideal solution is for users to set their locale to UTF-8, it is better to print debug output that may not be displayed correctly than to raise a fatal (and non-obvious) error, potentially some time into processing. This also fixes some cases where a str and *objs were implicitly combined when printing debug output, which fails with some Python versions, by explicitly using " ".join(objs) instead.
ocropus-archive · Oct 24, 2017 · 2b97ad9 · 2b97ad9
1 parent ebd8235
commit 2b97ad9
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 6 deletions.
diff --git a/ocropus-gpageseg b/ocropus-gpageseg
@@ -19,6 +19,7 @@ import os
 import os.path
 import sys
 import traceback
+import codecs
 from multiprocessing import Pool
 
 import numpy as np
@@ -31,6 +32,10 @@ from ocrolib import psegutils,morph,sl
 from ocrolib.exceptions import OcropusException
 from ocrolib.toplevel import *
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 parser = argparse.ArgumentParser(add_help=False)
 
 # error checking
@@ -131,11 +136,11 @@ def check_page(image):
 
 
 def print_info(*objs):
-    print("INFO: ", *objs, file=sys.stdout)
+    print("INFO: ", " ".join(objs))
 
 
 def print_error(*objs):
-    print("ERROR: ", *objs, file=sys.stderr)
+    print("ERROR: ", " ".join(objs), file=sys.stderr)
 
 
 if len(args.files)<1:

diff --git a/ocropus-nlbin b/ocropus-nlbin
@@ -6,6 +6,7 @@ import argparse
 import os
 import multiprocessing
 import sys
+import codecs
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -14,6 +15,10 @@ from scipy import stats
 
 import ocrolib
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 
 parser = argparse.ArgumentParser("""
 Image binarization using non-linear processing.
@@ -51,11 +56,11 @@ if len(args.files)<1:
 
 
 def print_info(*objs):
-    print("INFO: ", *objs, file=sys.stdout)
+    print("INFO: ", " ".join(objs))
 
 
 def print_error(*objs):
-    print("ERROR: ", *objs, file=sys.stderr)
+    print("ERROR: ", " ".join(objs), file=sys.stderr)
 
 
 def check_page(image):

diff --git a/ocropus-rpred b/ocropus-rpred
@@ -2,6 +2,7 @@
 
 from __future__ import print_function
 
+import sys
 import traceback
 import codecs
 import os.path
@@ -19,6 +20,10 @@ from ocrolib import lstm
 from ocrolib import edist
 from ocrolib.exceptions import FileNotFound, OcropusException
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 parser = argparse.ArgumentParser("apply an RNN recognizer")
 
 # error checking
@@ -72,11 +77,11 @@ args = parser.parse_args()
 
 
 def print_info(*objs):
-    print("INFO: ", *objs, file=sys.stdout)
+    print("INFO: ", " ".join(objs))
 
 
 def print_error(*objs):
-    print("ERROR: ", *objs, file=sys.stderr)
+    print("ERROR: ", " ".join(objs), file=sys.stderr)
 
 
 def check_line(image):

diff --git a/ocropus-rtrain b/ocropus-rtrain
@@ -8,6 +8,7 @@ import os.path
 import traceback
 import argparse
 import sys
+import codecs
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -16,6 +17,9 @@ import ocrolib
 import ocrolib.lstm as lstm
 from ocrolib import lineest
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+
 np.seterr(divide='raise',over='raise',invalid='raise',under='ignore')
 
 parser = argparse.ArgumentParser("train an RNN recognizer")