From c4ae4b2ca4d3a0416c628efa958559f4419dd989 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Fri, 28 Jul 2017 13:09:34 +0100
Subject: [PATCH 1/2] Explicitly encode output as UTF-8

I think I've covered all the cases we need to, but there could be more.

This is needed because, if the locale isn't set to UTF-8, an error of
this form will result:

  UnicodeEncodeError: 'ascii' codec can't encode character u'\u0113' in position 60: ordinal not in range(128)

While the ideal solution is for the user to set their locale to UTF-8,
it is better that we print debug output which may not be displayed
correctly than that we output a fatal (and non-obvious) error,
potentially some time into processing.

This also fixes some cases of implicitly combining str and *objs
together when printing debug output, which fails with some Python
versions, by explicitly using " ".join(objs).
---
 ocropus-gpageseg | 9 +++++++--
 ocropus-nlbin    | 9 +++++++--
 ocropus-rpred    | 8 ++++++--
 ocropus-rtrain   | 4 ++++
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/ocropus-gpageseg b/ocropus-gpageseg
index 88d069a8..9c379283 100755
--- a/ocropus-gpageseg
+++ b/ocropus-gpageseg
@@ -19,6 +19,7 @@ import os
 import os.path
 import sys
 import traceback
+import codecs
 from multiprocessing import Pool
 
 import numpy as np
@@ -31,6 +32,10 @@ from ocrolib import psegutils,morph,sl
 from ocrolib.exceptions import OcropusException
 from ocrolib.toplevel import *
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 parser = argparse.ArgumentParser(add_help=False)
 
 # error checking
@@ -131,11 +136,11 @@ def check_page(image):
 
 
 def print_info(*objs):
-    print("INFO: ", *objs, file=sys.stdout)
+    print("INFO: ", " ".join(objs))
 
 
 def print_error(*objs):
-    print("ERROR: ", *objs, file=sys.stderr)
+    print("ERROR: ", " ".join(objs), file=sys.stderr)
 
 
 if len(args.files)<1:
diff --git a/ocropus-nlbin b/ocropus-nlbin
index f5d55116..59905cbd 100755
--- a/ocropus-nlbin
+++ b/ocropus-nlbin
@@ -6,6 +6,7 @@ import argparse
 import os
 import multiprocessing
 import sys
+import codecs
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -14,6 +15,10 @@ from scipy import stats
 
 import ocrolib
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 
 parser = argparse.ArgumentParser("""
 Image binarization using non-linear processing.
@@ -51,11 +56,11 @@ if len(args.files)<1:
 
 
 def print_info(*objs):
-    print("INFO: ", *objs, file=sys.stdout)
+    print("INFO: ", " ".join(objs))
 
 
 def print_error(*objs):
-    print("ERROR: ", *objs, file=sys.stderr)
+    print("ERROR: ", " ".join(objs), file=sys.stderr)
 
 
 def check_page(image):
diff --git a/ocropus-rpred b/ocropus-rpred
index 863947fa..8ebbfb20 100755
--- a/ocropus-rpred
+++ b/ocropus-rpred
@@ -19,6 +19,10 @@ from ocrolib import lstm
 from ocrolib import edist
 from ocrolib.exceptions import FileNotFound, OcropusException
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 parser = argparse.ArgumentParser("apply an RNN recognizer")
 
 # error checking
@@ -72,11 +76,11 @@ args = parser.parse_args()
 
 
 def print_info(*objs):
-    print("INFO: ", *objs, file=sys.stdout)
+    print("INFO: ", " ".join(objs))
 
 
 def print_error(*objs):
-    print("ERROR: ", *objs, file=sys.stderr)
+    print("ERROR: ", " ".join(objs), file=sys.stderr)
 
 
 def check_line(image):
diff --git a/ocropus-rtrain b/ocropus-rtrain
index 9d365ae3..8f52d43b 100755
--- a/ocropus-rtrain
+++ b/ocropus-rtrain
@@ -8,6 +8,7 @@ import os.path
 import traceback
 import argparse
 import sys
+import codecs
 
 import numpy as np
 import matplotlib.pyplot as plt
@@ -16,6 +17,9 @@ import ocrolib
 import ocrolib.lstm as lstm
 from ocrolib import lineest
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+
 np.seterr(divide='raise',over='raise',invalid='raise',under='ignore')
 
 parser = argparse.ArgumentParser("train an RNN recognizer")

From 6c5784edf6eedfd287d956c37988e082eca4fd3e Mon Sep 17 00:00:00 2001
From: Nick White <nick@rescribe.xyz>
Date: Mon, 27 Nov 2017 14:01:33 +0000
Subject: [PATCH 2/2] Explicitly encode output as UTF-8 for ocropus-errs and
 ocropus-econf

I accidentally missed these from the original commit (c4ae4b).
---
 ocropus-econf | 4 ++++
 ocropus-errs  | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/ocropus-econf b/ocropus-econf
index 58b62909..cade65e7 100755
--- a/ocropus-econf
+++ b/ocropus-econf
@@ -16,6 +16,10 @@ import numpy as np
 import ocrolib
 from ocrolib import edist
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 # disable rank warnings from polyfit
 warnings.simplefilter('ignore',np.RankWarning)
 
diff --git a/ocropus-errs b/ocropus-errs
index 786a4ed8..8303a68b 100755
--- a/ocropus-errs
+++ b/ocropus-errs
@@ -8,10 +8,15 @@ import sys
 import os
 import os.path
 import multiprocessing
+import codecs
 
 import ocrolib
 from ocrolib import edist
 
+utf8writer = codecs.getwriter('utf8')
+sys.stdout = utf8writer(sys.stdout)
+sys.stderr = utf8writer(sys.stderr)
+
 parser = argparse.ArgumentParser(description = """
 Compute the edit distances between ground truth and recognizer output.
 Run with the ground truth files as arguments, and it will find the