ENH: be resilient to encoding errors in README

caused by poldracklab/open_fmri#39
yarikoptic · Nov 25, 2016 · 1679032 · 1679032
1 parent 96811aa
commit 1679032
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 8 deletions.
diff --git a/datalad/metadata/parsers/bids.py b/datalad/metadata/parsers/bids.py
@@ -8,10 +8,14 @@
 # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
 """BIDS metadata parser (http://bids.neuroimaging.io)"""
 
+from io import open
 from os.path import join as opj, exists
 from datalad.support.json_py import load as jsonload
+from datalad.dochelpers import exc_str
 from datalad.metadata.parsers.base import BaseMetadataParser
 
+import logging
+lgr = logging.getLogger('datalad.meta.bids')
 
 class MetadataParser(BaseMetadataParser):
     _core_metadata_filenames = ['dataset_description.json']
@@ -36,7 +40,17 @@ def _get_metadata(self, ds_identifier, meta, full):
             # BIDS uses README to provide description, so if was not
             # explicitly provided to possibly override longer README, let's just
             # load README
-            meta['description'] = open(README_fname).read().strip()
+            try:
+                desc = open(README_fname, encoding="utf-8").read()
+            except UnicodeDecodeError as exc:
+                lgr.warning(
+                    "Failed to decode content of %s. "
+                    "Re-loading allowing for UTF-8 errors with replacement: %s"
+                    % (README_fname, exc_str(exc))
+                )
+                desc = open(README_fname, encoding="utf-8", errors="replace").read()
+
+            meta['description'] = desc.strip()
 
         compliance = ["http://docs.datalad.org/metadata.html#v0-1"]
 

diff --git a/datalad/metadata/parsers/tests/test_bids.py b/datalad/metadata/parsers/tests/test_bids.py
@@ -1,4 +1,4 @@
-# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
+# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil; coding: utf-8 -*-
 # ex: set sts=4 ts=4 sw=4 noet:
 # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
 #
@@ -112,22 +112,24 @@ def test_get_metadata_with_description_and_README(path):
 }""")
 
 
+# actually does not demonstrate problem with unicode encountered in
+# https://github.com/datalad/datalad/issues/1138
 @with_tree(tree={'dataset_description.json': """
 {
     "Name": "test"
 }
 """,
-                 'README': """
+                 'README': u"""
 A very detailed
-description
+description с юникодом
 """})
 def test_get_metadata_with_README(path):
-
     ds = Dataset(path)
     meta = MetadataParser(ds).get_metadata('ID')
+    dump = dumps(meta, sort_keys=True, indent=2, ensure_ascii=False)
     assert_equal(
-        dumps(meta, sort_keys=True, indent=2),
-        """\
+        dump,
+        u"""\
 {
   "@context": {
     "@vocab": "http://schema.org/",
@@ -138,6 +140,6 @@ def test_get_metadata_with_README(path):
     "http://docs.datalad.org/metadata.html#v0-1",
     "http://bids.neuroimaging.io"
   ],
-  "description": "A very detailed\\ndescription",
+  "description": "A very detailed\\ndescription с юникодом",
   "name": "test"
 }""")