diff --git a/datalad/metadata/parsers/bids.py b/datalad/metadata/parsers/bids.py
index 08131628b4..b7a6a4387a 100644
--- a/datalad/metadata/parsers/bids.py
+++ b/datalad/metadata/parsers/bids.py
@@ -8,10 +8,14 @@
 # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
 """BIDS metadata parser (http://bids.neuroimaging.io)"""
 
+from io import open
 from os.path import join as opj, exists
 from datalad.support.json_py import load as jsonload
+from datalad.dochelpers import exc_str
 from datalad.metadata.parsers.base import BaseMetadataParser
 
+import logging
+lgr = logging.getLogger('datalad.meta.bids')
 
 class MetadataParser(BaseMetadataParser):
     _core_metadata_filenames = ['dataset_description.json']
@@ -36,7 +40,22 @@ def _get_metadata(self, ds_identifier, meta, full):
             # BIDS uses README to provide description, so if was not
             # explicitly provided to possibly override longer README, let's just
             # load README
-            meta['description'] = open(README_fname).read().strip()
+            try:
+                # decode strictly first, so a README that is not valid UTF-8
+                # gets reported instead of being silently altered
+                with open(README_fname, encoding="utf-8") as f:
+                    desc = f.read()
+            except UnicodeDecodeError as exc:
+                lgr.warning(
+                    "Failed to decode content of %s. "
+                    "Re-loading allowing for UTF-8 errors with replacement: %s",
+                    README_fname, exc_str(exc)
+                )
+                with open(README_fname, encoding="utf-8",
+                          errors="replace") as f:
+                    desc = f.read()
+
+            meta['description'] = desc.strip()
 
         compliance = ["http://docs.datalad.org/metadata.html#v0-1"]
 
diff --git a/datalad/metadata/parsers/tests/test_bids.py b/datalad/metadata/parsers/tests/test_bids.py
index f17ba6804e..2e93fb5e3d 100644
--- a/datalad/metadata/parsers/tests/test_bids.py
+++ b/datalad/metadata/parsers/tests/test_bids.py
@@ -1,4 +1,4 @@
-# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
+# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil; coding: utf-8 -*-
 # ex: set sts=4 ts=4 sw=4 noet:
 # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
 #
@@ -112,22 +112,24 @@ def test_get_metadata_with_description_and_README(path):
 }""")
 
 
+# actually does not demonstrate problem with unicode encountered in
+# https://github.com/datalad/datalad/issues/1138
 @with_tree(tree={'dataset_description.json': """
 {
     "Name": "test"
 }
 """,
-                 'README': """
+                 'README': u"""
 A very detailed
-description
+description с юникодом
 """})
 def test_get_metadata_with_README(path):
-
     ds = Dataset(path)
     meta = MetadataParser(ds).get_metadata('ID')
+    dump = dumps(meta, sort_keys=True, indent=2, ensure_ascii=False)
     assert_equal(
-        dumps(meta, sort_keys=True, indent=2),
-        """\
+        dump,
+        u"""\
 {
   "@context": {
     "@vocab": "http://schema.org/",
@@ -138,6 +140,6 @@ def test_get_metadata_with_README(path):
     "http://docs.datalad.org/metadata.html#v0-1",
     "http://bids.neuroimaging.io"
   ],
-  "description": "A very detailed\\ndescription",
+  "description": "A very detailed\\ndescription с юникодом",
   "name": "test"
 }""")