Skip to content

Commit

Permalink
ENH: be resilient to encoding errors in README
Browse files Browse the repository at this point in the history
  • Loading branch information
yarikoptic committed Nov 25, 2016
1 parent 96811aa commit 1679032
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 8 deletions.
16 changes: 15 additions & 1 deletion datalad/metadata/parsers/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""BIDS metadata parser (http://bids.neuroimaging.io)"""

from io import open
from os.path import join as opj, exists
from datalad.support.json_py import load as jsonload
from datalad.dochelpers import exc_str
from datalad.metadata.parsers.base import BaseMetadataParser

import logging
lgr = logging.getLogger('datalad.meta.bids')

class MetadataParser(BaseMetadataParser):
_core_metadata_filenames = ['dataset_description.json']
Expand All @@ -36,7 +40,17 @@ def _get_metadata(self, ds_identifier, meta, full):
# BIDS uses README to provide the description, so if a description was not
# explicitly provided (to possibly override a longer README), let's just
# load README
meta['description'] = open(README_fname).read().strip()
try:
desc = open(README_fname, encoding="utf-8").read()
except UnicodeDecodeError as exc:
lgr.warning(
"Failed to decode content of %s. "
"Re-loading allowing for UTF-8 errors with replacement: %s"
% (README_fname, exc_str(exc))
)
desc = open(README_fname, encoding="utf-8", errors="replace").read()

meta['description'] = desc.strip()

compliance = ["http://docs.datalad.org/metadata.html#v0-1"]

Expand Down
16 changes: 9 additions & 7 deletions datalad/metadata/parsers/tests/test_bids.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil; coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
Expand Down Expand Up @@ -112,22 +112,24 @@ def test_get_metadata_with_description_and_README(path):
}""")


# NOTE: this test does not actually demonstrate the problem with unicode
# encountered in https://github.com/datalad/datalad/issues/1138
@with_tree(tree={'dataset_description.json': """
{
"Name": "test"
}
""",
'README': """
'README': u"""
A very detailed
description
description с юникодом
"""})
def test_get_metadata_with_README(path):

ds = Dataset(path)
meta = MetadataParser(ds).get_metadata('ID')
dump = dumps(meta, sort_keys=True, indent=2, ensure_ascii=False)
assert_equal(
dumps(meta, sort_keys=True, indent=2),
"""\
dump,
u"""\
{
"@context": {
"@vocab": "http://schema.org/",
Expand All @@ -138,6 +140,6 @@ def test_get_metadata_with_README(path):
"http://docs.datalad.org/metadata.html#v0-1",
"http://bids.neuroimaging.io"
],
"description": "A very detailed\\ndescription",
"description": "A very detailed\\ndescription с юникодом",
"name": "test"
}""")

0 comments on commit 1679032

Please sign in to comment.