Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/issue 8894 premis parsing #20

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
from .metadata import *
from .mets import *
from .utils import *
from . import premis
9 changes: 8 additions & 1 deletion metsrw/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,18 @@

All exceptions generated by this library will descend from MetsError.
"""


class MetsError(Exception):
    """Base exception for this module."""


# Derives from MetsError (not Exception directly) so that the module-level
# promise "all exceptions generated by this library will descend from
# MetsError" holds.  MetsError is itself an Exception, so existing
# ``except Exception`` handlers keep working.
class ConstructError(MetsError):
    """Error constructing an object."""


class ParseError(MetsError):
    """Error parsing a METS file or PREMIS element."""
5 changes: 5 additions & 0 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def __init__(self, path, label=None, use='original', type=u'Item', children=None
self.amdsecs = []
self.dmdsecs = []

# Convenient access to metadata (without cycling through amdsecs)
self.techmds = []
self.digiprovmds = []
self.rightsmds = []

def __str__(self):
return '{s.type}: {s.path}'.format(s=self)

Expand Down
67 changes: 65 additions & 2 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,60 @@ def serialize(self):
return el


class DublinCoreXmlData(object):
    """
    An object representing a METS xmlData element containing a Dublin Core element.

    Every name in ``DC_ELEMENTS`` is exposed as an instance attribute of the
    same name.  Elements that were not supplied are stored as ``None``.
    """
    DC_ELEMENTS = ['title', 'creator', 'subject', 'description', 'publisher', 'contributor', 'date', 'format', 'identifier', 'source', 'relation', 'language', 'coverage', 'rights']

    def __init__(self, title=None, creator=None, subject=None, description=None, publisher=None, contributor=None, date=None, format=None, identifier=None, source=None, relation=None, language=None, coverage=None, rights=None):
        """
        Store each Dublin Core value on the instance.

        All parameters are optional and default to ``None``; each one is
        stored as the attribute of the same name.
        """
        # Explicit mapping instead of reading locals(): the set of accepted
        # names is visible here and cannot pick up unrelated local variables.
        # NOTE(review): a reviewer suggested replacing the parameter list with
        # **kwargs.  The explicit signature is kept so positional callers
        # keep working.
        supplied = {
            'title': title,
            'creator': creator,
            'subject': subject,
            'description': description,
            'publisher': publisher,
            'contributor': contributor,
            'date': date,
            'format': format,
            'identifier': identifier,
            'source': source,
            'relation': relation,
            'language': language,
            'coverage': coverage,
            'rights': rights,
        }
        for element in self.DC_ELEMENTS:
            setattr(self, element, supplied[element])

    @classmethod
    def parse(cls, root):
        """
        Parse an xmlData element containing a Dublin Core dublincore element.

        :param root: xmlData element to be parsed into an object.
        :returns: A new instance of ``cls`` populated from the dc:* children.
        :raises exceptions.ParseError: If the root is not xmlData or doesn't contain a dublincore element.
        """
        # NOTE(review): if Dublin Core support becomes a plugin and therefore
        # a separate module, this should raise a DC-specific exception,
        # possibly one that subclasses ParseError.
        if root.tag != utils.lxmlns('mets') + 'xmlData':
            raise exceptions.ParseError('DublinCoreXmlData can only parse xmlData elements with mets namespace.')

        dc_el = root.find('dcterms:dublincore', namespaces=utils.NAMESPACES)

        if dc_el is None or dc_el.tag != utils.lxmlns('dcterms') + 'dublincore':
            raise exceptions.ParseError('xmlData can only contain a dublincore element with the dcterms namespace.')

        # Pass values by keyword so the result does not depend on DC_ELEMENTS
        # being listed in the same order as the __init__ parameters.
        values = {
            element: dc_el.findtext('dc:' + element, namespaces=utils.NAMESPACES)
            for element in cls.DC_ELEMENTS
        }
        return cls(**values)

    def serialize(self):
        """
        Return a mets:xmlData element wrapping the serialized dublincore element.
        """
        # NOTE(review): review feedback says MDWrap already creates the
        # xmlData element when serializing, so this wrapper could be merged
        # with _serialize_dublincore, leaving this class to handle only
        # elements in its own namespace.
        nsmap = {'mets': utils.NAMESPACES['mets'], 'xsi': utils.NAMESPACES['xsi'], 'xlink': utils.NAMESPACES['xlink']}
        root = etree.Element(utils.lxmlns('mets') + 'xmlData', nsmap=nsmap)
        root.append(self._serialize_dublincore())
        return root

    def _serialize_dublincore(self):
        """
        Return a dcterms:dublincore element with one dc:* child per entry in DC_ELEMENTS.
        """
        nsmap = {'dcterms': utils.NAMESPACES['dcterms'], 'dc': utils.NAMESPACES['dc']}
        attrib = {'{}schemaLocation'.format(utils.lxmlns('xsi')): utils.DUBLINCORE_SCHEMA_LOCATIONS}
        dc_root = etree.Element(utils.lxmlns('dcterms') + 'dublincore', nsmap=nsmap, attrib=attrib)

        # Every name in DC_ELEMENTS gets a child element, including those
        # whose stored value is None.
        for element in self.DC_ELEMENTS:
            dc_el = etree.Element(utils.lxmlns('dc') + element)
            dc_el.text = getattr(self, element)
            dc_root.append(dc_el)

        return dc_root


class MDWrap(object):
"""
An object representing an XML document enclosed in a METS document.
Expand All @@ -291,13 +345,16 @@ class MDWrap(object):
:param str mdtype: The MDTYPE of XML document being enclosed. Examples
include "PREMIS:OBJECT" and "PREMIS:EVENT".
"""
def __init__(self, document, mdtype):
MDTYPE_CLASSES = {'DC': DublinCoreXmlData}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this! It will have to be improved to talk to a plugin interface though.


def __init__(self, document, mdtype, data=None):
parser = etree.XMLParser(remove_blank_text=True)
if isinstance(document, six.string_types):
self.document = etree.fromstring(document, parser=parser)
elif isinstance(document, etree._Element):
self.document = document
self.mdtype = mdtype
self.data = data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than creating a new attribute, this could use document to store the child document - whether that's a string, ElementTree, or plugin class of the appropriate type.


@classmethod
def parse(cls, root):
Expand All @@ -313,11 +370,17 @@ def parse(cls, root):
mdtype = root.get('MDTYPE')
if not mdtype:
raise exceptions.ParseError('mdWrap must have a MDTYPE')
if mdtype in MDWrap.MDTYPE_CLASSES.keys():
mdtype_class = MDWrap.MDTYPE_CLASSES[mdtype]()
data = mdtype_class.parse(root.find('mets:xmlData', namespaces=utils.NAMESPACES)).__dict__
else:
data = None

document = root.xpath('mets:xmlData/*', namespaces=utils.NAMESPACES)
if len(document) != 1:
raise exceptions.ParseError('mdWrap and xmlData can only have one child')
document = document[0]
return cls(document, mdtype)
return cls(document, mdtype, data)

def serialize(self):
el = etree.Element(utils.lxmlns('mets') + 'mdWrap', MDTYPE=self.mdtype)
Expand Down
6 changes: 5 additions & 1 deletion metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def _document_root(self, fully_qualified=True):
nsmap[None] = utils.NAMESPACES['mets']
attrib = {
'{}schemaLocation'.format(utils.lxmlns('xsi')):
utils.SCHEMA_LOCATIONS
utils.METS_SCHEMA_LOCATIONS
}
return etree.Element(utils.lxmlns('mets') + 'mets', nsmap=nsmap, attrib=attrib)

Expand Down Expand Up @@ -285,6 +285,10 @@ def _parse_tree_structmap(self, tree, parent_elem):
amdsec = metadata.AMDSec.parse(amdsec_elem)
fs_entry.amdsecs.append(amdsec)

# Add subsections to convenience properties
for subsection in amdsec.subsections:
getattr(fs_entry, subsection.subsection.lower() + 's').append(subsection)

siblings.append(fs_entry)
return siblings

Expand Down
Loading