Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/issue 8894 premis parsing #20

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions docs/examples.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
Example usage
=============

Parsing METS documents
----------------------

Example of listing the relative file paths of preservation files referenced in
a METS file::

import metsrw

mets = metsrw.METSDocument.fromfile('fixtures/complete_mets.xml')
for entry in mets.all_files():
if entry.use == 'preservation':
print entry.path

Example of retrieving a file by UUID::

import metsrw

mets = metsrw.METSDocument.fromfile('fixtures/complete_mets.xml')
entry = mets.get_file('46b7cb96-792c-4441-a5d6-67c83313501c')
print entry.path

Creating/modifying METS documents
---------------------------------

Example of creating a METS document (without PREMIS or Dublin Core metadata)::

import metsrw
import uuid

mw = metsrw.METSDocument()

# Create object entries
file1 = metsrw.FSEntry('objects/cat.png', file_uuid=str(uuid.uuid4()))
file2 = metsrw.FSEntry('objects/dog.jpg', file_uuid=str(uuid.uuid4()))

# Create preservation derivative entries
file1p = metsrw.FSEntry('objects/cat-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file1)
file2p = metsrw.FSEntry('objects/dog-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file2)

# Create object directory entry
objects = metsrw.FSEntry('objects', type='Directory', children=[file1, file2, file1p, file2p])

# Create metadata subdirectories then metadata directory entry
children = [
metsrw.FSEntry('transfers', type='Directory', children=[]),
metsrw.FSEntry('metadata/metadata.csv', use='metadata', file_uuid=str(uuid.uuid4())),
]
metadata = metsrw.FSEntry('metadata', type='Directory', children=children)

# Create submission METS entry and submission documentation parent directory entry
children = [
metsrw.FSEntry('submissionDocumentation/METS.xml', use='submissionDocumentation', file_uuid=str(uuid.uuid4())),
]
sub_doc = metsrw.FSEntry('submissionDocumentation', type='Directory', children=children)

# Create SIP entry containing objects, metadata, and submission documentation entries
children = [objects, metadata, sub_doc]
sip = metsrw.FSEntry('sipname-uuid', type='Directory', children=children)

# Add SIP entry to METS document and write to file
mw.append_file(sip)
mw.write('mets.xml', fully_qualified=True, pretty_print=True)
9,250 changes: 7,540 additions & 1,710 deletions fixtures/complete_mets.xml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
from .metadata import *
from .mets import *
from .utils import *
from . import premis
9 changes: 8 additions & 1 deletion metsrw/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,18 @@

All exceptions generated by this library will descend from MetsError.
"""


class MetsError(Exception):
    """ Base Exception for this module.

    Every exception raised by this library derives from this class, so
    callers can catch ``MetsError`` to handle any library failure.
    """
    pass


class ConstructError(MetsError):
    """ Error constructing an object.

    Derives from MetsError (and therefore still from Exception) so that it
    honours the module-level contract that all library exceptions descend
    from MetsError.
    """
    pass


class ParseError(MetsError):
    """ Error parsing a METS file or PREMIS element. """
    pass
5 changes: 5 additions & 0 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def __init__(self, path, label=None, use='original', type=u'Item', children=None
self.amdsecs = []
self.dmdsecs = []

# Convenient access to metadata (without cycling through amdsecs)
self.techmds = []
self.digiprovmds = []
self.rightsmds = []

def __str__(self):
    """ Return a short label of the form ``<type>: <path>``. """
    return '{entry.type}: {entry.path}'.format(entry=self)

Expand Down
75 changes: 73 additions & 2 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from __future__ import absolute_import

from collections import OrderedDict
import logging
from lxml import etree
from random import randint
Expand Down Expand Up @@ -280,6 +281,67 @@ def serialize(self):
return el


class DublinCoreXmlData(object):
    """
    An object representing a METS xmlData element containing a Dublin Core element.

    Every name in ``DC_ELEMENTS`` is both a constructor argument and an
    instance attribute. Arguments that are not supplied are stored as None.
    No other instance attributes are set by the constructor.

    :raises exceptions.ParseError: From :meth:`parse`, if the root element tag is not xmlData.
    """
    DC_ELEMENTS = ['title', 'creator', 'subject', 'description', 'publisher', 'contributor', 'date', 'format', 'identifier', 'source', 'relation', 'language', 'coverage', 'rights']

    def __init__(self, title=None, creator=None, subject=None, description=None, publisher=None, contributor=None, date=None, format=None, identifier=None, source=None, relation=None, language=None, coverage=None, rights=None):
        """
        Store each Dublin Core value on the instance under its element name.

        The parameters are listed in the same order as ``DC_ELEMENTS`` so the
        values may be passed positionally or by keyword.
        """
        # Pair names with values explicitly instead of introspecting locals(),
        # which would silently depend on every other local name in this scope.
        values = (title, creator, subject, description, publisher, contributor, date, format, identifier, source, relation, language, coverage, rights)
        for element, value in zip(self.DC_ELEMENTS, values):
            setattr(self, element, value)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a great use case for kwargs instead of accessing locals() directly. We could replace the list of parameters with **kwargs, which would allow us to accept an arbitrary number of inputs. kwargs is a dictionary. Then the loop could be setattr(self, element, kwargs[element])


@classmethod
def parse(cls, root):
    """
    Build an instance from an xmlData element wrapping a Dublin Core dublincore element.

    :param root: The xmlData element to parse; its ``tag`` is checked first.
    :returns: A new instance built from the ``findtext`` result for each
        name in DC_ELEMENTS, passed positionally in that order.
    :raises exceptions.ParseError: If the root is not xmlData or doesn't contain a dublincore element.
    """
    expected_root_tag = utils.lxmlns('mets') + 'xmlData'
    if root.tag != expected_root_tag:
        raise exceptions.ParseError('DublinCoreXmlData can only parse xmlData elements with mets namespace.')
    # NOTE(review): if Dublin Core support becomes a plugin (and therefore a
    # separate module), this should raise a DC-specific exception, possibly
    # one that subclasses ParseError.

    dublincore = root.find('dcterms:dublincore', namespaces=utils.NAMESPACES)

    has_dublincore = dublincore is not None and dublincore.tag == utils.lxmlns('dcterms') + 'dublincore'
    if not has_dublincore:
        raise exceptions.ParseError('xmlData can only contain a dublincore element with the dcterms namespace.')

    # Values are collected in DC_ELEMENTS order, which is also the order of
    # the constructor's positional parameters.
    values = [
        dublincore.findtext("dc:" + name, namespaces=utils.NAMESPACES)
        for name in DublinCoreXmlData.DC_ELEMENTS
    ]

    return cls(*values)

def serialize(self):
    """
    Return a mets xmlData element that wraps the serialized dublincore element.

    The xmlData element declares the mets, xsi and xlink namespaces, in that order.
    """
    prefixes = ('mets', 'xsi', 'xlink')
    nsmap = OrderedDict((prefix, utils.NAMESPACES[prefix]) for prefix in prefixes)
    xml_data = etree.Element(utils.lxmlns('mets') + 'xmlData', nsmap=nsmap)
    xml_data.append(self._serialize_dublincore())
    return xml_data

def _serialize_dublincore(self):
    """
    Return a dcterms dublincore element with one dc child per name in DC_ELEMENTS.

    A child is emitted for every name, in DC_ELEMENTS order. An attribute
    that is None produces a child with no text.
    """
    nsmap = OrderedDict((prefix, utils.NAMESPACES[prefix]) for prefix in ('dcterms', 'dc'))
    schema_location_key = '{}schemaLocation'.format(utils.lxmlns('xsi'))
    dublincore = etree.Element(
        utils.lxmlns('dcterms') + 'dublincore',
        nsmap=nsmap,
        attrib={schema_location_key: utils.DUBLINCORE_SCHEMA_LOCATIONS},
    )

    for name in DublinCoreXmlData.DC_ELEMENTS:
        child = etree.Element(utils.lxmlns('dc') + name)
        child.text = getattr(self, name)
        dublincore.append(child)

    return dublincore


class MDWrap(object):
"""
An object representing an XML document enclosed in a METS document.
Expand All @@ -291,13 +353,16 @@ class MDWrap(object):
:param str mdtype: The MDTYPE of XML document being enclosed. Examples
include "PREMIS:OBJECT" and "PREMIS:EVENT".
"""
def __init__(self, document, mdtype):
MDTYPE_CLASSES = {'DC': DublinCoreXmlData}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this! It will have to be improved to talk to a plugin interface though.


def __init__(self, document, mdtype, data=None):
    """
    Store the wrapped document, its MDTYPE and optional parsed data.

    :param document: A string of XML, which is parsed with blank text
        removed, or an lxml element, which is stored as-is. For any other
        type ``self.document`` is left unset.
    :param str mdtype: The MDTYPE of the enclosed document.
    :param data: Optional extra data kept alongside the document; stored unchanged.
    """
    if isinstance(document, six.string_types):
        blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
        self.document = etree.fromstring(document, parser=blank_stripping_parser)
    elif isinstance(document, etree._Element):
        self.document = document
    self.mdtype = mdtype
    self.data = data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than creating a new attribute, this could use document to store the child document - whether that's a string, ElementTree, or plugin class of the appropriate type.


@classmethod
def parse(cls, root):
Expand All @@ -313,11 +378,17 @@ def parse(cls, root):
mdtype = root.get('MDTYPE')
if not mdtype:
raise exceptions.ParseError('mdWrap must have a MDTYPE')
if mdtype in MDWrap.MDTYPE_CLASSES.keys():
mdtype_class = MDWrap.MDTYPE_CLASSES[mdtype]()
data = mdtype_class.parse(root.find('mets:xmlData', namespaces=utils.NAMESPACES)).__dict__
else:
data = None

document = root.xpath('mets:xmlData/*', namespaces=utils.NAMESPACES)
if len(document) != 1:
raise exceptions.ParseError('mdWrap and xmlData can only have one child')
document = document[0]
return cls(document, mdtype)
return cls(document, mdtype, data)

def serialize(self):
el = etree.Element(utils.lxmlns('mets') + 'mdWrap', MDTYPE=self.mdtype)
Expand Down
15 changes: 10 additions & 5 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import

from collections import OrderedDict
from datetime import datetime
import logging
from lxml import etree
Expand Down Expand Up @@ -108,17 +109,17 @@ def _document_root(self, fully_qualified=True):
"""
Return the mets Element for the document root.
"""
nsmap = {
'xsi': utils.NAMESPACES['xsi'],
'xlink': utils.NAMESPACES['xlink']
}
nsmap = OrderedDict([
('xsi', utils.NAMESPACES['xsi']),
('xlink', utils.NAMESPACES['xlink'])
])
if fully_qualified:
nsmap['mets'] = utils.NAMESPACES['mets']
else:
nsmap[None] = utils.NAMESPACES['mets']
attrib = {
'{}schemaLocation'.format(utils.lxmlns('xsi')):
utils.SCHEMA_LOCATIONS
utils.METS_SCHEMA_LOCATIONS
}
return etree.Element(utils.lxmlns('mets') + 'mets', nsmap=nsmap, attrib=attrib)

Expand Down Expand Up @@ -285,6 +286,10 @@ def _parse_tree_structmap(self, tree, parent_elem):
amdsec = metadata.AMDSec.parse(amdsec_elem)
fs_entry.amdsecs.append(amdsec)

# Add subsections to convenience properties
for subsection in amdsec.subsections:
getattr(fs_entry, subsection.subsection.lower() + 's').append(subsection)

siblings.append(fs_entry)
return siblings

Expand Down
8 changes: 8 additions & 0 deletions metsrw/premis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from __future__ import absolute_import

import logging
# Package-level logger named after this package's import path.
LOGGER = logging.getLogger(__name__)
# Attach a do-nothing handler so that importing the package configures no
# real log output itself; applications add their own handlers if they want it.
LOGGER.addHandler(logging.NullHandler())

from .object import *
from .event import *
Loading