Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add yapremisrw2, yet another PREMIS reader/writer plugin #34

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pip-delete-this-directory.txt
.tox/
.coverage
.cache
htmlcov
nosetests.xml
coverage.xml

Expand Down
65 changes: 65 additions & 0 deletions docs/examples.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
Example usage
=============

Parsing METS documents
----------------------

Example of listing the relative file paths of preservation files referenced in
a METS file::

import metsrw

mets = metsrw.METSDocument.fromfile('fixtures/complete_mets_2.xml')
for entry in mets.all_files():
if entry.use == 'preservation':
print entry.path

Example of retrieving a file by UUID::

import metsrw

mets = metsrw.METSDocument.fromfile('fixtures/complete_mets_2.xml')
entry = mets.get_file('46b7cb96-792c-4441-a5d6-67c83313501c')
print entry.path

Creating/modifying METS documents
---------------------------------

Example creation of a METS document (without PREMIS or Dublin Core metadata)::

import metsrw
import uuid

mw = metsrw.METSDocument()

# Create object entries
file1 = metsrw.FSEntry('objects/cat.png', file_uuid=str(uuid.uuid4()))
file2 = metsrw.FSEntry('objects/dog.jpg', file_uuid=str(uuid.uuid4()))

# Create preservation derivative entries
file1p = metsrw.FSEntry('objects/cat-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file1)
file2p = metsrw.FSEntry('objects/dog-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file2)

# Create object directory entry
objects = metsrw.FSEntry('objects', type='Directory', children=[file1, file2, file1p, file2p])

# Create metadata subdirectories then metadata directory entry
children = [
metsrw.FSEntry('transfers', type='Directory', children=[]),
metsrw.FSEntry('metadata/metadata.csv', use='metadata', file_uuid=str(uuid.uuid4())),
]
metadata = metsrw.FSEntry('metadata', type='Directory', children=children)

# Create submission METS entry and submission documentation parent directory entry
children = [
metsrw.FSEntry('submissionDocumentation/METS.xml', use='submissionDocumentation', file_uuid=str(uuid.uuid4())),
]
sub_doc = metsrw.FSEntry('submissionDocumentation', type='Directory', children=children)

# Create SIP entry containing objects, metadata, and submission documentation entries
children = [objects, metadata, sub_doc]
sip = metsrw.FSEntry('sipname-uuid', type='Directory', children=children)

# Add SIP entry to METS document and write to file
mw.append_file(sip)
mw.write('mets.xml', fully_qualified=True, pretty_print=True)
7,541 changes: 7,541 additions & 0 deletions fixtures/complete_mets_2.xml

Large diffs are not rendered by default.

14 changes: 8 additions & 6 deletions metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
AM_PNTR_SCT_PATH,
get_schematron,
validate,
get_file_path,
get_xmlschema,
xsd_validate,
schematron_validate,
Expand All @@ -40,14 +41,15 @@

LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.NullHandler())
__version__ = '0.2.0'
__version__ = '0.2.1'

__all__ = ['MetsError', 'ParseError', 'FSEntry', 'AMDSec', 'SubSection',
'MDRef', 'MDWrap', 'METSDocument', 'NAMESPACES', 'SCHEMA_LOCATIONS',
'lxmlns', 'FILE_ID_PREFIX', 'GROUP_ID_PREFIX', 'METS_XSD_PATH',
'AM_SCT_PATH', 'AM_PNTR_SCT_PATH', 'get_schematron', 'validate',
'get_xmlschema', 'xsd_validate', 'schematron_validate',
'sct_report_string', 'xsd_error_log_string', 'report_string',
'FeatureBroker', 'set_feature_broker_to_default_state',
'feature_broker', 'Dependency', 'has_class_methods', 'has_methods',
'is_class', 'plugins', '__version__']
'get_file_path', 'get_xmlschema', 'xsd_validate',
'schematron_validate', 'sct_report_string', 'xsd_error_log_string',
'report_string', 'FeatureBroker',
'set_feature_broker_to_default_state', 'feature_broker',
'Dependency', 'has_class_methods', 'has_methods', 'is_class',
'plugins', '__version__']
16 changes: 15 additions & 1 deletion metsrw/di.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
See http://code.activestate.com/recipes/413268/
"""

from .plugins import premisrw
from .plugins import (
premisrw,
dcrw
)


class FeatureBroker(object):
Expand Down Expand Up @@ -62,10 +65,21 @@ def __getitem__(self, feature_name):


def set_feature_broker_to_default_state(fb):
    """Reset ``fb`` and provide the default metadata classes on it.

    ``fb.clear()`` is called first; the ``premisrw`` PREMIS classes and the
    ``dcrw`` Dublin Core class are then provided under their feature names.

    To use yapremisrw, provide different class(es) from that plugin
    afterwards, e.g., to use ``yapremisrw.Event``::

        >>> from metsrw.plugins import yapremisrw
        >>> from metsrw import feature_broker as fb
        >>> fb.provide('premis_event_class', yapremisrw.Event)

    :param fb: the feature broker to reset; it must offer ``clear()`` and
        ``provide(feature_name, feature)``.
    """
    fb.clear()
    # Feature name -> default implementation, registered in this order.
    default_features = (
        ('premis_object_class', premisrw.PREMISObject),
        ('premis_event_class', premisrw.PREMISEvent),
        ('premis_agent_class', premisrw.PREMISAgent),
        ('dublin_core_class', dcrw.DublinCoreXmlData),
    )
    for feature_name, feature_class in default_features:
        fb.provide(feature_name, feature_class)


feature_broker = FeatureBroker() # global singleton feature broker
Expand Down
28 changes: 24 additions & 4 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,16 @@ class FSEntry(object):
has_methods('serialize'),
has_class_methods('fromtree'),
is_class)
dublin_core_class = Dependency(
'dublin_core_class',
has_methods('serialize'),
has_class_methods('fromtree'),
is_class)

PREMIS_OBJECT = 'PREMIS:OBJECT'
PREMIS_EVENT = 'PREMIS:EVENT'
PREMIS_AGENT = 'PREMIS:AGENT'
DublinCore = 'DC'

def __init__(self, path=None, label=None, use='original', type=u'Item',
children=None, file_uuid=None, derived_from=None,
Expand Down Expand Up @@ -131,6 +137,11 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
self.amdsecs = []
self.dmdsecs = []

# Convenient access to metadata (without cycling through amdsecs)
self.techmds = []
self.digiprovmds = []
self.rightsmds = []

def __str__(self):
return '{s.type}: {s.path}'.format(s=self)

Expand Down Expand Up @@ -271,7 +282,9 @@ def add_premis_rights(self, md, mode='mdwrap'):

def add_dublin_core(self, md, mode='mdwrap'):
# TODO add extra args and create DC object here
return self.add_dmdsec(md, 'DC', mode)
return self.add_dmdsec(
self.serialize_md_inst(md, self.dublin_core_class),
self.DublinCore, mode)

def add_child(self, child):
"""Add a child FSEntry to this FSEntry.
Expand Down Expand Up @@ -399,9 +412,12 @@ def serialize_structmap(self, recurse=True, normative=False):
return el

def get_subsections_of_type(self, mdtype, md_class):
return [md_class.fromtree(ss.contents.document)
for ss in self.amdsecs[0].subsections
if ss.contents.mdtype == mdtype]
try:
return [md_class.fromtree(ss.contents.document)
for ss in self.amdsecs[0].subsections
if ss.contents.mdtype == mdtype]
except IndexError:
return []

def get_premis_objects(self):
return self.get_subsections_of_type(
Expand All @@ -414,3 +430,7 @@ def get_premis_events(self):
def get_premis_agents(self):
return self.get_subsections_of_type(
self.PREMIS_AGENT, self.premis_agent_class)

def get_dublin_core(self):
# Return whatever get_subsections_of_type() finds for the mdtype
# self.DublinCore, each entry built with self.dublin_core_class.fromtree().
# NOTE(review): get_subsections_of_type() only reads self.amdsecs[0], while
# add_dublin_core() stores its metadata through add_dmdsec(); confirm that
# Dublin Core added via add_dublin_core() is actually reachable from here.
# NOTE(review): fromtree() is handed ss.contents.document; the default
# DublinCoreXmlData.parse raises ParseError unless that element is a
# mets:xmlData element -- verify the stored document has that shape.
return self.get_subsections_of_type(
self.DublinCore, self.dublin_core_class)
15 changes: 13 additions & 2 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from . import exceptions
from . import utils
from .plugins.dcrw import DublinCoreXmlData

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -298,14 +299,18 @@ class MDWrap(object):
include "PREMIS:OBJECT", "PREMIS:EVENT", "DC" and "OTHER".
:param str othermdtype: The OTHERMDTYPE of the XML document. Should be set if mdtype is "OTHER".
"""
def __init__(self, document, mdtype, othermdtype=None):

MDTYPE_CLASSES = {'DC': DublinCoreXmlData}

def __init__(self, document, mdtype, othermdtype=None, data=None):
parser = etree.XMLParser(remove_blank_text=True)
if isinstance(document, six.string_types):
self.document = etree.fromstring(document, parser=parser)
elif isinstance(document, (etree._Element, list)):
self.document = document
self.mdtype = mdtype
self.othermdtype = othermdtype
self.data = data
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment by @Hwesta: Rather than creating a new attribute, this could use document to store the child document - whether that's a string, ElementTree, or plugin class of the appropriate type.


@classmethod
def parse(cls, root):
Expand All @@ -321,6 +326,12 @@ def parse(cls, root):
mdtype = root.get('MDTYPE')
if not mdtype:
raise exceptions.ParseError('mdWrap must have a MDTYPE')
if mdtype in MDWrap.MDTYPE_CLASSES.keys():
mdtype_class = MDWrap.MDTYPE_CLASSES[mdtype]()
data = mdtype_class.parse(root.find('mets:xmlData', namespaces=utils.NAMESPACES)).__dict__
else:
data = None

othermdtype = root.get('OTHERMDTYPE')
document = root.xpath('mets:xmlData/*', namespaces=utils.NAMESPACES)
if len(document) == 0:
Expand All @@ -329,7 +340,7 @@ def parse(cls, root):
' one has none')
elif len(document) == 1:
document = document[0]
return cls(document, mdtype, othermdtype)
return cls(document, mdtype, othermdtype=othermdtype, data=data)

def serialize(self):
el = etree.Element(utils.lxmlns('mets') + 'mdWrap', MDTYPE=self.mdtype)
Expand Down
5 changes: 5 additions & 0 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,11 @@ def _add_amdsecs_to_fs_entry(amdids, fs_entry, tree):
namespaces=utils.NAMESPACES)
amdsec = metadata.AMDSec.parse(amdsec_elem)
fs_entry.amdsecs.append(amdsec)
# Add subsections to convenience properties
for subsection in amdsec.subsections:
getattr(
fs_entry, subsection.subsection.lower() + 's').append(
subsection)

def _parse_tree(self, tree=None):
if tree is None:
Expand Down
30 changes: 30 additions & 0 deletions metsrw/plugins/dcrw/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import absolute_import

import logging

from .dc import DublinCoreXmlData
from .utils import (
NAMESPACES,
DUBLINCORE_SCHEMA_LOCATIONS,
lxmlns,
)
from .exceptions import (
DcError,
ConstructError,
ParseError
)


# Package-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# A NullHandler is attached to the package logger (presumably so that merely
# importing the plugin produces no logging output unless the application
# configures logging itself).
LOGGER.addHandler(logging.NullHandler())


# Names exported by ``from metsrw.plugins.dcrw import *``: the Dublin Core
# data class, the namespace helpers from ``.utils`` and the exception classes.
__all__ = [
'DublinCoreXmlData',
'NAMESPACES',
'DUBLINCORE_SCHEMA_LOCATIONS',
'lxmlns',
'DcError',
'ConstructError',
'ParseError',
]
75 changes: 75 additions & 0 deletions metsrw/plugins/dcrw/dc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from __future__ import absolute_import

from collections import OrderedDict
import logging
from lxml import etree

from .exceptions import ParseError
from .utils import lxmlns, NAMESPACES, DUBLINCORE_SCHEMA_LOCATIONS

LOGGER = logging.getLogger(__name__)


class DublinCoreXmlData(object):
    """
    An object representing a METS xmlData element containing a Dublin Core element.

    Every name listed in ``DC_ELEMENTS`` is exposed as an instance attribute
    holding the text of the matching ``dc:`` element, or ``None`` when no
    value was supplied.
    """
    DC_ELEMENTS = ['title', 'creator', 'subject', 'description', 'publisher',
                   'contributor', 'date', 'format', 'identifier', 'source',
                   'relation', 'language', 'coverage', 'rights']

    def __init__(self, **kwargs):
        """
        Store one attribute per entry of ``DC_ELEMENTS``.

        :param kwargs: Values keyed by Dublin Core element name. Elements
            that are not supplied are set to ``None``; keyword arguments that
            are not in ``DC_ELEMENTS`` are ignored.
        """
        for element in self.DC_ELEMENTS:
            setattr(self, element, kwargs.get(element))

    @classmethod
    def parse(cls, root):
        """
        Parse an xmlData element containing a Dublin Core dublincore element.

        Only the first occurrence of each ``dc:`` element is read.

        :param root: The ``mets:xmlData`` element to be parsed into an object.
        :returns: A new instance of ``cls`` populated from the element text.
        :raises ParseError: If the root is not xmlData or doesn't contain a dublincore element.
        """
        if root.tag != lxmlns('mets') + 'xmlData':
            raise ParseError('DublinCoreXmlData can only parse xmlData elements with mets namespace.')

        dc_el = root.find('dcterms:dublincore', namespaces=NAMESPACES)

        if dc_el is None or dc_el.tag != lxmlns('dcterms') + 'dublincore':
            raise ParseError('xmlData can only contain a dublincore element with the dcterms namespace.')

        # Look the element list up on ``cls`` (as ``__init__`` does on
        # ``self``) so a subclass overriding DC_ELEMENTS is honoured here too.
        kwargs = {
            element: dc_el.findtext('dc:' + element, namespaces=NAMESPACES)
            for element in cls.DC_ELEMENTS
        }

        return cls(**kwargs)

    # Alias: the same classmethod is also available under the name ``fromtree``.
    fromtree = parse

    def serialize(self):
        """
        Build a ``mets:xmlData`` element wrapping the Dublin Core element.

        :returns: The new xmlData element with the dublincore element appended.
        """
        nsmap = OrderedDict([
            ('mets', NAMESPACES['mets']),
            ('xsi', NAMESPACES['xsi']),
            ('xlink', NAMESPACES['xlink'])
        ])
        root = etree.Element(lxmlns('mets') + 'xmlData', nsmap=nsmap)
        root.append(self._serialize_dublincore())
        return root

    def _serialize_dublincore(self):
        """
        Build the ``dcterms:dublincore`` element for this object.

        One ``dc:`` child is created per entry of ``DC_ELEMENTS``, in list
        order; its text is the attribute value, so unset (``None``)
        attributes produce a child without text.

        :returns: The new dublincore element.
        """
        nsmap = OrderedDict([
            ('dcterms', NAMESPACES['dcterms']),
            ('dc', NAMESPACES['dc'])
        ])
        attrib = {'{}schemaLocation'.format(lxmlns('xsi')): DUBLINCORE_SCHEMA_LOCATIONS}
        dc_root = etree.Element(lxmlns('dcterms') + 'dublincore', nsmap=nsmap, attrib=attrib)

        # Use ``self.DC_ELEMENTS`` so serialisation agrees with ``__init__``
        # when a subclass overrides the element list.
        for element in self.DC_ELEMENTS:
            dc_el = etree.Element(lxmlns('dc') + element)
            dc_el.text = getattr(self, element)
            dc_root.append(dc_el)

        return dc_root
19 changes: 19 additions & 0 deletions metsrw/plugins/dcrw/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Exceptions for dcrw.

All exceptions generated by this library will descend from DcError.
"""


class DcError(Exception):
    """Root of the dcrw exception hierarchy; all dcrw errors derive from it."""


class ConstructError(DcError):
    """Raised when an object cannot be constructed."""


class ParseError(DcError):
    """Raised when a DC element cannot be parsed."""
Loading