Skip to content

Commit

Permalink
Add yapremisrw, Yet Another PREMIS reader/writer
Browse files Browse the repository at this point in the history
Added yapremisrw, yet another PREMIS reader/writer plugin. This was based on
previous work by mcantelon. It has been rebased against current master and
modified minimally to make it a dependency (plugin) injectable into
`metsrw.fsentry.FSEntry`.
  • Loading branch information
jrwdunham committed Nov 10, 2017
1 parent 21438f2 commit 3caf782
Show file tree
Hide file tree
Showing 28 changed files with 9,738 additions and 14 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pip-delete-this-directory.txt
.tox/
.coverage
.cache
htmlcov
nosetests.xml
coverage.xml

Expand Down
65 changes: 65 additions & 0 deletions docs/examples.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
Example usage
=============

Parsing METS documents
----------------------

Example of listing the relative file paths of preservation files referenced in
a METS file::

import metsrw

mets = metsrw.METSDocument.fromfile('fixtures/complete_mets_2.xml')
for entry in mets.all_files():
if entry.use == 'preservation':
print(entry.path)

Example of retrieving a file by UUID::

import metsrw

mets = metsrw.METSDocument.fromfile('fixtures/complete_mets_2.xml')
entry = mets.get_file('46b7cb96-792c-4441-a5d6-67c83313501c')
print(entry.path)

Creating/modifying METS documents
---------------------------------

Example creation of a METS document (without PREMIS or Dublin Core metadata)::

import metsrw
import uuid

mw = metsrw.METSDocument()

# Create object entries
file1 = metsrw.FSEntry('objects/cat.png', file_uuid=str(uuid.uuid4()))
file2 = metsrw.FSEntry('objects/dog.jpg', file_uuid=str(uuid.uuid4()))

# Create preservation derivative entries
file1p = metsrw.FSEntry('objects/cat-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file1)
file2p = metsrw.FSEntry('objects/dog-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file2)

# Create object directory entry
objects = metsrw.FSEntry('objects', type='Directory', children=[file1, file2, file1p, file2p])

# Create metadata subdirectories then metadata directory entry
children = [
metsrw.FSEntry('transfers', type='Directory', children=[]),
metsrw.FSEntry('metadata/metadata.csv', use='metadata', file_uuid=str(uuid.uuid4())),
]
metadata = metsrw.FSEntry('metadata', type='Directory', children=children)

# Create submission METS entry and submission documentation parent directory entry
children = [
metsrw.FSEntry('submissionDocumentation/METS.xml', use='submissionDocumentation', file_uuid=str(uuid.uuid4())),
]
sub_doc = metsrw.FSEntry('submissionDocumentation', type='Directory', children=children)

# Create SIP entry containing objects, metadata, and submission documentation entries
children = [objects, metadata, sub_doc]
sip = metsrw.FSEntry('sipname-uuid', type='Directory', children=children)

# Add SIP entry to METS document and write to file
mw.append_file(sip)
mw.write('mets.xml', fully_qualified=True, pretty_print=True)
7,541 changes: 7,541 additions & 0 deletions fixtures/complete_mets_2.xml

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
AM_PNTR_SCT_PATH,
get_schematron,
validate,
get_file_path,
get_xmlschema,
xsd_validate,
schematron_validate,
Expand All @@ -46,8 +47,9 @@
'MDRef', 'MDWrap', 'METSDocument', 'NAMESPACES', 'SCHEMA_LOCATIONS',
'lxmlns', 'FILE_ID_PREFIX', 'GROUP_ID_PREFIX', 'METS_XSD_PATH',
'AM_SCT_PATH', 'AM_PNTR_SCT_PATH', 'get_schematron', 'validate',
'get_xmlschema', 'xsd_validate', 'schematron_validate',
'sct_report_string', 'xsd_error_log_string', 'report_string',
'FeatureBroker', 'set_feature_broker_to_default_state',
'feature_broker', 'Dependency', 'has_class_methods', 'has_methods',
'is_class', 'plugins', '__version__']
'get_file_path', 'get_xmlschema', 'xsd_validate',
'schematron_validate', 'sct_report_string', 'xsd_error_log_string',
'report_string', 'FeatureBroker',
'set_feature_broker_to_default_state', 'feature_broker',
'Dependency', 'has_class_methods', 'has_methods', 'is_class',
'plugins', '__version__']
16 changes: 15 additions & 1 deletion metsrw/di.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
See http://code.activestate.com/recipes/413268/
"""

from .plugins import premisrw
from .plugins import (
premisrw,
dcrw
)


class FeatureBroker(object):
Expand Down Expand Up @@ -62,10 +65,21 @@ def __getitem__(self, feature_name):


def set_feature_broker_to_default_state(fb):
    """Reset ``fb`` and register the default metadata classes on it.

    ``fb.clear()`` is called first, then the ``premisrw`` classes for PREMIS
    objects, events and agents and the ``dcrw`` Dublin Core class are
    provided under their feature names.

    To use yapremisrw, provide different class(es) from that plugin
    afterwards, e.g., to use ``yapremisrw.Event``::

        >>> from metsrw.plugins import yapremisrw
        >>> from metsrw import feature_broker as fb
        >>> fb.provide('premis_event_class', yapremisrw.Event)

    :param fb: the feature broker to reset; it must offer ``clear()`` and
        ``provide(feature_name, value)``.
    """
    fb.clear()
    default_features = (
        ('premis_object_class', premisrw.PREMISObject),
        ('premis_event_class', premisrw.PREMISEvent),
        ('premis_agent_class', premisrw.PREMISAgent),
        ('dublin_core_class', dcrw.DublinCoreXmlData),
    )
    for feature_name, feature_class in default_features:
        fb.provide(feature_name, feature_class)


feature_broker = FeatureBroker() # global singleton feature broker
Expand Down
19 changes: 18 additions & 1 deletion metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,16 @@ class FSEntry(object):
has_methods('serialize'),
has_class_methods('fromtree'),
is_class)
dublin_core_class = Dependency(
'dublin_core_class',
has_methods('serialize'),
has_class_methods('fromtree'),
is_class)

PREMIS_OBJECT = 'PREMIS:OBJECT'
PREMIS_EVENT = 'PREMIS:EVENT'
PREMIS_AGENT = 'PREMIS:AGENT'
DublinCore = 'DC'

def __init__(self, path=None, label=None, use='original', type=u'Item',
children=None, file_uuid=None, derived_from=None,
Expand Down Expand Up @@ -131,6 +137,11 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
self.amdsecs = []
self.dmdsecs = []

# Convenient access to metadata (without cycling through amdsecs)
self.techmds = []
self.digiprovmds = []
self.rightsmds = []

def __str__(self):
return '{s.type}: {s.path}'.format(s=self)

Expand Down Expand Up @@ -271,7 +282,9 @@ def add_premis_rights(self, md, mode='mdwrap'):

def add_dublin_core(self, md, mode='mdwrap'):
# TODO add extra args and create DC object here
return self.add_dmdsec(md, 'DC', mode)
return self.add_dmdsec(
self.serialize_md_inst(md, self.dublin_core_class),
self.DublinCore, mode)

def add_child(self, child):
"""Add a child FSEntry to this FSEntry.
Expand Down Expand Up @@ -414,3 +427,7 @@ def get_premis_events(self):
def get_premis_agents(self):
    """Return what ``get_subsections_of_type`` yields for PREMIS agents.

    The lookup is keyed on ``self.PREMIS_AGENT`` and is handed the injected
    ``premis_agent_class``.
    """
    lookup = self.get_subsections_of_type
    return lookup(self.PREMIS_AGENT, self.premis_agent_class)

def get_dublin_core(self):
    """Return what ``get_subsections_of_type`` yields for Dublin Core.

    The lookup is keyed on ``self.DublinCore`` and is handed the injected
    ``dublin_core_class``.
    """
    lookup = self.get_subsections_of_type
    return lookup(self.DublinCore, self.dublin_core_class)
15 changes: 13 additions & 2 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from . import exceptions
from . import utils
from .plugins.dcrw import DublinCoreXmlData

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -298,14 +299,18 @@ class MDWrap(object):
include "PREMIS:OBJECT", "PREMIS:EVENT", "DC" and "OTHER".
:param str othermdtype: The OTHERMDTYPE of the XML document. Should be set if mdtype is "OTHER".
"""
def __init__(self, document, mdtype, othermdtype=None):

MDTYPE_CLASSES = {'DC': DublinCoreXmlData}

def __init__(self, document, mdtype, othermdtype=None, data=None):
    """Store the wrapped XML document and its METS type information.

    :param document: The wrapped XML. Text or bytes are parsed with lxml
        (blank text removed); an lxml element or a list is stored as given.
    :param str mdtype: The MDTYPE of the XML document.
    :param str othermdtype: The OTHERMDTYPE of the XML document.
    :param data: Optional extra representation of the document, stored
        unchanged on ``self.data``.
    """
    # Bytes are accepted as well as text: on Python 3 ``six.string_types``
    # is only ``str``, so a bytes document used to match neither branch and
    # ``self.document`` was silently never set.
    if isinstance(document, six.string_types + (six.binary_type,)):
        parser = etree.XMLParser(remove_blank_text=True)
        self.document = etree.fromstring(document, parser=parser)
    elif isinstance(document, (etree._Element, list)):
        self.document = document
    # NOTE(review): any other type still leaves ``self.document`` unset, as
    # before -- confirm whether that should raise instead.
    self.mdtype = mdtype
    self.othermdtype = othermdtype
    self.data = data

@classmethod
def parse(cls, root):
Expand All @@ -321,6 +326,12 @@ def parse(cls, root):
mdtype = root.get('MDTYPE')
if not mdtype:
raise exceptions.ParseError('mdWrap must have a MDTYPE')
if mdtype in MDWrap.MDTYPE_CLASSES.keys():
mdtype_class = MDWrap.MDTYPE_CLASSES[mdtype]()
data = mdtype_class.parse(root.find('mets:xmlData', namespaces=utils.NAMESPACES)).__dict__
else:
data = None

othermdtype = root.get('OTHERMDTYPE')
document = root.xpath('mets:xmlData/*', namespaces=utils.NAMESPACES)
if len(document) == 0:
Expand All @@ -329,7 +340,7 @@ def parse(cls, root):
' one has none')
elif len(document) == 1:
document = document[0]
return cls(document, mdtype, othermdtype)
return cls(document, mdtype, othermdtype=othermdtype, data=data)

def serialize(self):
el = etree.Element(utils.lxmlns('mets') + 'mdWrap', MDTYPE=self.mdtype)
Expand Down
5 changes: 5 additions & 0 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,11 @@ def _add_amdsecs_to_fs_entry(amdids, fs_entry, tree):
namespaces=utils.NAMESPACES)
amdsec = metadata.AMDSec.parse(amdsec_elem)
fs_entry.amdsecs.append(amdsec)
# Add subsections to convenience properties
for subsection in amdsec.subsections:
getattr(
fs_entry, subsection.subsection.lower() + 's').append(
subsection)

def _parse_tree(self, tree=None):
if tree is None:
Expand Down
30 changes: 30 additions & 0 deletions metsrw/plugins/dcrw/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import absolute_import

import logging

from .dc import DublinCoreXmlData
from .utils import (
NAMESPACES,
DUBLINCORE_SCHEMA_LOCATIONS,
lxmlns,
)
from .exceptions import (
DcError,
ConstructError,
ParseError
)


# Attach a no-op handler to this package's logger (the usual arrangement for
# library code); real handlers are left for the application to configure.
LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.NullHandler())


# Names exported by ``from metsrw.plugins.dcrw import *``: the Dublin Core
# class, the namespace helpers and the exception hierarchy imported above.
__all__ = [
'DublinCoreXmlData',
'NAMESPACES',
'DUBLINCORE_SCHEMA_LOCATIONS',
'lxmlns',
'DcError',
'ConstructError',
'ParseError',
]
73 changes: 73 additions & 0 deletions metsrw/plugins/dcrw/dc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from __future__ import absolute_import

from collections import OrderedDict
import logging
from lxml import etree

from .exceptions import ParseError
from .utils import lxmlns, NAMESPACES, DUBLINCORE_SCHEMA_LOCATIONS

LOGGER = logging.getLogger(__name__)


class DublinCoreXmlData(object):
    """A METS ``xmlData`` element containing a Dublin Core ``dublincore`` element.

    Every name in ``DC_ELEMENTS`` is exposed as an instance attribute holding
    the value for the ``dc:`` child element of the same name.

    NOTE(review): the Dublin Core ``type`` element is not listed in
    ``DC_ELEMENTS`` -- confirm whether that omission is intentional.
    """

    # Order matters: ``parse`` passes values to ``__init__`` positionally in
    # this order, and ``serialize`` emits the child elements in this order.
    DC_ELEMENTS = [
        'title', 'creator', 'subject', 'description', 'publisher',
        'contributor', 'date', 'format', 'identifier', 'source', 'relation',
        'language', 'coverage', 'rights',
    ]

    def __init__(self, title=None, creator=None, subject=None,
                 description=None, publisher=None, contributor=None,
                 date=None, format=None, identifier=None, source=None,
                 relation=None, language=None, coverage=None, rights=None):
        """Store each Dublin Core value on the attribute of the same name."""
        values = {
            'title': title,
            'creator': creator,
            'subject': subject,
            'description': description,
            'publisher': publisher,
            'contributor': contributor,
            'date': date,
            'format': format,
            'identifier': identifier,
            'source': source,
            'relation': relation,
            'language': language,
            'coverage': coverage,
            'rights': rights,
        }
        for element in self.DC_ELEMENTS:
            setattr(self, element, values[element])

    @classmethod
    def parse(cls, root):
        """Parse an xmlData element containing a Dublin Core dublincore element.

        :param root: Element or ElementTree to be parsed into an object.
        :returns: A new instance populated from the ``dc:`` child elements.
        :raises ParseError: If the root is missing, is not xmlData, or
            doesn't contain a dublincore element.
        """
        # An ElementTree has no ``tag``; unwrap it to its root element so
        # both documented input types are handled.
        if hasattr(root, 'getroot'):
            root = root.getroot()

        # A missing root (e.g. the result of a failed ``find``) is reported
        # as a ParseError rather than surfacing as an AttributeError.
        if root is None or root.tag != lxmlns('mets') + 'xmlData':
            raise ParseError(
                'DublinCoreXmlData can only parse xmlData elements with mets'
                ' namespace.')

        # ``find`` only ever returns an element with the requested tag, so
        # checking for None is sufficient.
        dc_el = root.find('dcterms:dublincore', namespaces=NAMESPACES)
        if dc_el is None:
            raise ParseError(
                'xmlData can only contain a dublincore element with the'
                ' dcterms namespace.')

        # Only the first occurrence of each element is read.
        # NOTE(review): ``serialize`` writes an element even for None
        # values, which presumably read back as '' here -- verify whether
        # round-tripping None matters to callers.
        args = [
            dc_el.findtext('dc:' + element, namespaces=NAMESPACES)
            for element in cls.DC_ELEMENTS
        ]
        return cls(*args)

    # Alias for ``parse`` under the name ``fromtree``.
    fromtree = parse

    def serialize(self):
        """Return a new ``mets:xmlData`` element wrapping the dublincore element."""
        nsmap = OrderedDict([
            ('mets', NAMESPACES['mets']),
            ('xsi', NAMESPACES['xsi']),
            ('xlink', NAMESPACES['xlink'])
        ])
        root = etree.Element(lxmlns('mets') + 'xmlData', nsmap=nsmap)
        root.append(self._serialize_dublincore())
        return root

    def _serialize_dublincore(self):
        """Build the ``dcterms:dublincore`` element with one child per DC element."""
        nsmap = OrderedDict([
            ('dcterms', NAMESPACES['dcterms']),
            ('dc', NAMESPACES['dc'])
        ])
        attrib = {
            '{}schemaLocation'.format(lxmlns('xsi')):
                DUBLINCORE_SCHEMA_LOCATIONS,
        }
        dc_root = etree.Element(
            lxmlns('dcterms') + 'dublincore', nsmap=nsmap, attrib=attrib)

        # Every element is emitted, including those whose value is None.
        for element in self.DC_ELEMENTS:
            dc_el = etree.Element(lxmlns('dc') + element)
            dc_el.text = getattr(self, element)
            dc_root.append(dc_el)

        return dc_root
19 changes: 19 additions & 0 deletions metsrw/plugins/dcrw/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Exceptions for dcrw.
All exceptions generated by this library will descend from DcError.
"""


class DcError(Exception):
    """Base class of every exception defined by the dcrw module."""


class ConstructError(DcError):
    """Signals a failure while constructing an object."""


class ParseError(DcError):
    """Signals a failure while parsing a DC element."""
17 changes: 17 additions & 0 deletions metsrw/plugins/dcrw/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Namespace prefix -> namespace URI for the XML vocabularies known to dcrw.
NAMESPACES = {
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "mets": "http://www.loc.gov/METS/",
    "premis": "info:lc/xmlns/premis-v2",
    "dcterms": "http://purl.org/dc/terms/",
    "fits": "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
    "xlink": "http://www.w3.org/1999/xlink",
    "dc": "http://purl.org/dc/elements/1.1/",
}

# A namespace URI followed, after a single space, by the URL of its XSD.
DUBLINCORE_SCHEMA_LOCATIONS = (
    "http://purl.org/dc/terms/ "
    "http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd"
)


def lxmlns(arg):
    """Return the namespace URI registered for prefix ``arg``, wrapped in braces.

    The result (``{uri}``) is meant to be prepended to a local name to form
    a namespace-qualified tag.

    :param arg: a key of ``NAMESPACES``.
    :raises KeyError: if ``arg`` is not a key of ``NAMESPACES``.
    """
    namespace_uri = NAMESPACES[arg]
    return ''.join(('{', namespace_uri, '}'))
Loading

0 comments on commit 3caf782

Please sign in to comment.