Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes to enable custom structmaps to be more expressive #1404

Merged
merged 1 commit into from
Apr 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
286 changes: 186 additions & 100 deletions src/MCPClient/lib/assets/mets/mets.xsd

Large diffs are not rendered by default.

39 changes: 0 additions & 39 deletions src/MCPClient/lib/clientScripts/archivematicaVerifyMets.sh

This file was deleted.

68 changes: 48 additions & 20 deletions src/MCPClient/lib/clientScripts/create_mets_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
)
from custom_handlers import get_script_logger
import namespaces as ns
from sanitize_names import sanitizeName

from bagit import Bag, BagError

Expand Down Expand Up @@ -872,39 +873,61 @@ def getAMDSec(
return ret


def getIncludedStructMap(job, baseDirectoryPath, state):
def _fixup_path_input_by_user(job, path):
"""Fix-up paths submitted by a user, e.g. in custom structmap examples so
that they don't have to anticipate the Archivematica normalization process.
"""
return os.path.join(
"", *[sanitizeName(name.encode("utf8")) for name in path.split(os.path.sep)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea here is that these files will already have been sanitized, so we're just matching up to the current state on the filesystem/database, correct?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I missed this, @cole. Yep, that's right! We'll have to keep this in mind when you complete your work on that microservice job.

)


def include_custom_structmap(
job, baseDirectoryPath, state, custom_structmap="mets_structmap.xml"
):
"""Enable users to submit a structmap with a transfer and have it
included in the eventual AIP METS.
"""
ret = []
transferMetadata = os.path.join(baseDirectoryPath, "objects/metadata/transfers")
transferMetadata = os.path.join(
baseDirectoryPath, os.path.join("objects", "metadata", "transfers")
)
if not os.path.isdir(transferMetadata):
return []
return ret
baseLocations = os.listdir(transferMetadata)
baseLocations.append(baseDirectoryPath)
for dir_ in baseLocations:
dirPath = os.path.join(transferMetadata, dir_)
structMapXmlPath = os.path.join(dirPath, "mets_structmap.xml")
structMapXmlPath = os.path.join(dirPath, custom_structmap)
if not os.path.isdir(dirPath):
continue
if os.path.isfile(structMapXmlPath):
tree = etree.parse(structMapXmlPath)
root = (
tree.getroot()
) # TDOD - not root to return, but sub element structMap
# print etree.tostring(root)
root = tree.getroot()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm, does metsrw not yet work for this case?

structMap = root.find(ns.metsBNS + "structMap")
id_ = structMap.get("ID")
if not id_:
structMap.set("ID", "structMap_2")
state.globalStructMapCounter += 1
structMap.set("ID", "structMap_{}".format(state.globalStructMapCounter))
ret.append(structMap)
for item in structMap.findall(".//" + ns.metsBNS + "fptr"):
fileName = item.get("FILEID")
if fileName in state.fileNameToFileID:
# print fileName, " -> ", state.fileNameToFileID[fileName]
item.set("FILEID", state.fileNameToFileID[fileName])
fileids = structMap.xpath(
"//*[@CONTENTIDS]", namespaces={"mets:": ns.metsNS}
)
for item in fileids:
file_path = item.get("CONTENTIDS")
normalized_path = _fixup_path_input_by_user(job, file_path)
if normalized_path in state.fileNameToFileID:
item.set("FILEID", state.fileNameToFileID[normalized_path])
else:
job.pyprint("error: no fileUUID for ", fileName, file=sys.stderr)
job.pyprint(
"Custom structmap error: no fileUUID for",
file_path,
normalized_path,
file=sys.stderr,
)
state.error_accumulator.error_count += 1
if state.trimStructMap is not None:
ret.append(state.trimStructMap)
if ret:
job.pyprint("Custom structmap will be included in AIP METS")
return ret


Expand Down Expand Up @@ -1064,7 +1087,9 @@ def createFileSec(
structMapDiv, ns.metsBNS + "div", LABEL=label, TYPE="Item"
)
etree.SubElement(fileDiv, ns.metsBNS + "fptr", FILEID=fileId)
state.fileNameToFileID[item] = fileId
# Pair items listed in custom structmaps. Strip leading path
# separator if it exists.
state.fileNameToFileID[directoryPathSTR] = fileId

# Determine fileGrp @GROUPID based on the file's fileGrpUse and transfer type
GROUPID = ""
Expand Down Expand Up @@ -1787,10 +1812,13 @@ def call(jobs):
if normativeStructMap is not None:
root.append(normativeStructMap)

for structMapIncl in getIncludedStructMap(
for custom_structmap in include_custom_structmap(
job, baseDirectoryPath, state
):
root.append(structMapIncl)
root.append(custom_structmap)

if state.trimStructMap is not None:
root.append(state.trimStructMap)

arranged_structmap = build_arranged_structmap(
job, structMap, fileGroupIdentifier
Expand Down
38 changes: 26 additions & 12 deletions src/MCPClient/lib/clientScripts/verify_mets.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,34 @@
#!/usr/bin/env python2
# -*- coding: utf8 -*-

"""verify_mets.py

Verify METS documents provided to the script. Its first, and so far primary,
use is to verify the validity of custom structmaps included with transfers and
supplied on ingest after appraisal.
"""
from __future__ import unicode_literals
from lxml import etree
import os

from executeOrRunSubProcess import executeOrRun

class VerifyMETSException(Exception):
    """Exception to raise if METS validation fails.

    Also raised by ``call`` when the METS XSD asset needed for validation
    cannot be found on disk.
    """

def call(jobs):
command = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "archivematicaVerifyMets.sh"
)

def call(jobs):
"""Primary entry point for this script."""
for job in jobs:
with job.JobContext():
exit_code, std_out, std_error = executeOrRun(
"command", [command] + job.args[1:], printing=True, capture_output=True
)

job.write_error(std_error)
job.write_output(std_out)
job.set_status(exit_code)
mets_structmap = os.path.join(job.args[1], "metadata", "mets_structmap.xml")
mets_xsd = job.args[2]
if not os.path.isfile(mets_structmap):
job.pyprint("Custom structmap not supplied with package")
return
if not os.path.isfile(mets_xsd):
raise (VerifyMETSException("METS asset is unavailable"))
xmlschema = etree.XMLSchema(etree.parse(mets_xsd))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm, does metsrw not yet work for this case?

# assertValid raises an exception, e.g. etree.DocumentInvalid, if the
# document is not valid; otherwise the document validated correctly.
xmlschema.assertValid(etree.parse(mets_structmap))
job.pyprint("Custom structmap validated correctly")
2 changes: 1 addition & 1 deletion src/MCPClient/lib/ensure_no_mutable_globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def print_mutable_globals_usage(supported_modules):
'create_mets_v2:call',
'create_mets_v2:createDublincoreDMDSecFromDBData',
'create_mets_v2:createFileSec',
'create_mets_v2:getIncludedStructMap',
'create_mets_v2:include_custom_structmap',
'create_mets_v2:parseMetadata',
'extract_maildir_attachments:handle_job',
'extract_maildir_attachments:parse'],
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="logical">
<mets:div TYPE="Short Listen: Ferdinand, the Misunderstood Bull by With Good Reason" LABEL="documentary">
<mets:div TYPE="track" LABEL="Complete documentary">
<!-- 001 is a valid IDREF value used for FILEID -->
<mets:fptr FILEID="001" CONTENTIDS="objects/test_file.flac"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="logical">
<mets:div TYPE="book" LABEL="How to create a hierarchical book">
<mets:div TYPE="page" LABEL="Cover">
<mets:fptr FILEID="FILE000" CONTENTIDS="objects/duplicate_file_name.png"/>
</mets:div>
<mets:div TYPE="chapter" LABEL="Chapter 1">
<mets:div TYPE="page" LABEL="Page 1">
<mets:fptr FILEID="FILE001" CONTENTIDS="objects/nested_dir/duplicate_file_name.png"/>
</mets:div>
</mets:div>
<!-- chapter -->
</mets:div>
<!-- book -->
</mets:structMap>
</mets:mets>
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="logical">
<mets:div LABEL="Short Listen: Testing structmaps in Archivematica" TYPE="documentary">
<mets:div LABEL="Introduction" ORDER="1">
<mets:fptr FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3">
<mets:area FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3" BEGIN="00:00:00" END="00:00:17" BETYPE="TIME"/>
</mets:fptr>
</mets:div>
<mets:div LABEL="Archivematica's testing strategy" ORDER="2">
<mets:fptr FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3">
<mets:area FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3" BEGIN="00:00:18" END="00:01:13" BETYPE="TIME"/>
</mets:fptr>
</mets:div>
<mets:div LABEL="Outro" ORDER="3">
<mets:fptr FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3">
<mets:area FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3" BEGIN="00:01:14" END="00:01:33" BETYPE="TIME"/>
</mets:fptr>
</mets:div>
<!-- documentary -->
</mets:div>
</mets:structMap>
</mets:mets>
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="logical">
<mets:div TYPE="Short Listen: Ferdinand, the Misunderstood Bull by With Good Reason" LABEL="documentary">
<mets:div TYPE="track" LABEL="Complete documentary">
<mets:fptr FILEID="test_file.flac" CONTENTIDS="objects/test_file.flac"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="logical">
<mets:div LABEL="Short Listen: Testing structmaps in Archivematica" TYPE="documentary">
<mets:div LABEL="Introduction" ORDER="1">
<mets:fptr FILEID="FILE001" CONTENTIDS="objects/nested_dir/nested_file.rdata">
<mets:area FILEID="FILE001" CONTENTIDS="objects/nested_dir/nested_file.rdata" BEGIN="00:00:00" END="00:00:17" BETYPE="TIME"/>
</mets:fptr>
</mets:div>
<mets:div LABEL="Archivematica's testing strategy" ORDER="2">
<mets:fptr FILEID="FILE002" CONTENTIDS="objects/nested_dir/nested_file.rdata">
<mets:area FILEID="FILE002" CONTENTIDS="objects/nested_dir/nested_file.rdata" BEGIN="00:00:18" END="00:01:13" BETYPE="TIME"/>
</mets:fptr>
</mets:div>
<mets:div LABEL="Outro" ORDER="3">
<mets:fptr FILEID="FILE003" CONTENTIDS="objects/nested_dir/nested_file.rdata">
<mets:area FILEID="FILE003" CONTENTIDS="objects/nested_dir/nested_file.rdata" BEGIN="00:01:14" END="00:01:33" BETYPE="TIME"/>
</mets:fptr>
</mets:div>
<!-- documentary -->
</mets:div>
</mets:structMap>
</mets:mets>
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="logical">
<mets:div TYPE="Testing a path with spaces" LABEL="Archivemtica Tests">
<mets:div TYPE="binary" LABEL="Complete test">
<mets:fptr FILEID="FILE001" CONTENTIDS="objects/dir-with-dashes/file with spaces.bin"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="logical">
<mets:div TYPE="book" LABEL="How to create a hierarchical book">
<mets:div TYPE="page" LABEL="Cover">
<mets:fptr FILEID="test_file.png" CONTENTIDS="objects/test_file.png"/>
</mets:div>
<mets:div TYPE="chapter" LABEL="Chapter 1">
<mets:div TYPE="page" LABEL="Page 1">
<mets:fptr FILEID="test_file.jpg" CONTENTIDS="objects/test_file.jpg"/>
</mets:div>
</mets:div>
<!-- chapter -->
</mets:div>
<!-- book -->
</mets:structMap>
</mets:mets>
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="utf-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
<mets:structMap TYPE="lógico" ID="custom_structmap">
<mets:div TYPE="libro" LABEL="Cómo crear un libro jerárquico">
<mets:div TYPE="página" LABEL="Cubierta interior">
<mets:fptr FILEID="página_de_prueba.png" CONTENTIDS="objects/página_de_prueba.png"/>
</mets:div>
<mets:div TYPE="capítulo" LABEL="Capítulo 1">
<mets:div TYPE="página" LABEL="Página 1">
<mets:fptr FILEID="página_de_prueba.jpg" CONTENTIDS="objects/página_de_prueba.jpg"/>
</mets:div>
</mets:div>
<!-- capítulo 1 -->
</mets:div>
<!-- book -->
</mets:structMap>
</mets:mets>
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading