-
Notifications
You must be signed in to change notification settings - Fork 108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixes to enable custom structmaps to be more expressive #1404
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,6 +73,7 @@ | |
) | ||
from custom_handlers import get_script_logger | ||
import namespaces as ns | ||
from sanitize_names import sanitizeName | ||
|
||
from bagit import Bag, BagError | ||
|
||
|
@@ -872,39 +873,61 @@ def getAMDSec( | |
return ret | ||
|
||
|
||
def getIncludedStructMap(job, baseDirectoryPath, state): | ||
def _fixup_path_input_by_user(job, path): | ||
"""Fix-up paths submitted by a user, e.g. in custom structmap examples so | ||
that they don't have to anticipate the Archivematica normalization process. | ||
""" | ||
return os.path.join( | ||
"", *[sanitizeName(name.encode("utf8")) for name in path.split(os.path.sep)] | ||
) | ||
|
||
|
||
def include_custom_structmap( | ||
job, baseDirectoryPath, state, custom_structmap="mets_structmap.xml" | ||
): | ||
"""Enable users in submitting a structmap with a transfer and have that | ||
included in the eventual AIP METS. | ||
""" | ||
ret = [] | ||
transferMetadata = os.path.join(baseDirectoryPath, "objects/metadata/transfers") | ||
transferMetadata = os.path.join( | ||
baseDirectoryPath, os.path.join("objects", "metadata", "transfers") | ||
) | ||
if not os.path.isdir(transferMetadata): | ||
return [] | ||
return ret | ||
baseLocations = os.listdir(transferMetadata) | ||
baseLocations.append(baseDirectoryPath) | ||
for dir_ in baseLocations: | ||
dirPath = os.path.join(transferMetadata, dir_) | ||
structMapXmlPath = os.path.join(dirPath, "mets_structmap.xml") | ||
structMapXmlPath = os.path.join(dirPath, custom_structmap) | ||
if not os.path.isdir(dirPath): | ||
continue | ||
if os.path.isfile(structMapXmlPath): | ||
tree = etree.parse(structMapXmlPath) | ||
root = ( | ||
tree.getroot() | ||
) # TDOD - not root to return, but sub element structMap | ||
# print etree.tostring(root) | ||
root = tree.getroot() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just to confirm, does metsrw not yet work for this case? |
||
structMap = root.find(ns.metsBNS + "structMap") | ||
id_ = structMap.get("ID") | ||
if not id_: | ||
structMap.set("ID", "structMap_2") | ||
state.globalStructMapCounter += 1 | ||
structMap.set("ID", "structMap_{}".format(state.globalStructMapCounter)) | ||
ret.append(structMap) | ||
for item in structMap.findall(".//" + ns.metsBNS + "fptr"): | ||
fileName = item.get("FILEID") | ||
if fileName in state.fileNameToFileID: | ||
# print fileName, " -> ", state.fileNameToFileID[fileName] | ||
item.set("FILEID", state.fileNameToFileID[fileName]) | ||
fileids = structMap.xpath( | ||
"//*[@CONTENTIDS]", namespaces={"mets:": ns.metsNS} | ||
) | ||
for item in fileids: | ||
file_path = item.get("CONTENTIDS") | ||
normalized_path = _fixup_path_input_by_user(job, file_path) | ||
if normalized_path in state.fileNameToFileID: | ||
item.set("FILEID", state.fileNameToFileID[normalized_path]) | ||
else: | ||
job.pyprint("error: no fileUUID for ", fileName, file=sys.stderr) | ||
job.pyprint( | ||
"Custom structmap error: no fileUUID for", | ||
file_path, | ||
normalized_path, | ||
file=sys.stderr, | ||
) | ||
state.error_accumulator.error_count += 1 | ||
if state.trimStructMap is not None: | ||
ret.append(state.trimStructMap) | ||
if ret: | ||
job.pyprint("Custom structmap will be included in AIP METS") | ||
return ret | ||
|
||
|
||
|
@@ -1064,7 +1087,9 @@ def createFileSec( | |
structMapDiv, ns.metsBNS + "div", LABEL=label, TYPE="Item" | ||
) | ||
etree.SubElement(fileDiv, ns.metsBNS + "fptr", FILEID=fileId) | ||
state.fileNameToFileID[item] = fileId | ||
# Pair items listed in custom structmaps. Strip leading path | ||
# separator if it exists. | ||
state.fileNameToFileID[directoryPathSTR] = fileId | ||
|
||
# Determine fileGrp @GROUPID based on the file's fileGrpUse and transfer type | ||
GROUPID = "" | ||
|
@@ -1787,10 +1812,13 @@ def call(jobs): | |
if normativeStructMap is not None: | ||
root.append(normativeStructMap) | ||
|
||
for structMapIncl in getIncludedStructMap( | ||
for custom_structmap in include_custom_structmap( | ||
job, baseDirectoryPath, state | ||
): | ||
root.append(structMapIncl) | ||
root.append(custom_structmap) | ||
|
||
if state.trimStructMap is not None: | ||
root.append(state.trimStructMap) | ||
|
||
arranged_structmap = build_arranged_structmap( | ||
job, structMap, fileGroupIdentifier | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,34 @@ | ||
#!/usr/bin/env python2 | ||
# -*- coding: utf8 | ||
|
||
"""verify_mets.py | ||
|
||
Verify METS documents provided to the script. Its first, and primary use so | ||
far is to verify the validity of custom structmaps included with transfers and | ||
supplied on ingest after appraisal. | ||
""" | ||
from __future__ import unicode_literals | ||
from lxml import etree | ||
import os | ||
|
||
from executeOrRunSubProcess import executeOrRun | ||
|
||
class VerifyMETSException(Exception): | ||
"""Exception to raise if METS validation fails.""" | ||
|
||
def call(jobs): | ||
command = os.path.join( | ||
os.path.dirname(os.path.realpath(__file__)), "archivematicaVerifyMets.sh" | ||
) | ||
|
||
def call(jobs): | ||
"""Primary entry point for this script.""" | ||
for job in jobs: | ||
with job.JobContext(): | ||
exit_code, std_out, std_error = executeOrRun( | ||
"command", [command] + job.args[1:], printing=True, capture_output=True | ||
) | ||
|
||
job.write_error(std_error) | ||
job.write_output(std_out) | ||
job.set_status(exit_code) | ||
mets_structmap = os.path.join(job.args[1], "metadata", "mets_structmap.xml") | ||
mets_xsd = job.args[2] | ||
if not os.path.isfile(mets_structmap): | ||
job.pyprint("Custom structmap not supplied with package") | ||
return | ||
if not os.path.isfile(mets_xsd): | ||
raise (VerifyMETSException("METS asset is unavailable")) | ||
xmlschema = etree.XMLSchema(etree.parse(mets_xsd)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just to confirm, does |
||
# Raise an exception if not valid, e.g. etree.DocumentInvalid | ||
# otherwise, the document validates correctly and returns. | ||
xmlschema.assertValid(etree.parse(mets_structmap)) | ||
job.pyprint("Custom structmap validated correctly") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="logical"> | ||
<mets:div TYPE="Short Listen: Ferdinand, the Misunderstood Bull by With Good Reason" LABEL="documentary"> | ||
<mets:div TYPE="track" LABEL="Complete documentary"> | ||
<!-- 001 is a valid IDREF value used for FILEID --> | ||
<mets:fptr FILEID="001" CONTENTIDS="objects/test_file.flac"/> | ||
</mets:div> | ||
</mets:div> | ||
</mets:structMap> | ||
</mets:mets> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="logical"> | ||
<mets:div TYPE="book" LABEL="How to create a hierarchical book"> | ||
<mets:div TYPE="page" LABEL="Cover"> | ||
<mets:fptr FILEID="FILE000" CONTENTIDS="objects/duplicate_file_name.png"/> | ||
</mets:div> | ||
<mets:div TYPE="chapter" LABEL="Chapter 1"> | ||
<mets:div TYPE="page" LABEL="Page 1"> | ||
<mets:fptr FILEID="FILE001" CONTENTIDS="objects/nested_dir/duplicate_file_name.png"/> | ||
</mets:div> | ||
</mets:div> | ||
<!-- chapter --> | ||
</mets:div> | ||
<!-- book --> | ||
</mets:structMap> | ||
</mets:mets> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="logical"> | ||
<mets:div LABEL="Short Listen: Testing structmaps in Archivematica" TYPE="documentary"> | ||
<mets:div LABEL="Introduction" ORDER="1"> | ||
<mets:fptr FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3"> | ||
<mets:area FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3" BEGIN="00:00:00" END="00:00:17" BETYPE="TIME"/> | ||
</mets:fptr> | ||
</mets:div> | ||
<mets:div LABEL="Archivematica's testing strategy" ORDER="2"> | ||
<mets:fptr FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3"> | ||
<mets:area FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3" BEGIN="00:00:18" END="00:01:13" BETYPE="TIME"/> | ||
</mets:fptr> | ||
</mets:div> | ||
<mets:div LABEL="Outro" ORDER="3"> | ||
<mets:fptr FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3"> | ||
<mets:area FILEID="test_file.mp3" CONTENTIDS="objects/test_file.mp3" BEGIN="00:01:14" END="00:01:33" BETYPE="TIME"/> | ||
</mets:fptr> | ||
</mets:div> | ||
<!-- documentary --> | ||
</mets:div> | ||
</mets:structMap> | ||
</mets:mets> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="logical"> | ||
<mets:div TYPE="Short Listen: Ferdinand, the Misunderstood Bull by With Good Reason" LABEL="documentary"> | ||
<mets:div TYPE="track" LABEL="Complete documentary"> | ||
<mets:fptr FILEID="test_file.flac" CONTENTIDS="objects/test_file.flac"/> | ||
</mets:div> | ||
</mets:div> | ||
</mets:structMap> | ||
</mets:mets> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="logical"> | ||
<mets:div LABEL="Short Listen: Testing structmaps in Archivematica" TYPE="documentary"> | ||
<mets:div LABEL="Introduction" ORDER="1"> | ||
<mets:fptr FILEID="FILE001" CONTENTIDS="objects/nested_dir/nested_file.rdata"> | ||
<mets:area FILEID="FILE001" CONTENTIDS="objects/nested_dir/nested_file.rdata" BEGIN="00:00:00" END="00:00:17" BETYPE="TIME"/> | ||
</mets:fptr> | ||
</mets:div> | ||
<mets:div LABEL="Archivematica's testing strategy" ORDER="2"> | ||
<mets:fptr FILEID="FILE002" CONTENTIDS="objects/nested_dir/nested_file.rdata"> | ||
<mets:area FILEID="FILE002" CONTENTIDS="objects/nested_dir/nested_file.rdata" BEGIN="00:00:18" END="00:01:13" BETYPE="TIME"/> | ||
</mets:fptr> | ||
</mets:div> | ||
<mets:div LABEL="Outro" ORDER="3"> | ||
<mets:fptr FILEID="FILE003" CONTENTIDS="objects/nested_dir/nested_file.rdata"> | ||
<mets:area FILEID="FILE003" CONTENTIDS="objects/nested_dir/nested_file.rdata" BEGIN="00:01:14" END="00:01:33" BETYPE="TIME"/> | ||
</mets:fptr> | ||
</mets:div> | ||
<!-- documentary --> | ||
</mets:div> | ||
</mets:structMap> | ||
</mets:mets> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="logical"> | ||
<mets:div TYPE="Testing a path with spaces" LABEL="Archivemtica Tests"> | ||
<mets:div TYPE="binary" LABEL="Complete test"> | ||
<mets:fptr FILEID="FILE001" CONTENTIDS="objects/dir-with-dashes/file with spaces.bin"/> | ||
</mets:div> | ||
</mets:div> | ||
</mets:structMap> | ||
</mets:mets> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="logical"> | ||
<mets:div TYPE="book" LABEL="How to create a hierarchical book"> | ||
<mets:div TYPE="page" LABEL="Cover"> | ||
<mets:fptr FILEID="test_file.png" CONTENTIDS="objects/test_file.png"/> | ||
</mets:div> | ||
<mets:div TYPE="chapter" LABEL="Chapter 1"> | ||
<mets:div TYPE="page" LABEL="Page 1"> | ||
<mets:fptr FILEID="test_file.jpg" CONTENTIDS="objects/test_file.jpg"/> | ||
</mets:div> | ||
</mets:div> | ||
<!-- chapter --> | ||
</mets:div> | ||
<!-- book --> | ||
</mets:structMap> | ||
</mets:mets> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<mets:mets xmlns:mets="http://www.loc.gov/METS/"> | ||
<mets:structMap TYPE="lógico" ID="custom_structmap"> | ||
<mets:div TYPE="libro" LABEL="Cómo crear un libro jerárquico"> | ||
<mets:div TYPE="página" LABEL="Cubierta interior"> | ||
<mets:fptr FILEID="página_de_prueba.png" CONTENTIDS="objects/página_de_prueba.png"/> | ||
</mets:div> | ||
<mets:div TYPE="capítulo" LABEL="Capítulo 1"> | ||
<mets:div TYPE="página" LABEL="Página 1"> | ||
<mets:fptr FILEID="página_de_prueba.jpg" CONTENTIDS="objects/página_de_prueba.jpg"/> | ||
</mets:div> | ||
</mets:div> | ||
<!-- capítulo 1 --> | ||
</mets:div> | ||
<!-- book --> | ||
</mets:structMap> | ||
</mets:mets> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The idea here is that these files will already have been sanitized, so we're just matching up to the current state on the filesystem/database, correct?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I missed this @cole, yep! That's right. We'll have to keep this in mind when you complete your work on that microservice job.