Skip to content

Commit

Permalink
Add type hints to the characterize_file script
Browse files Browse the repository at this point in the history
This also removes an unnecessary argument from the workflow definition
and adds a typed argument parser to the script.
  • Loading branch information
replaceafill committed Aug 20, 2024
1 parent 82be9b0 commit 8742013
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 50 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,13 @@ warn_unused_configs = true
[[tool.mypy.overrides]]
module = [
"src.MCPClient.lib.client.*",
"src.MCPClient.lib.clientScripts.characterize_file",
"src.MCPClient.lib.clientScripts.identify_file_format",
"src.MCPClient.lib.clientScripts.normalize",
"src.MCPClient.lib.clientScripts.policy_check",
"src.MCPClient.lib.clientScripts.validate_file",
"tests.MCPClient.conftest",
"tests.MCPClient.test_characterize_file",
"tests.MCPClient.test_identify_file_format",
"tests.MCPClient.test_normalize",
"tests.MCPClient.test_policy_check",
Expand Down
43 changes: 34 additions & 9 deletions src/MCPClient/lib/clientScripts/characterize_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,42 @@
#
# If a tool has no defined characterization commands, then the default
# will be run instead (currently FITS).
import argparse
import dataclasses
import multiprocessing
import uuid
from typing import List

import django
from lxml import etree

django.setup()

from client.job import Job
from databaseFunctions import insertIntoFPCommandOutput
from dicts import ReplacementDict
from dicts import replace_string_values
from django.conf import settings as mcpclient_settings
from django.core.exceptions import ValidationError
from django.db import transaction

# archivematicaCommon
from executeOrRunSubProcess import executeOrRun
from fpr.models import FormatVersion
from fpr.models import FPRule
from lib import setup_dicts

# dashboard
from lxml import etree
from main.models import FPCommandOutput


def concurrent_instances():
@dataclasses.dataclass
class CharacterizeFileArgs:
file_uuid: uuid.UUID
sip_uuid: uuid.UUID


def concurrent_instances() -> int:
return multiprocessing.cpu_count()


def main(job, file_path, file_uuid, sip_uuid):
def main(job: Job, file_uuid: uuid.UUID, sip_uuid: uuid.UUID) -> int:
setup_dicts(mcpclient_settings)

failed = False
Expand Down Expand Up @@ -117,8 +125,25 @@ def main(job, file_path, file_uuid, sip_uuid):
return 0


def call(jobs):
def get_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Identify file formats.")
parser.add_argument("file_uuid", type=uuid.UUID)
parser.add_argument("sip_uuid", type=uuid.UUID)

return parser


def parse_args(parser: argparse.ArgumentParser, job: Job) -> CharacterizeFileArgs:
namespace = parser.parse_args(job.args[1:])

return CharacterizeFileArgs(**vars(namespace))


def call(jobs: List[Job]) -> None:
parser = get_parser()

with transaction.atomic():
for job in jobs:
with job.JobContext():
job.set_status(main(job, *job.args[1:]))
args = parse_args(parser, job)
job.set_status(main(job, args.file_uuid, args.sip_uuid))
6 changes: 3 additions & 3 deletions src/MCPServer/lib/assets/workflow.json
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@
"config": {
"@manager": "linkTaskManagerFiles",
"@model": "StandardTaskConfig",
"arguments": "\"%relativeLocation%\" \"%fileUUID%\" \"%SIPUUID%\"",
"arguments": "\"%fileUUID%\" \"%SIPUUID%\"",
"execute": "characterizeFile_v0.0",
"filter_subdir": "objects/metadata/"
},
Expand Down Expand Up @@ -3480,7 +3480,7 @@
"config": {
"@manager": "linkTaskManagerFiles",
"@model": "StandardTaskConfig",
"arguments": "\"%relativeLocation%\" \"%fileUUID%\" \"%SIPUUID%\"",
"arguments": "\"%fileUUID%\" \"%SIPUUID%\"",
"execute": "characterizeFile_v0.0",
"filter_subdir": "objects"
},
Expand Down Expand Up @@ -3703,7 +3703,7 @@
"config": {
"@manager": "linkTaskManagerFiles",
"@model": "StandardTaskConfig",
"arguments": "\"%relativeLocation%\" \"%fileUUID%\" \"%SIPUUID%\"",
"arguments": "\"%fileUUID%\" \"%SIPUUID%\"",
"execute": "characterizeFile_v0.0",
"filter_subdir": "objects/submissionDocumentation"
},
Expand Down
5 changes: 3 additions & 2 deletions src/archivematicaCommon/lib/dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import ast
import os
import re
import uuid

from main import models

Expand Down Expand Up @@ -83,9 +84,9 @@ def frommodel(type_="file", sip=None, file_=None, expand_path=True):
# In order to make this code accessible to MCPServer,
# we need to support passing in UUID strings instead
# of models.
if isinstance(file_, str):
if isinstance(file_, (str, uuid.UUID)):
file_ = models.File.objects.get(uuid=file_)
if isinstance(sip, str):
if isinstance(sip, (str, uuid.UUID)):
# sip can be a SIP or Transfer
try:
sip = models.SIP.objects.get(uuid=sip)
Expand Down
76 changes: 40 additions & 36 deletions tests/MCPClient/test_characterize_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@


@pytest.fixture
def rule_with_xml_output_format(format_version):
def rule_with_xml_output_format(
format_version: fprmodels.FormatVersion,
) -> fprmodels.FPRule:
return fprmodels.FPRule.objects.create(
command=fprmodels.FPCommand.objects.create(
output_format=fprmodels.FormatVersion.objects.get(pronom_id="fmt/101")
Expand All @@ -19,25 +21,28 @@ def rule_with_xml_output_format(format_version):


@pytest.fixture
def fpcommand_output(sip_file, fprule_characterization):
def fpcommand_output(
sip_file: models.File, fprule_characterization: fprmodels.FPRule
) -> models.FPCommandOutput:
return models.FPCommandOutput.objects.create(
file=sip_file, rule=fprule_characterization
)


@pytest.fixture
def delete_characterization_rules(db):
def delete_characterization_rules() -> None:
fprmodels.FPRule.objects.filter(
purpose__in=["characterization", "default_characterization"]
).delete()


@pytest.mark.django_db
def test_job_succeeds_if_file_is_already_characterized(sip_file, sip, fpcommand_output):
def test_job_succeeds_if_file_is_already_characterized(
sip_file: models.File, sip: models.SIP, fpcommand_output: models.FPCommandOutput
) -> None:
job = mock.Mock(
args=[
"characterize_file",
"file_path_not_used",
str(sip_file.uuid),
str(sip.uuid),
],
Expand All @@ -52,12 +57,11 @@ def test_job_succeeds_if_file_is_already_characterized(sip_file, sip, fpcommand_

@pytest.mark.django_db
def test_job_succeeds_if_no_characterization_rules_exist(
sip_file, sip, delete_characterization_rules
):
sip_file: models.File, sip: models.SIP, delete_characterization_rules: None
) -> None:
job = mock.Mock(
args=[
"characterize_file",
"file_path_not_used",
str(sip_file.uuid),
str(sip.uuid),
],
Expand All @@ -78,13 +82,13 @@ def test_job_succeeds_if_no_characterization_rules_exist(
)
@mock.patch("characterize_file.executeOrRun")
def test_job_executes_command(
execute_or_run,
script_type,
sip_file,
sip,
fprule_characterization,
sip_file_format_version,
):
execute_or_run: mock.Mock,
script_type: str,
sip_file: models.File,
sip: models.SIP,
fprule_characterization: fprmodels.FPRule,
sip_file_format_version: models.FileFormatVersion,
) -> None:
fprule_characterization.command.script_type = script_type
fprule_characterization.command.save()
exit_code = 0
Expand All @@ -94,7 +98,6 @@ def test_job_executes_command(
job = mock.Mock(
args=[
"characterize_file",
"file_path_not_used",
str(sip_file.uuid),
str(sip.uuid),
],
Expand All @@ -117,16 +120,19 @@ def test_job_executes_command(
@pytest.mark.django_db
@mock.patch("characterize_file.executeOrRun")
def test_job_fails_if_command_fails(
execute_or_run, sip_file, sip, fprule_characterization, sip_file_format_version
):
execute_or_run: mock.Mock,
sip_file: models.File,
sip: models.SIP,
fprule_characterization: fprmodels.FPRule,
sip_file_format_version: models.FileFormatVersion,
) -> None:
exit_code = 1
stdout = ""
stderr = "error!"
execute_or_run.return_value = (exit_code, stdout, stderr)
job = mock.Mock(
args=[
"characterize_file",
"file_path_not_used",
str(sip_file.uuid),
str(sip.uuid),
],
Expand All @@ -151,22 +157,21 @@ def test_job_fails_if_command_fails(
@mock.patch("characterize_file.etree")
@mock.patch("characterize_file.executeOrRun")
def test_job_saves_valid_xml_command_output(
execute_or_run,
etree,
insert_into_fp_command_output,
sip_file,
sip,
rule_with_xml_output_format,
sip_file_format_version,
):
execute_or_run: mock.Mock,
etree: mock.Mock,
insert_into_fp_command_output: mock.Mock,
sip_file: models.File,
sip: models.SIP,
rule_with_xml_output_format: fprmodels.FPRule,
sip_file_format_version: models.FileFormatVersion,
) -> None:
exit_code = 0
stdout = "<mock>success</mock>"
stderr = ""
execute_or_run.return_value = (exit_code, stdout, stderr)
job = mock.Mock(
args=[
"characterize_file",
"file_path_not_used",
str(sip_file.uuid),
str(sip.uuid),
],
Expand All @@ -186,27 +191,26 @@ def test_job_saves_valid_xml_command_output(
job.write_error.assert_called_once_with(stderr)
etree.fromstring.assert_called_once_with(stdout.encode())
insert_into_fp_command_output.assert_called_once_with(
str(sip_file.uuid), stdout, rule_with_xml_output_format.uuid
sip_file.uuid, stdout, rule_with_xml_output_format.uuid
)


@pytest.mark.django_db
@mock.patch("characterize_file.executeOrRun")
def test_job_fails_with_invalid_xml_command_output(
execute_or_run,
sip_file,
sip,
rule_with_xml_output_format,
sip_file_format_version,
):
execute_or_run: mock.Mock,
sip_file: models.File,
sip: models.SIP,
rule_with_xml_output_format: fprmodels.FPRule,
sip_file_format_version: models.FileFormatVersion,
) -> None:
exit_code = 0
stdout = "invalid xml"
stderr = ""
execute_or_run.return_value = (exit_code, stdout, stderr)
job = mock.Mock(
args=[
"characterize_file",
"file_path_not_used",
str(sip_file.uuid),
str(sip.uuid),
],
Expand Down

0 comments on commit 8742013

Please sign in to comment.