From b33bb09494d83c072be8a880f58297aed268fc02 Mon Sep 17 00:00:00 2001 From: "Douglas Cerna (Soy Douglas)" Date: Tue, 20 Aug 2024 17:44:42 +0000 Subject: [PATCH 1/2] Add type hints to the transcribe_file script --- pyproject.toml | 2 + .../lib/clientScripts/transcribe_file.py | 75 +++++++++---- tests/MCPClient/test_transcribe_file.py | 102 +++++++++++------- 3 files changed, 117 insertions(+), 62 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b3a778dca..7e04c2b202 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,12 +61,14 @@ module = [ "src.MCPClient.lib.clientScripts.identify_file_format", "src.MCPClient.lib.clientScripts.normalize", "src.MCPClient.lib.clientScripts.policy_check", + "src.MCPClient.lib.clientScripts.transcribe_file", "src.MCPClient.lib.clientScripts.validate_file", "tests.MCPClient.conftest", "tests.MCPClient.test_characterize_file", "tests.MCPClient.test_identify_file_format", "tests.MCPClient.test_normalize", "tests.MCPClient.test_policy_check", + "tests.MCPClient.test_transcribe_file", "tests.MCPClient.test_validate_file", ] check_untyped_defs = true diff --git a/src/MCPClient/lib/clientScripts/transcribe_file.py b/src/MCPClient/lib/clientScripts/transcribe_file.py index e0215a0efb..906db4061c 100755 --- a/src/MCPClient/lib/clientScripts/transcribe_file.py +++ b/src/MCPClient/lib/clientScripts/transcribe_file.py @@ -1,20 +1,25 @@ #!/usr/bin/env python +import argparse +import dataclasses import multiprocessing import os -from uuid import uuid4 +import uuid +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple import django -from django.db import transaction django.setup() -# dashboard + import databaseFunctions import fileOperations - -# archivematicaCommon +from client.job import Job from dicts import ReplacementDict from django.conf import settings as mcpclient_settings from django.core.exceptions import ValidationError +from django.db import transaction from django.utils import timezone from executeOrRunSubProcess import executeOrRun from fpr.models import FPRule @@ -24,11 +29,19 @@ from main.models import FileFormatVersion -def concurrent_instances(): +@dataclasses.dataclass +class TranscribeFileArgs: + task_uuid: uuid.UUID + file_uuid: uuid.UUID + + +def concurrent_instances() -> int: return multiprocessing.cpu_count() -def insert_transcription_event(status, file_uuid, rule, relative_location): +def insert_transcription_event( + status: int, file_uuid: uuid.UUID, rule: FPRule, relative_location: str +) -> str: outcome = "transcribed" if status == 0 else "not transcribed" tool = rule.command.tool @@ -36,7 +49,7 @@ def insert_transcription_event(status, file_uuid, rule, relative_location): tool.description, tool.version, rule.command.command.replace('"', r"\"") ) - event_uuid = str(uuid4()) + event_uuid = str(uuid.uuid4()) databaseFunctions.insertIntoEvents( fileUUID=file_uuid, @@ -51,9 +64,14 @@ def insert_transcription_event(status, file_uuid, rule, relative_location): def insert_file_into_database( - task_uuid, file_uuid, sip_uuid, event_uuid, rule, output_path, relative_path -): - transcription_uuid = str(uuid4()) + task_uuid: uuid.UUID, + file_uuid: uuid.UUID, + sip_uuid: str, + event_uuid: str, + output_path: str, + relative_path: str, +) -> None: + transcription_uuid = str(uuid.uuid4()) today = timezone.now() fileOperations.addFileToSIP( relative_path, @@ -66,7 +84,7 @@ def insert_file_into_database( ) fileOperations.updateSizeAndChecksum( - transcription_uuid, output_path, today, str(uuid4()) + transcription_uuid, output_path, today, str(uuid.uuid4()) ) databaseFunctions.insertIntoDerivations( @@ -76,17 +94,18 @@ def insert_file_into_database( ) -def fetch_rules_for(file_): +def fetch_rules_for(file_: File) -> Sequence[FPRule]: try: format = FileFormatVersion.objects.get(file_uuid=file_) - return FPRule.active.filter( + result: Sequence[FPRule] = FPRule.active.filter( format=format.format_version, purpose="transcription" ) + return result except (FileFormatVersion.DoesNotExist, ValidationError): return [] -def fetch_rules_for_derivatives(file_): +def fetch_rules_for_derivatives(file_: File) -> Tuple[Optional[File], Sequence[FPRule]]: derivs = Derivation.objects.filter(source_file=file_) for deriv in derivs: derived_file = deriv.derived_file @@ -101,7 +120,7 @@ def fetch_rules_for_derivatives(file_): return None, [] -def main(job, task_uuid, file_uuid): +def main(job: Job, task_uuid: uuid.UUID, file_uuid: uuid.UUID) -> int: setup_dicts(mcpclient_settings) succeeded = True @@ -163,7 +182,6 @@ def main(job, task_uuid, file_uuid): file_uuid, rd["%SIPUUID%"], event, - rule, output_path, relative_path, ) @@ -171,11 +189,26 @@ def main(job, task_uuid, file_uuid): return 0 if succeeded else 1 -def call(jobs): +def get_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Transcribe file.") + parser.add_argument("task_uuid", type=uuid.UUID) + parser.add_argument("file_uuid", type=uuid.UUID) + + return parser + + +def parse_args(parser: argparse.ArgumentParser, job: Job) -> TranscribeFileArgs: + namespace = parser.parse_args(job.args[1:]) + + return TranscribeFileArgs(**vars(namespace)) + + +def call(jobs: List[Job]) -> None: + parser = get_parser() + with transaction.atomic(): for job in jobs: with job.JobContext(): - task_uuid = job.args[1] - file_uuid = job.args[2] + args = parse_args(parser, job) - job.set_status(main(job, task_uuid, file_uuid)) + job.set_status(main(job, args.task_uuid, args.file_uuid)) diff --git a/tests/MCPClient/test_transcribe_file.py b/tests/MCPClient/test_transcribe_file.py index 53b405ed07..4f0c6bdb21 100644 --- a/tests/MCPClient/test_transcribe_file.py +++ b/tests/MCPClient/test_transcribe_file.py @@ -2,23 +2,17 @@ from unittest import mock import pytest +import pytest_django import transcribe_file from client.job import Job +from fpr import models as fprmodels from main import models EXECUTE_OR_RUN_STDOUT = "Hello" -@pytest.fixture(scope="module", autouse=True) -def execute_or_run(): - with mock.patch( - "transcribe_file.executeOrRun", return_value=(0, EXECUTE_OR_RUN_STDOUT, "") - ): - yield - - @pytest.fixture() -def sip(sip): +def sip(sip: models.SIP) -> models.SIP: # ReplacementDict expands SIP paths based on the shared directory. sip.currentpath = "%sharedPath%" sip.save() @@ -27,7 +21,7 @@ def sip(sip): @pytest.fixture -def create_sip_file(sip_directory_path, sip_file): +def create_sip_file(sip_directory_path: pathlib.Path, sip_file: models.File) -> None: file_path = pathlib.Path( sip_file.currentlocation.decode().replace("%SIPDirectory%", "") ) @@ -39,7 +33,9 @@ def create_sip_file(sip_directory_path, sip_file): @pytest.fixture -def fpcommand(fpcommand, sip_file): +def fpcommand( + fpcommand: fprmodels.FPCommand, sip_file: models.File +) -> fprmodels.FPCommand: fpcommand.output_location = sip_file.currentlocation.decode() fpcommand.save() @@ -47,29 +43,35 @@ def fpcommand(fpcommand, sip_file): @pytest.fixture -def derivation(sip_file, preservation_file): +def derivation( + sip_file: models.File, preservation_file: models.File +) -> models.Derivation: return models.Derivation.objects.create( source_file=sip_file, derived_file=preservation_file ) @pytest.fixture -def preservation_file_format_version(preservation_file, format_version): +def preservation_file_format_version( + preservation_file: models.File, format_version: fprmodels.FormatVersion +) -> models.FileFormatVersion: return models.FileFormatVersion.objects.create( file_uuid=preservation_file, format_version=format_version ) @pytest.mark.django_db +@mock.patch("transcribe_file.executeOrRun", return_value=(0, EXECUTE_OR_RUN_STDOUT, "")) def test_main( - sip_file, - task, - fprule_transcription, - sip_file_format_version, - settings, - sip_directory_path, - create_sip_file, -): + execute_or_run: mock.Mock, + sip_file: models.File, + task: models.Task, + fprule_transcription: fprmodels.FPRule, + sip_file_format_version: models.FileFormatVersion, + settings: pytest_django.fixtures.SettingsWrapper, + sip_directory_path: pathlib.Path, + create_sip_file: None, +) -> None: job = mock.Mock(spec=Job) settings.SHARED_DIRECTORY = f"{sip_directory_path}/" @@ -115,8 +117,12 @@ def test_main( @pytest.mark.django_db def test_main_if_filegroup_is_not_original( - preservation_file, task, fprule_transcription, sip_file_format_version, capsys -): + preservation_file: models.File, + task: models.Task, + fprule_transcription: fprmodels.FPRule, + sip_file_format_version: models.FileFormatVersion, + capsys: pytest.CaptureFixture[str], +) -> None: job = mock.Mock(spec=Job) result = transcribe_file.main( @@ -133,7 +139,12 @@ def test_main_if_filegroup_is_not_original( @pytest.mark.django_db -def test_main_if_no_rules_exist(sip_file, task, sip_file_format_version, capsys): +def test_main_if_no_rules_exist( + sip_file: models.File, + task: models.Task, + sip_file_format_version: models.FileFormatVersion, + capsys: pytest.CaptureFixture[str], +) -> None: job = mock.Mock(spec=Job) result = transcribe_file.main(job, task_uuid=task.taskuuid, file_uuid=sip_file.uuid) @@ -151,8 +162,10 @@ def test_main_if_no_rules_exist(sip_file, task, sip_file_format_version, capsys) @pytest.mark.django_db def test_fetch_rules_for_derivatives_if_rules_are_absent_for_derivates( - derivation, fprule_transcription, sip_file_format_version -): + derivation: models.Derivation, + fprule_transcription: fprmodels.FPRule, + sip_file_format_version: models.FileFormatVersion, +) -> None: result = transcribe_file.fetch_rules_for_derivatives(file_=derivation.source_file) assert result == (None, []) @@ -160,8 +173,10 @@ def test_fetch_rules_for_derivatives_if_rules_are_absent_for_derivates( @pytest.mark.django_db def test_fetch_rules_for_derivatives( - derivation, fprule_transcription, preservation_file_format_version -): + derivation: models.Derivation, + fprule_transcription: fprmodels.FPRule, + preservation_file_format_version: models.FileFormatVersion, +) -> None: derived_file, rules_of_derived_file = transcribe_file.fetch_rules_for_derivatives( file_=derivation.source_file ) @@ -179,7 +194,9 @@ def test_fetch_rules_for_derivatives( @pytest.fixture -def disabled_fprule_transcription(fprule_transcription): +def disabled_fprule_transcription( + fprule_transcription: fprmodels.FPRule, +) -> fprmodels.FPRule: fprule_transcription.enabled = 0 fprule_transcription.save() @@ -188,8 +205,11 @@ def disabled_fprule_transcription(fprule_transcription): @pytest.mark.django_db def test_main_if_fprule_is_disabled( - sip_file, task, disabled_fprule_transcription, sip_file_format_version -): + sip_file: models.File, + task: models.Task, + disabled_fprule_transcription: fprmodels.FPRule, + sip_file_format_version: models.FileFormatVersion, +) -> None: job = mock.Mock(spec=Job) result = transcribe_file.main(job, task_uuid=task.taskuuid, file_uuid=sip_file.uuid) @@ -234,14 +254,14 @@ def test_main_if_fprule_is_disabled( @pytest.mark.django_db @mock.patch("transcribe_file.executeOrRun") def test_main_if_command_is_not_bash_script( - _execute_or_run, - fpcommand, - sip_file, - task, - fprule_transcription, - sip_file_format_version, -): - _execute_or_run.return_value = (0, fpcommand.command, "") + execute_or_run: mock.Mock, + fpcommand: fprmodels.FPCommand, + sip_file: models.File, + task: models.Task, + fprule_transcription: fprmodels.FPRule, + sip_file_format_version: models.FileFormatVersion, +) -> None: + execute_or_run.return_value = (0, fpcommand.command, "") job = mock.Mock(spec=Job) result = transcribe_file.main(job, task_uuid=task.taskuuid, file_uuid=sip_file.uuid) @@ -254,10 +274,10 @@ def test_main_if_command_is_not_bash_script( assert job.write_output.mock_calls == [mock.call(fpcommand.command)] # executeOrRun is called once. - transcribe_file.executeOrRun.assert_called_once() + execute_or_run.assert_called_once() # Get the call to executeOrRun. - execute_or_run_call = transcribe_file.executeOrRun.mock_calls[0] + execute_or_run_call = execute_or_run.mock_calls[0] call_kwargs = execute_or_run_call[-1] # Ensure the arguments passed is a list. From dd125added6b450cc632ee9331b5460837e388c4 Mon Sep 17 00:00:00 2001 From: "Douglas Cerna (Soy Douglas)" Date: Tue, 20 Aug 2024 17:48:18 +0000 Subject: [PATCH 2/2] Fix argument parser descriptions --- src/MCPClient/lib/clientScripts/characterize_file.py | 2 +- src/MCPClient/lib/clientScripts/normalize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/MCPClient/lib/clientScripts/characterize_file.py b/src/MCPClient/lib/clientScripts/characterize_file.py index 8c1497a17b..f9c0cc239e 100755 --- a/src/MCPClient/lib/clientScripts/characterize_file.py +++ b/src/MCPClient/lib/clientScripts/characterize_file.py @@ -126,7 +126,7 @@ def main(job: Job, file_uuid: uuid.UUID, sip_uuid: uuid.UUID) -> int: def get_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="Identify file formats.") + parser = argparse.ArgumentParser(description="Characterize file.") parser.add_argument("file_uuid", type=uuid.UUID) parser.add_argument("sip_uuid", type=uuid.UUID) diff --git a/src/MCPClient/lib/clientScripts/normalize.py b/src/MCPClient/lib/clientScripts/normalize.py index 453f01ac39..6e240b6b00 100755 --- a/src/MCPClient/lib/clientScripts/normalize.py +++ b/src/MCPClient/lib/clientScripts/normalize.py @@ -578,7 +578,7 @@ def main(job: Job, opts: NormalizeArgs) -> int: def get_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="Identify file formats.") + parser = argparse.ArgumentParser(description="Normalize.") parser.add_argument( "purpose", type=str, help='"preservation", "access", "thumbnail"' )