diff --git a/readux_ingest_ecds/admin.py b/readux_ingest_ecds/admin.py index 6a847e6..4b72bbf 100644 --- a/readux_ingest_ecds/admin.py +++ b/readux_ingest_ecds/admin.py @@ -2,8 +2,8 @@ import logging from django.contrib import admin from django.shortcuts import redirect -from .models import Local -from .tasks import local_ingest_task_ecds +from .models import Local, Bulk +from .tasks import local_ingest_task_ecds, bulk_ingest_task_ecds LOGGER = logging.getLogger(__name__) @@ -30,4 +30,18 @@ def response_add(self, request, obj, post_url_continue=None): class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring model = Local +class BulkAdmin(admin.ModelAdmin): + def save_model(self, request, obj, form, change): + LOGGER.info(f'INGEST: Bulk ingest started by {request.user.username}') + obj.creator = request.user + super().save_model(request, obj, form, change) + if os.environ["DJANGO_ENV"] != 'test': # pragma: no cover + bulk_ingest_task_ecds.apply_async(args=[obj.id]) + else: + bulk_ingest_task_ecds(obj.id) + + class Meta: + model = Bulk + admin.site.register(Local, LocalAdmin) +admin.site.register(Bulk, BulkAdmin) \ No newline at end of file diff --git a/readux_ingest_ecds/forms.py b/readux_ingest_ecds/forms.py deleted file mode 100644 index ef43df6..0000000 --- a/readux_ingest_ecds/forms.py +++ /dev/null @@ -1,11 +0,0 @@ -from django import forms -from django.forms import ClearableFileInput -from .models import Bulk - -class BulkVolumeUploadForm(forms.ModelForm): - class Meta: - model = Bulk - fields = ['image_server', 'volume_files', 'collections'] - widgets = { - 'volume_files': ClearableFileInput(attrs={'allow_multiple_selected': True}), - } diff --git a/readux_ingest_ecds/migrations/0002_auto_20240201_1406.py b/readux_ingest_ecds/migrations/0002_auto_20240201_1406.py new file mode 100644 index 0000000..7ed71a2 --- /dev/null +++ b/readux_ingest_ecds/migrations/0002_auto_20240201_1406.py @@ -0,0 +1,42 @@ +# Generated by Django 3.2.23 on 
2024-02-01 14:06 + +from django.conf import settings +import django.core.files.storage +from django.db import migrations, models +import django.db.models.deletion +import readux_ingest_ecds.models +import uuid + +class Migration(migrations.Migration): + + dependencies = [ + ('readux_ingest_ecds', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='VolumeFile', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), + ('file', models.FileField(storage=django.core.files.storage.FileSystemStorage(location='tmp'), upload_to=readux_ingest_ecds.models.bulk_path)), + ], + ), + migrations.AddField( + model_name='local', + name='bundle_path', + field=models.CharField(blank=True, max_length=1000), + ), + migrations.CreateModel( + name='Bulk', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('collections', models.ManyToManyField(blank=True, help_text='Optional: Collections to attach to the volume ingested in this form.', related_name='ecds_bulk_ingest_collections', to='iiif.Collection')), + ('creator', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='ecds_bulk_ingest_created_locals', to=settings.AUTH_USER_MODEL)), + ('image_server', models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='ecds_bulk_ingest_image_server', to='iiif.imageserver')), + ('volume_files', models.ManyToManyField(blank=True, to='readux_ingest_ecds.VolumeFile')), + ], + options={ + 'verbose_name_plural': 'Bulk', + }, + ), + ] diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py index c1456f5..0989423 100644 --- a/readux_ingest_ecds/models.py +++ b/readux_ingest_ecds/models.py @@ -1,5 +1,6 @@ import os import logging +import uuid from zipfile import ZipFile from django.core.files.storage import FileSystemStorage from django.db import models @@ -7,6 +8,7 @@ from .services.file_services 
import is_image, is_ocr, is_junk, move_image_file, move_ocr_file, canvas_dimensions, upload_trigger_file from .services.iiif_services import create_manifest from .services.metadata_services import metadata_from_file, clean_metadata +from .services.ocr_services import add_ocr_to_canvases from .helpers import get_iiif_models Manifest = get_iiif_models()['Manifest'] @@ -19,6 +21,9 @@ location=settings.INGEST_TMP_DIR ) +def bulk_path(instance, filename): + return os.path.join(str(instance.id), filename ) + class IngestAbstractModel(models.Model): metadata = models.JSONField(default=dict, blank=True) manifest = models.ForeignKey( @@ -56,6 +61,8 @@ class Local(IngestAbstractModel): storage=tmp_storage ) + bundle_path = models.CharField(blank=True, max_length=1000) + class Meta: verbose_name_plural = 'Local' @@ -101,7 +108,9 @@ def ingest(self): def unzip_bundle(self): open(self.trigger_file, 'a').close() - with ZipFile(self.bundle, 'r') as zip_ref: + bundle_to_unzip = self.bundle_path if self.bundle_path else self.bundle + + with ZipFile(bundle_to_unzip, 'r') as zip_ref: for member in zip_ref.infolist(): file_name = member.filename @@ -161,7 +170,7 @@ def open_metadata(self): if metadata_file is None or os.path.exists(metadata_file) is False: return - self.metadata = metadata_from_file(metadata_file) + self.metadata = metadata_from_file(metadata_file)[0] def create_canvases(self): Canvas = get_iiif_models()['Canvas'] @@ -192,3 +201,59 @@ def create_canvases(self): ) upload_trigger_file(self.trigger_file) + +class VolumeFile(models.Model): + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + file = models.FileField( + blank=False, + storage=tmp_storage, + upload_to=bulk_path + ) + +class Bulk(models.Model): + collections = models.ManyToManyField( + Collection, + blank=True, + help_text="Optional: Collections to attach to the volume ingested in this form.", + related_name='ecds_bulk_ingest_collections' + ) + image_server = models.ForeignKey( + 
ImageServer, + on_delete=models.DO_NOTHING, + null=True, + related_name='ecds_bulk_ingest_image_server' + ) + creator = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.SET_NULL, + null=True, + related_name='ecds_bulk_ingest_created_locals' + ) + volume_files = models.ManyToManyField(VolumeFile, blank=True) + + class Meta: + verbose_name_plural = 'Bulk' + + def ingest(self): + LOGGER.info('Ingesting Bulk') + for uploaded_file in self.volume_files.all(): + if os.path.splitext(os.path.basename(uploaded_file.file.name))[0] == 'metadata': + metadata = metadata_from_file(uploaded_file.file.path) + for volume in metadata: + bundle_filename = [d['value'] for d in volume['metadata'] if d['label'].casefold() == 'filename'][0] + try: + bundle = self.volume_files.all().get(file__contains=bundle_filename) + if os.path.exists(bundle.file.path) and bundle.file.name.endswith('.zip'): + local = Local.objects.create( + metadata=volume, + bundle_path=bundle.file.path, + image_server=self.image_server, + creator=self.creator + ) + local.prep() + local.ingest() + add_ocr_to_canvases(local.manifest) + except VolumeFile.DoesNotExist: + pass + self.volume_files.all().delete() + self.delete() diff --git a/readux_ingest_ecds/services/iiif_services.py b/readux_ingest_ecds/services/iiif_services.py index 1590484..af60b69 100644 --- a/readux_ingest_ecds/services/iiif_services.py +++ b/readux_ingest_ecds/services/iiif_services.py @@ -24,7 +24,7 @@ def create_manifest(ingest): metadata = None if metadata: if 'pid' in metadata: - manifest, _ = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-')) + manifest, _ = Manifest.objects.get_or_create(pid=metadata['pid']) else: manifest = Manifest.objects.create() for (key, value) in metadata.items(): @@ -34,7 +34,7 @@ def create_manifest(ingest): else: # all other keys should exist as fields on Manifest (for now) setattr(manifest, key, value) - # TODO: if the key doesn't exist on Manifest model, add it to 
Manifest.metadata + # If the key doesn't exist on Manifest model, add it to Manifest.metadata else: manifest = Manifest() diff --git a/readux_ingest_ecds/services/metadata_services.py b/readux_ingest_ecds/services/metadata_services.py index 4e324b1..3b626dc 100644 --- a/readux_ingest_ecds/services/metadata_services.py +++ b/readux_ingest_ecds/services/metadata_services.py @@ -89,17 +89,19 @@ def metadata_from_file(metadata_file): if format is None: return - metadata = None + metadata = [] + metadata_set = None if format == 'excel': with open(metadata_file, 'rb') as fh: - metadata = Dataset().load(fh.read(), format=metadata_file.split('.')[-1]) + metadata_set = Dataset().load(fh.read(), format=metadata_file.split('.')[-1]) else: with open(metadata_file, 'r', encoding="utf-8-sig") as fh: - metadata = Dataset().load(fh.read(), format=format) + metadata_set = Dataset().load(fh.read(), format=format) - if metadata is not None: - metadata = clean_metadata(metadata.dict[0]) + if metadata_set is not None: + for row in metadata_set.dict: + metadata.append(clean_metadata(row)) return metadata diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py index 8cefb5e..1192357 100644 --- a/readux_ingest_ecds/services/ocr_services.py +++ b/readux_ingest_ecds/services/ocr_services.py @@ -375,6 +375,7 @@ def parse_xml_ocr(result): return None def add_ocr_annotations(canvas, ocr): + OCR = get_iiif_models()['OCR'] word_order = 1 annotations = [] for word in ocr: @@ -483,3 +484,15 @@ def is_tsv(to_test): if len(as_str.split('\t')) > 1: return True return False + +def add_ocr_to_canvases(manifest): + OCR = get_iiif_models()['OCR'] + for canvas in manifest.canvas_set.all(): + ocr = get_ocr(canvas) + if ocr is not None: + add_ocr_annotations(canvas, ocr) + # The add_ocr_annotations method uses bulk_create() which does not call save() on the model. + # Calling save() is really slow and I don't know why. 
Calling save() after the annotation + # has been created is as fast as expected. + [ocr.save() for ocr in OCR.objects.filter(canvas=canvas)] + canvas.save() # trigger reindex diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py index d991d1e..46e7185 100644 --- a/readux_ingest_ecds/tasks.py +++ b/readux_ingest_ecds/tasks.py @@ -6,11 +6,12 @@ from django.apps import apps from django.conf import settings from .helpers import get_iiif_models -from .services.ocr_services import get_ocr, add_ocr_annotations +from .services.ocr_services import add_ocr_to_canvases # Use `apps.get_model` to avoid circular import error. Because the parameters used to # create a background task have to be serializable, we can't just pass in the model object. Local = apps.get_model('readux_ingest_ecds.local') # pylint: disable = invalid-name +Bulk = apps.get_model('readux_ingest_ecds.bulk') # pylint: disable = invalid-name Manifest = get_iiif_models()['Manifest'] Canvas = get_iiif_models()['Canvas'] @@ -35,17 +36,19 @@ def local_ingest_task_ecds(ingest_id): else: add_ocr_task(local_ingest.manifest.pk) +@app.task(name='bulk_ingest_task_ecds', autoretry_for=(Exception,), retry_backoff=True, max_retries=20) +def bulk_ingest_task_ecds(ingest_id): + """Background task to start the bulk ingest process. + + :param ingest_id: Primary key for .models.Bulk object + :type ingest_id: int + + """ + bulk_ingest = Bulk.objects.get(pk=ingest_id) + bulk_ingest.ingest() @app.task(name='ingest_ocr_to_canvas', autoretry_for=(Manifest.DoesNotExist,), retry_backoff=5) def add_ocr_task(manifest_id, *args, **kwargs): """Function for parsing and adding OCR.""" manifest = Manifest.objects.get(pk=manifest_id) - for canvas in manifest.canvas_set.all(): - ocr = get_ocr(canvas) - if ocr is not None: - add_ocr_annotations(canvas, ocr) - # The add_ocr_annotations method uses bulk_create() which does not call save() on the model. - # Calling save() is really slow and I don't know why. 
Calling save() after the annotation - # has been created, calling save is as fast as expected. - [ocr.save() for ocr in OCR.objects.filter(canvas=canvas)] - canvas.save() # trigger reindex + add_ocr_to_canvases(manifest) diff --git a/test_app/fixtures/bulk/metadata.csv b/test_app/fixtures/bulk/metadata.csv new file mode 100644 index 0000000..ceb7b1b --- /dev/null +++ b/test_app/fixtures/bulk/metadata.csv @@ -0,0 +1,5 @@ +Filename,pid +volume1.zip,pid1 +volume2.zip,pid2 +volume3.zip,pid3 + diff --git a/test_app/fixtures/bulk/volume1.zip b/test_app/fixtures/bulk/volume1.zip new file mode 100644 index 0000000..8896492 Binary files /dev/null and b/test_app/fixtures/bulk/volume1.zip differ diff --git a/test_app/fixtures/bulk/volume2.zip b/test_app/fixtures/bulk/volume2.zip new file mode 100644 index 0000000..8896492 Binary files /dev/null and b/test_app/fixtures/bulk/volume2.zip differ diff --git a/test_app/fixtures/bulk/volume3.zip b/test_app/fixtures/bulk/volume3.zip new file mode 100644 index 0000000..8896492 Binary files /dev/null and b/test_app/fixtures/bulk/volume3.zip differ diff --git a/test_app/fixtures/metadata.csv b/test_app/fixtures/metadata.csv index 35814bd..421020d 100644 --- a/test_app/fixtures/metadata.csv +++ b/test_app/fixtures/metadata.csv @@ -1,2 +1,3 @@ Filename,Label,Summary,Author,Published city,Published date,Publisher -no_meta_file,Test Bundle,Test file,Test author,Test City,2021,Pubilsher test \ No newline at end of file +no_meta_file,Test Bundle,Test file,Test author,Test City,2021,Pubilsher test + diff --git a/test_app/fixtures/multi_volume_metadata.csv b/test_app/fixtures/multi_volume_metadata.csv new file mode 100644 index 0000000..f3b1674 --- /dev/null +++ b/test_app/fixtures/multi_volume_metadata.csv @@ -0,0 +1,3 @@ +PID,Filename,Label,Summary,Author,Published city,Published date,Publisher +pid1,no_meta_file,Test Bundle,Test file,Test author,Test City,2021,Pubilsher test + diff --git a/test_app/iiif/migrations/0001_initial.py 
b/test_app/iiif/migrations/0001_initial.py index 2145aad..f2b6373 100644 --- a/test_app/iiif/migrations/0001_initial.py +++ b/test_app/iiif/migrations/0001_initial.py @@ -45,7 +45,8 @@ class Migration(migrations.Migration): migrations.CreateModel( name='Manifest', fields=[ - ('pid', models.CharField(default=uuid.uuid4, max_length=255, primary_key=True, serialize=False)), + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('pid', models.CharField(default=uuid.uuid4, max_length=255, serialize=False)), ('published_city', models.TextField(blank=True, null=True)), ('publisher', models.TextField(blank=True, null=True)), ('metadata', models.JSONField(blank=True, default=dict)), diff --git a/test_app/iiif/migrations/0002_alter_manifest_pid.py b/test_app/iiif/migrations/0002_alter_manifest_pid.py new file mode 100644 index 0000000..abd409f --- /dev/null +++ b/test_app/iiif/migrations/0002_alter_manifest_pid.py @@ -0,0 +1,19 @@ +# Generated by Django 3.2.23 on 2024-02-01 13:40 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('iiif', '0001_initial'), + ] + + operations = [ + migrations.AlterField( + model_name='manifest', + name='pid', + field=models.CharField(default=uuid.uuid4, max_length=255), + ), + ] diff --git a/test_app/iiif/models.py b/test_app/iiif/models.py index 0eca236..cf02619 100644 --- a/test_app/iiif/models.py +++ b/test_app/iiif/models.py @@ -10,7 +10,7 @@ class ImageServer(models.Model): storage_service = models.CharField(max_length=25, default='local') class Manifest(models.Model): - pid = models.CharField(max_length=255, primary_key=True, default=uuid4, editable=True) + pid = models.CharField(max_length=255, default=uuid4, editable=True) image_server = models.ForeignKey(ImageServer, on_delete=models.DO_NOTHING, null=True) collections = models.ManyToManyField(Collection, blank=True, related_name='manifests') published_city = 
models.TextField(null=True, blank=True) diff --git a/test_app/test_app/settings.py b/test_app/test_app/settings.py index 24a81bc..4079a3b 100644 --- a/test_app/test_app/settings.py +++ b/test_app/test_app/settings.py @@ -57,6 +57,7 @@ 'django.contrib.contenttypes', 'django.contrib.messages', 'django_celery_results', + 'django_extensions', 'iiif.apps.IiifConfig', 'readux_ingest_ecds', 'test_app' diff --git a/test_app/tests/factories.py b/test_app/tests/factories.py index f639874..14a44ca 100644 --- a/test_app/tests/factories.py +++ b/test_app/tests/factories.py @@ -3,7 +3,7 @@ from factory.django import DjangoModelFactory, FileField, ImageField from factory import Faker, SubFactory from django.conf import settings -from readux_ingest_ecds.models import Local +from readux_ingest_ecds.models import Local, Bulk, VolumeFile from iiif.models import ImageServer, Manifest, User, Collection class ImageServerFactory(DjangoModelFactory): @@ -34,14 +34,23 @@ class UserFactory(DjangoModelFactory): class Meta: model = User -class LocalFactory(DjangoModelFactory): +class VolumeBundleFileFactory(DjangoModelFactory): + file = FileField(filename='volume2.zip', filepath=join(settings.FIXTURE_DIR, 'bulk', 'volume2.zip')) + class Meta: - model = Local + model = VolumeFile - bundle = FileField(from_path=join(settings.FIXTURE_DIR, 'bundle.zip')) - image_server = SubFactory(ImageServerFactory) - manifest = None +class VolumeMetadataFileFactory(DjangoModelFactory): + file = FileField(filename='metadata.csv', filepath=join(settings.FIXTURE_DIR, 'bulk', 'metadata.csv')) + class Meta: + model = VolumeFile + +class BulkFactory(DjangoModelFactory): + class Meta: + model = Bulk + + image_server = SubFactory(ImageServerFactory) class CollectionFactory(DjangoModelFactory): """ Factory for mocking :class:`apps.iiif.kollections.models.Collection` objects. 
diff --git a/test_app/tests/test_bulk.py b/test_app/tests/test_bulk.py new file mode 100644 index 0000000..9c2de12 --- /dev/null +++ b/test_app/tests/test_bulk.py @@ -0,0 +1,66 @@ +""" Tests for bulk ingest """ +import os +from shutil import rmtree +import pytest +import boto3 +from moto import mock_s3 +from django.test import TestCase +from django.core.files.uploadedfile import SimpleUploadedFile +from django.conf import settings +from .factories import ImageServerFactory, BulkFactory +from readux_ingest_ecds.models import Bulk, VolumeFile + +pytestmark = pytest.mark.django_db(transaction=True) # pylint: disable = invalid-name + +@mock_s3 +class BulkTest(TestCase): + """ Tests for readux_ingest_ecds.models.Bulk """ + def setUp(self): + """ Set instance variables. """ + super().setUp() + rmtree(settings.INGEST_TMP_DIR, ignore_errors=True) + self.fixture_path = os.path.join(settings.FIXTURE_DIR, 'bulk') + self.image_server = ImageServerFactory() + self.ingest_files = [] + + conn = boto3.resource('s3', region_name='us-east-1') + conn.create_bucket(Bucket=settings.INGEST_TRIGGER_BUCKET) + + for bulk_file in os.listdir(self.fixture_path): + ingest_file = VolumeFile.objects.create() + file_to_ingest = SimpleUploadedFile( + name=bulk_file, + content=open( + os.path.join(self.fixture_path, bulk_file), + 'rb' + ).read() + ) + ingest_file.file = file_to_ingest + ingest_file.save() + self.ingest_files.append(ingest_file) + + self.bulk = BulkFactory.create( + image_server = self.image_server, + ) + + self.bulk.volume_files.set(self.ingest_files) + + def teardown_class(): + rmtree(settings.INGEST_TMP_DIR, ignore_errors=True) + + def test_bulk_upload(self): + """ It should upload all files """ + + for ingest_file in self.bulk.volume_files.all(): + assert os.path.exists( + os.path.join( + settings.INGEST_TMP_DIR, + str(ingest_file.pk), + os.path.basename(ingest_file.file.name) + ) + ) + + def test_bulk_ingest(self): + self.bulk.ingest() + + assert 
os.path.isfile(os.path.join(settings.INGEST_PROCESSING_DIR, 'pid3_00000005.jpg')) diff --git a/test_app/tests/test_bulk_admin.py b/test_app/tests/test_bulk_admin.py new file mode 100644 index 0000000..4da309b --- /dev/null +++ b/test_app/tests/test_bulk_admin.py @@ -0,0 +1,58 @@ +import os +from shutil import rmtree +import boto3 +from moto import mock_s3 +from django.contrib.admin.sites import AdminSite +from django.test import TestCase +from django.core.files.uploadedfile import SimpleUploadedFile +from django.test.client import RequestFactory +from django.conf import settings +from readux_ingest_ecds.admin import BulkAdmin +from readux_ingest_ecds.models import Bulk, VolumeFile +from .factories import BulkFactory, UserFactory + +@mock_s3 +class BulkIngestAdminTest(TestCase): + """ Tests Ingest Admin """ + def setUp(self): + """ Set instance variables. """ + conn = boto3.resource('s3', region_name='us-east-1') + conn.create_bucket(Bucket=settings.INGEST_TRIGGER_BUCKET) + + self.fixture_path = os.path.join(settings.FIXTURE_DIR, 'bulk') + self.request_factory = RequestFactory() + self.user = UserFactory.create(is_superuser=True) + self.bulk = BulkFactory.create() + metadata_file = VolumeFile.objects.create( + file=SimpleUploadedFile( + name='metadata.csv', + content=open( + os.path.join(self.fixture_path, 'metadata.csv'), + 'rb' + ).read() + ) + ) + self.bundle_file = VolumeFile.objects.create( + file=SimpleUploadedFile( + name='volume2.zip', + content=open( + os.path.join(self.fixture_path, 'volume2.zip'), + 'rb' + ).read() + ) + ) + + self.bulk.volume_files.add(metadata_file) + self.bulk.volume_files.add(self.bundle_file) + + + # def teardown_class(): + # rmtree(settings.INGEST_TMP_DIR, ignore_errors=True) + + def test_bulk_admin_save(self): + req = self.request_factory.post('/admin/readux_ingest_ecds/bulk/add/', data={}) + req.user = self.user + bulk_model_admin = BulkAdmin(model=Bulk, admin_site=AdminSite()) + bulk_model_admin.save_model(obj=self.bulk, 
request=req, form=None, change=None) + + assert os.path.isfile(os.path.join(settings.INGEST_PROCESSING_DIR, 'pid2_00000005.jpg')) diff --git a/test_app/tests/test_admin.py b/test_app/tests/test_local_admin.py similarity index 99% rename from test_app/tests/test_admin.py rename to test_app/tests/test_local_admin.py index ca16159..aa5a197 100644 --- a/test_app/tests/test_admin.py +++ b/test_app/tests/test_local_admin.py @@ -16,7 +16,7 @@ from readux_ingest_ecds.admin import LocalAdmin @mock_s3 -class IngestAdminTest(TestCase): +class LocalIngestAdminTest(TestCase): """ Tests Ingest Admin """ def setUp(self): """ Set instance variables. """