Skip to content

Commit

Permalink
Add bulk ingest
Browse files Browse the repository at this point in the history
  • Loading branch information
jayvarner committed Feb 2, 2024
1 parent f0bcba8 commit d4b3f04
Show file tree
Hide file tree
Showing 22 changed files with 333 additions and 42 deletions.
18 changes: 16 additions & 2 deletions readux_ingest_ecds/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import logging
from django.contrib import admin
from django.shortcuts import redirect
from .models import Local
from .tasks import local_ingest_task_ecds
from .models import Local, Bulk
from .tasks import local_ingest_task_ecds, bulk_ingest_task_ecds

LOGGER = logging.getLogger(__name__)

Expand All @@ -30,4 +30,18 @@ def response_add(self, request, obj, post_url_continue=None):
class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring
model = Local

class BulkAdmin(admin.ModelAdmin):
    """Admin for :class:`Bulk`; saving an object starts the bulk ingest task."""

    def save_model(self, request, obj, form, change):
        """Save the bulk ingest and hand it to ``bulk_ingest_task_ecds``.

        The requesting user is recorded as ``obj.creator`` before saving.
        Outside of the test environment the task is queued with
        ``apply_async``; under ``DJANGO_ENV == 'test'`` it is called
        synchronously.

        :param request: Current admin request; ``request.user`` becomes the creator
        :param obj: The ``Bulk`` instance being saved
        :param form: The admin form (passed through to the parent class)
        :param change: ``True`` when editing an existing object (passed through)
        """
        # Lazy %-formatting: the message is only built if the record is emitted.
        LOGGER.info('INGEST: Bulk ingest started by %s', request.user.username)
        obj.creator = request.user
        super().save_model(request, obj, form, change)
        # NOTE(review): Django's admin saves many-to-many data in save_related(),
        # which runs after save_model(). Confirm that ``volume_files`` is already
        # populated when the task starts.
        # Use .get() so an unset DJANGO_ENV is treated as "not a test run"
        # instead of raising KeyError after the object has already been saved.
        if os.environ.get("DJANGO_ENV") != 'test':  # pragma: no cover
            bulk_ingest_task_ecds.apply_async(args=[obj.id])
        else:
            bulk_ingest_task_ecds(obj.id)

    class Meta:  # pylint: disable=too-few-public-methods, missing-class-docstring
        model = Bulk

admin.site.register(Local, LocalAdmin)
admin.site.register(Bulk, BulkAdmin)
11 changes: 0 additions & 11 deletions readux_ingest_ecds/forms.py

This file was deleted.

42 changes: 42 additions & 0 deletions readux_ingest_ecds/migrations/0002_auto_20240201_1406.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Generated by Django 3.2.23 on 2024-02-01 14:06

from django.conf import settings
import django.core.files.storage
from django.db import migrations, models
import django.db.models.deletion
import readux_ingest_ecds.models
import uuid

class Migration(migrations.Migration):
    # Adds the models needed for bulk ingest: creates ``VolumeFile`` and
    # ``Bulk`` and adds the ``bundle_path`` column to ``Local``.

    dependencies = [
        ('readux_ingest_ecds', '0001_initial'),
    ]

    operations = [
        # One uploaded file; stored under the path returned by ``bulk_path``.
        migrations.CreateModel(
            name='VolumeFile',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
                ('file', models.FileField(storage=django.core.files.storage.FileSystemStorage(location='tmp'), upload_to=readux_ingest_ecds.models.bulk_path)),
            ],
        ),
        # Optional path to a bundle for a ``Local`` ingest.
        migrations.AddField(
            model_name='local',
            name='bundle_path',
            field=models.CharField(blank=True, max_length=1000),
        ),
        # The bulk ingest itself; its files are linked through ``volume_files``.
        migrations.CreateModel(
            name='Bulk',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('collections', models.ManyToManyField(blank=True, help_text='Optional: Collections to attach to the volume ingested in this form.', related_name='ecds_bulk_ingest_collections', to='iiif.Collection')),
                ('creator', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='ecds_bulk_ingest_created_locals', to=settings.AUTH_USER_MODEL)),
                ('image_server', models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='ecds_bulk_ingest_image_server', to='iiif.imageserver')),
                ('volume_files', models.ManyToManyField(blank=True, to='readux_ingest_ecds.VolumeFile')),
            ],
            options={
                'verbose_name_plural': 'Bulk',
            },
        ),
    ]
69 changes: 67 additions & 2 deletions readux_ingest_ecds/models.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os
import logging
import uuid
from zipfile import ZipFile
from django.core.files.storage import FileSystemStorage
from django.db import models
from django.conf import settings
from .services.file_services import is_image, is_ocr, is_junk, move_image_file, move_ocr_file, canvas_dimensions, upload_trigger_file
from .services.iiif_services import create_manifest
from .services.metadata_services import metadata_from_file, clean_metadata
from .services.ocr_services import add_ocr_to_canvases
from .helpers import get_iiif_models

Manifest = get_iiif_models()['Manifest']
Expand All @@ -19,6 +21,9 @@
location=settings.INGEST_TMP_DIR
)

def bulk_path(instance, filename):
    """Return the upload path ``<instance.id>/<filename>`` for a bulk file.

    :param instance: Object whose ``id`` names the containing directory
    :param filename: Name of the uploaded file
    :return: Relative path joining the stringified id and the filename
    :rtype: str
    """
    directory = str(instance.id)
    return os.path.join(directory, filename)

class IngestAbstractModel(models.Model):
metadata = models.JSONField(default=dict, blank=True)
manifest = models.ForeignKey(
Expand Down Expand Up @@ -56,6 +61,8 @@ class Local(IngestAbstractModel):
storage=tmp_storage
)

bundle_path = models.CharField(blank=True, max_length=1000)

class Meta:
verbose_name_plural = 'Local'

Expand Down Expand Up @@ -101,7 +108,9 @@ def ingest(self):
def unzip_bundle(self):
open(self.trigger_file, 'a').close()

with ZipFile(self.bundle, 'r') as zip_ref:
bundle_to_unzip = self.bundle_path if self.bundle_path else self.bundle

with ZipFile(bundle_to_unzip, 'r') as zip_ref:
for member in zip_ref.infolist():
file_name = member.filename

Expand Down Expand Up @@ -161,7 +170,7 @@ def open_metadata(self):
if metadata_file is None or os.path.exists(metadata_file) is False:
return

self.metadata = metadata_from_file(metadata_file)
self.metadata = metadata_from_file(metadata_file)[0]

def create_canvases(self):
Canvas = get_iiif_models()['Canvas']
Expand Down Expand Up @@ -192,3 +201,59 @@ def create_canvases(self):
)

upload_trigger_file(self.trigger_file)

class VolumeFile(models.Model):
    """A single file uploaded as part of a bulk ingest.

    The file is kept in ``tmp_storage`` under the path built by ``bulk_path``.
    """

    # UUID primary key; ``bulk_path`` uses it as the upload directory name.
    id = models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True)
    file = models.FileField(upload_to=bulk_path, storage=tmp_storage, blank=False)

class Bulk(models.Model):
    """A bulk ingest: a metadata file plus the zipped volumes it lists.

    Every row of an uploaded file named ``metadata.*`` describes one volume
    and names its bundle through a ``filename`` metadata entry. ``ingest``
    creates and ingests one ``Local`` per row, then removes the uploaded
    files and this object.
    """

    collections = models.ManyToManyField(
        Collection,
        blank=True,
        help_text="Optional: Collections to attach to the volume ingested in this form.",
        related_name='ecds_bulk_ingest_collections'
    )
    image_server = models.ForeignKey(
        ImageServer,
        on_delete=models.DO_NOTHING,
        null=True,
        related_name='ecds_bulk_ingest_image_server'
    )
    creator = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.SET_NULL,
        null=True,
        related_name='ecds_bulk_ingest_created_locals'
    )
    volume_files = models.ManyToManyField(VolumeFile, blank=True)

    class Meta:
        verbose_name_plural = 'Bulk'

    def ingest(self):
        """Ingest every volume listed in the uploaded metadata file(s).

        Rows that cannot be matched to an uploaded ``.zip`` bundle are logged
        and skipped. When all rows have been processed, the uploaded files and
        this ``Bulk`` object are deleted.
        """
        LOGGER.info('Ingesting Bulk')
        # NOTE(review): ``self.collections`` is not read anywhere in this
        # method; confirm whether ingested volumes should be attached to them.
        for uploaded_file in self.volume_files.all():
            base_name = os.path.basename(uploaded_file.file.name)
            if os.path.splitext(base_name)[0] != 'metadata':
                continue
            # metadata_from_file() returns None when it does not recognise the
            # file format; treat that as "no rows" rather than crashing.
            volumes = metadata_from_file(uploaded_file.file.path) or []
            for volume in volumes:
                self._ingest_volume(volume)
        self.volume_files.all().delete()
        self.delete()

    def _ingest_volume(self, volume):
        """Create and ingest a ``Local`` for one metadata row, if its bundle exists.

        :param volume: Cleaned metadata for one volume; its ``metadata`` list
            is searched for the entry labelled ``filename``
        """
        bundle_filename = next(
            (
                entry['value']
                for entry in volume['metadata']
                if entry['label'].casefold() == 'filename'
            ),
            None,
        )
        if not bundle_filename:
            LOGGER.warning('INGEST: Bulk metadata row has no filename; skipping volume.')
            return

        # Only the lookup can raise DoesNotExist for a missing bundle, so keep
        # the try body limited to it.
        try:
            bundle = self.volume_files.all().get(file__contains=bundle_filename)
        except VolumeFile.DoesNotExist:
            LOGGER.warning(
                'INGEST: No uploaded file matches %s; skipping volume.',
                bundle_filename
            )
            return

        if not (os.path.exists(bundle.file.path) and bundle.file.name.endswith('.zip')):
            LOGGER.warning(
                'INGEST: %s is missing on disk or is not a .zip; skipping volume.',
                bundle.file.name
            )
            return

        local = Local.objects.create(
            metadata=volume,
            bundle_path=bundle.file.path,
            image_server=self.image_server,
            creator=self.creator
        )
        local.prep()
        local.ingest()
        add_ocr_to_canvases(local.manifest)
4 changes: 2 additions & 2 deletions readux_ingest_ecds/services/iiif_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def create_manifest(ingest):
metadata = None
if metadata:
if 'pid' in metadata:
manifest, _ = Manifest.objects.get_or_create(pid=metadata['pid'].replace('_', '-'))
manifest, _ = Manifest.objects.get_or_create(pid=metadata['pid'])
else:
manifest = Manifest.objects.create()
for (key, value) in metadata.items():
Expand All @@ -34,7 +34,7 @@ def create_manifest(ingest):
else:
# all other keys should exist as fields on Manifest (for now)
setattr(manifest, key, value)
# TODO: if the key doesn't exist on Manifest model, add it to Manifest.metadata
# If the key doesn't exist on Manifest model, add it to Manifest.metadata
else:
manifest = Manifest()

Expand Down
12 changes: 7 additions & 5 deletions readux_ingest_ecds/services/metadata_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,17 +89,19 @@ def metadata_from_file(metadata_file):
if format is None:
return

metadata = None
metadata = []
metadata_set = None

if format == 'excel':
with open(metadata_file, 'rb') as fh:
metadata = Dataset().load(fh.read(), format=metadata_file.split('.')[-1])
metadata_set = Dataset().load(fh.read(), format=metadata_file.split('.')[-1])
else:
with open(metadata_file, 'r', encoding="utf-8-sig") as fh:
metadata = Dataset().load(fh.read(), format=format)
metadata_set = Dataset().load(fh.read(), format=format)

if metadata is not None:
metadata = clean_metadata(metadata.dict[0])
if metadata_set is not None:
for row in metadata_set.dict:
metadata.append(clean_metadata(row))

return metadata

Expand Down
13 changes: 13 additions & 0 deletions readux_ingest_ecds/services/ocr_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ def parse_xml_ocr(result):
return None

def add_ocr_annotations(canvas, ocr):
OCR = get_iiif_models()['OCR']
word_order = 1
annotations = []
for word in ocr:
Expand Down Expand Up @@ -483,3 +484,15 @@ def is_tsv(to_test):
if len(as_str.split('\t')) > 1:
return True
return False

def add_ocr_to_canvases(manifest):
    """Add OCR annotations to every canvas of ``manifest`` that has OCR.

    For each canvas, the OCR is fetched with ``get_ocr``; canvases for which
    it returns ``None`` are left untouched. Otherwise the annotations are
    created, each one is saved, and the canvas is saved to trigger a reindex.

    :param manifest: Manifest whose ``canvas_set`` is processed
    """
    OCR = get_iiif_models()['OCR']
    for canvas in manifest.canvas_set.all():
        ocr = get_ocr(canvas)
        if ocr is None:
            continue
        add_ocr_annotations(canvas, ocr)
        # add_ocr_annotations() uses bulk_create(), which does not call save()
        # on the model. Calling save() while each annotation is being created
        # was very slow (cause unknown); saving after the annotations exist is
        # as fast as expected.
        for annotation in OCR.objects.filter(canvas=canvas):
            annotation.save()
        canvas.save()  # trigger reindex
23 changes: 13 additions & 10 deletions readux_ingest_ecds/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from django.apps import apps
from django.conf import settings
from .helpers import get_iiif_models
from .services.ocr_services import get_ocr, add_ocr_annotations
from .services.ocr_services import add_ocr_to_canvases

# Use `apps.get_model` to avoid circular import error. Because the parameters used to
# create a background task have to be serializable, we can't just pass in the model object.
Local = apps.get_model('readux_ingest_ecds.local') # pylint: disable = invalid-name
Bulk = apps.get_model('readux_ingest_ecds.bulk') # pylint: disable = invalid-name

Manifest = get_iiif_models()['Manifest']
Canvas = get_iiif_models()['Canvas']
Expand All @@ -35,17 +36,19 @@ def local_ingest_task_ecds(ingest_id):
else:
add_ocr_task(local_ingest.manifest.pk)

@app.task(name='bulk_ingest_task_ecds', autoretry_for=(Exception,), retry_backoff=True, max_retries=20)
def bulk_ingest_task_ecds(ingest_id):
    """Background task to start the bulk ingest process.

    Loads the ``Bulk`` object and runs its ``ingest`` method. Any exception
    triggers an automatic retry with backoff, up to 20 times.

    :param ingest_id: Primary key for .models.Bulk object
    :type ingest_id: int
    """
    # NOTE(review): a retry re-runs the whole ingest from the first metadata
    # row; confirm that re-creating already-ingested volumes is acceptable.
    bulk_ingest = Bulk.objects.get(pk=ingest_id)
    bulk_ingest.ingest()

@app.task(name='ingest_ocr_to_canvas', autoretry_for=(Manifest.DoesNotExist,), retry_backoff=5)
def add_ocr_task(manifest_id, *args, **kwargs):
"""Function for parsing and adding OCR."""
manifest = Manifest.objects.get(pk=manifest_id)
for canvas in manifest.canvas_set.all():
ocr = get_ocr(canvas)
if ocr is not None:
add_ocr_annotations(canvas, ocr)
# The add_ocr_annotations method uses bulk_create() which does not call save() on the model.
# Calling save() is really slow and I don't know why. Calling save() after the annotation
# has been created, calling save is as fast as expected.
[ocr.save() for ocr in OCR.objects.filter(canvas=canvas)]
canvas.save() # trigger reindex
add_ocr_to_canvases(manifest)
5 changes: 5 additions & 0 deletions test_app/fixtures/bulk/metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Filename,pid
volume1.zip,pid1
volume2.zip,pid2
volume3.zip,pid3

Binary file added test_app/fixtures/bulk/volume1.zip
Binary file not shown.
Binary file added test_app/fixtures/bulk/volume2.zip
Binary file not shown.
Binary file added test_app/fixtures/bulk/volume3.zip
Binary file not shown.
3 changes: 2 additions & 1 deletion test_app/fixtures/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
Filename,Label,Summary,Author,Published city,Published date,Publisher
no_meta_file,Test Bundle,Test file,Test author,Test City,2021,Pubilsher test
no_meta_file,Test Bundle,Test file,Test author,Test City,2021,Pubilsher test

3 changes: 3 additions & 0 deletions test_app/fixtures/multi_volume_metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
PID,Filename,Label,Summary,Author,Published city,Published date,Publisher
pid1,no_meta_file,Test Bundle,Test file,Test author,Test City,2021,Pubilsher test

3 changes: 2 additions & 1 deletion test_app/iiif/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ class Migration(migrations.Migration):
migrations.CreateModel(
name='Manifest',
fields=[
('pid', models.CharField(default=uuid.uuid4, max_length=255, primary_key=True, serialize=False)),
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('pid', models.CharField(default=uuid.uuid4, max_length=255, serialize=False)),
('published_city', models.TextField(blank=True, null=True)),
('publisher', models.TextField(blank=True, null=True)),
('metadata', models.JSONField(blank=True, default=dict)),
Expand Down
19 changes: 19 additions & 0 deletions test_app/iiif/migrations/0002_alter_manifest_pid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 3.2.23 on 2024-02-01 13:40

from django.db import migrations, models
import uuid


class Migration(migrations.Migration):
    # Redefines ``Manifest.pid`` as a plain CharField with a uuid4 default
    # (no ``primary_key`` argument).

    dependencies = [
        ('iiif', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='manifest',
            name='pid',
            field=models.CharField(default=uuid.uuid4, max_length=255),
        ),
    ]
2 changes: 1 addition & 1 deletion test_app/iiif/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class ImageServer(models.Model):
storage_service = models.CharField(max_length=25, default='local')

class Manifest(models.Model):
pid = models.CharField(max_length=255, primary_key=True, default=uuid4, editable=True)
pid = models.CharField(max_length=255, default=uuid4, editable=True)
image_server = models.ForeignKey(ImageServer, on_delete=models.DO_NOTHING, null=True)
collections = models.ManyToManyField(Collection, blank=True, related_name='manifests')
published_city = models.TextField(null=True, blank=True)
Expand Down
1 change: 1 addition & 0 deletions test_app/test_app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
'django.contrib.contenttypes',
'django.contrib.messages',
'django_celery_results',
'django_extensions',
'iiif.apps.IiifConfig',
'readux_ingest_ecds',
'test_app'
Expand Down
Loading

0 comments on commit d4b3f04

Please sign in to comment.