Add S3 ingest
jayvarner committed Aug 9, 2024
1 parent 705bac4 commit 003b066
Showing 16 changed files with 507 additions and 86 deletions.
20 changes: 18 additions & 2 deletions readux_ingest_ecds/admin.py
@@ -4,8 +4,8 @@
from django.core.files.base import ContentFile
from django.contrib import admin
from django.shortcuts import redirect
from .models import Local, Bulk
from .tasks import local_ingest_task_ecds, bulk_ingest_task_ecds
from .models import Local, Bulk, S3Ingest
from .tasks import local_ingest_task_ecds, bulk_ingest_task_ecds, s3_ingest_task
from .forms import BulkVolumeUploadForm

LOGGER = logging.getLogger(__name__)
@@ -82,5 +82,21 @@ class Meta:
model = Bulk


class S3IngestAdmin(admin.ModelAdmin):
def save_model(self, request, obj, form, change):
LOGGER.info(f"INGEST: S3 ingest started by {request.user.username}")
obj.creator = request.user

super().save_model(request, obj, form, change)
if os.environ["DJANGO_ENV"] != "test": # pragma: no cover
s3_ingest_task.apply_async(args=[obj.id])
else:
s3_ingest_task(obj.id)

class Meta:
model = S3Ingest


admin.site.register(Local, LocalAdmin)
admin.site.register(Bulk, BulkAdmin)
admin.site.register(S3Ingest, S3IngestAdmin)
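The admin hands a newly saved S3Ingest off to s3_ingest_task, asynchronously through Celery in normal operation and synchronously when DJANGO_ENV is "test". The task itself lives in readux_ingest_ecds/tasks.py, which is not shown in this excerpt; a minimal sketch of what such a task might look like, assuming it only loads the S3Ingest record by primary key and runs its ingest() method:

from celery import shared_task

from .models import S3Ingest


@shared_task
def s3_ingest_task(ingest_id):
    """Hypothetical sketch: load the S3Ingest object and run the ingest."""
    ingest = S3Ingest.objects.get(pk=ingest_id)
    ingest.ingest()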
33 changes: 33 additions & 0 deletions readux_ingest_ecds/migrations/0004_s3ingest.py
@@ -0,0 +1,33 @@
# Generated by Django 3.2.23 on 2024-08-08 15:27

from django.conf import settings
import django.core.validators
from django.db import migrations, models
import django.db.models.deletion
import uuid


class Migration(migrations.Migration):

dependencies = [
('iiif', '0005_canvas_resource'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
('readux_ingest_ecds', '0003_bulk_metadata_file'),
]

operations = [
migrations.CreateModel(
name='S3Ingest',
fields=[
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('s3_bucket', models.CharField(help_text="The name of a publicly-accessible S3 bucket containing volumes to\n ingest, either at the bucket root or within subfolder(s). Each volume should have its own\n subfolder, with the volume's PID as its name.\n <br />\n <strong>Example:</strong> if the bucket's URL is\n https://my-bucket.s3.us-east-1.amazonaws.com/, its name is <strong>my-bucket</strong>.", max_length=255)),
('metadata_spreadsheet', models.FileField(help_text='A spreadsheet file with a row for each volume, including the\n volume PID (column name <strong>pid</strong>).', upload_to='', validators=[django.core.validators.FileExtensionValidator(allowed_extensions=['csv', 'xlsx'])])),
('collections', models.ManyToManyField(blank=True, help_text='Optional: Collections to attach to ALL volumes ingested in this form.', to='iiif.Collection')),
('creator', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='ecds_ingest_created_s3', to=settings.AUTH_USER_MODEL)),
('image_server', models.ForeignKey(null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='ecds_s3_ingest_image_server', to='iiif.imageserver')),
],
options={
'verbose_name_plural': 'Amazon S3 Ingests',
},
),
]
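The metadata_spreadsheet help text above expects one row per volume, keyed by a pid column. A minimal example of such a CSV; the PID values are invented, and the S3 ingest path reads only the pid column from each row:

pid
volume-001
volume-002
volume-003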
87 changes: 84 additions & 3 deletions readux_ingest_ecds/models.py
@@ -8,6 +8,7 @@
from django.core.files.uploadedfile import InMemoryUploadedFile
from django.core.files.storage import FileSystemStorage
from django.core.files.base import ContentFile
from django.core.validators import FileExtensionValidator
from .services.file_services import (
is_image,
is_ocr,
@@ -16,8 +17,9 @@
move_ocr_file,
canvas_dimensions,
upload_trigger_file,
s3_copy,
)
from .services.iiif_services import create_manifest
from .services.iiif_services import create_manifest, create_manifest_from_pid
from .services.metadata_services import metadata_from_file, clean_metadata
from .helpers import get_iiif_models
from .storages import TmpStorage
@@ -339,8 +341,6 @@ def ingest(self):
)
)

print(["volume3" in str(local.bundle) for local in self.local_set.all()])

for index, volume in enumerate(metadata):
for local_ingest in self.local_set.all():
if volume["filename"] in str(local_ingest.bundle):
@@ -375,3 +375,84 @@ def ingest(self):
# local.prep()
# local.ingest()
# self.delete()


class S3Ingest(models.Model):
"""Model class for bulk ingesting volumes from an Amazon AWS S3 bucket."""

id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
s3_bucket = models.CharField(
null=False,
blank=False,
max_length=255,
help_text="""The name of a publicly-accessible S3 bucket containing volumes to
ingest, either at the bucket root or within subfolder(s). Each volume should have its own
subfolder, with the volume's PID as its name.
<br />
<strong>Example:</strong> if the bucket's URL is
https://my-bucket.s3.us-east-1.amazonaws.com/, its name is <strong>my-bucket</strong>.""",
)
metadata_spreadsheet = models.FileField(
null=False,
blank=False,
help_text="""A spreadsheet file with a row for each volume, including the
volume PID (column name <strong>pid</strong>).""",
validators=[FileExtensionValidator(allowed_extensions=["csv", "xlsx"])],
)
image_server = models.ForeignKey(
ImageServer,
on_delete=models.DO_NOTHING,
null=True,
related_name="ecds_s3_ingest_image_server",
)
collections = models.ManyToManyField(
Collection,
blank=True,
help_text="Optional: Collections to attach to ALL volumes ingested in this form.",
)
creator = models.ForeignKey(
settings.AUTH_USER_MODEL,
on_delete=models.SET_NULL,
null=True,
related_name="ecds_ingest_created_s3",
)

class Meta:
verbose_name_plural = "Amazon S3 Ingests"

def ingest(self):
metadata = metadata_from_file(self.metadata_spreadsheet.path)

for pid in [row["pid"] for row in metadata]:
manifest = create_manifest_from_pid(pid, self.image_server)
manifest.collections.set(self.collections.all())
manifest.save()
local_ingest = Local.objects.create(
manifest=manifest, image_server=self.image_server, creator=self.creator
)

trigger_file = os.path.join(
settings.INGEST_TMP_DIR, str(local_ingest.id), f"{pid}.txt"
)

os.makedirs(
os.path.join(settings.INGEST_TMP_DIR, str(local_ingest.id)),
exist_ok=True,
)

os.makedirs(
os.path.join(settings.INGEST_OCR_DIR, str(pid)),
exist_ok=True,
)

open(trigger_file, "a", encoding="utf-8").close()

image_files, _ = s3_copy(self.s3_bucket, pid)

for image_file in image_files:
with open(trigger_file, "a", encoding="utf-8") as t_file:
t_file.write(f"{image_file}\n")

local_ingest.create_canvases()

self.delete()
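S3Ingest.ingest and the s3_copy helper it calls (added to file_services.py below) read several values from Django settings. A sketch of the host-project settings they assume; the setting names come from this commit, the values are placeholders:

# Placeholder values; only the setting names are taken from this commit.
INGEST_TMP_DIR = "/tmp/readux_ingest"        # scratch space for per-ingest trigger files
INGEST_OCR_DIR = "/data/ingest/ocr"          # local directory for OCR files, per PID
INGEST_PROCESSING_DIR = "/data/ingest/work"  # used by canvas_dimensions()
INGEST_BUCKET = "readux-ingest"              # destination bucket for copied S3 objects
INGEST_STAGING_PREFIX = "staging"            # key prefix for copied image files
INGEST_OCR_PREFIX = "ocr"                    # key prefix for copied OCR files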
50 changes: 48 additions & 2 deletions readux_ingest_ecds/services/file_services.py
@@ -1,6 +1,8 @@
""" Module of service methods for ingest files. """

import os
import boto3
from moto import mock_aws
from shutil import move
from mimetypes import guess_type
from PIL import Image
@@ -37,7 +39,7 @@ def is_ocr(file_path):
:return: Bool if file type matches OCR file types.
:rtype: bool
"""
ocr_file_types = ["text", "xml", "json", "html", "hocr", "tsv"]
ocr_file_types = ["text", "txt", "xml", "json", "html", "hocr", "tsv"]
return (
file_path is not None
and "ocr" in file_path
@@ -100,7 +102,7 @@ def divide_chunks(file_list):
:param file_list: List of images to ingest.
:type file_list: list of str
"""
chunk_size = settings.CHUNK_SIZE if settings.CHUNK_SIZE else 10
chunk_size = 10
for filename in range(0, len(file_list), chunk_size):
yield file_list[filename : filename + chunk_size]

@@ -146,3 +148,47 @@ def canvas_dimensions(image_name):
os.path.join(settings.INGEST_PROCESSING_DIR, original_image[0])
).size
return (0, 0)


def s3_copy(source, pid):
"""Copy S3 objects to ingest
Args:
source (str): Source bucket name
pid (str): PID for volume being ingested
Returns:
list[str]: List of copied image files for volume
"""
# if os.environ["DJANGO_ENV"] != "test":
# mock = mock_aws()
# mock.start()
s3 = boto3.resource("s3")
destination_bucket = s3.Bucket(settings.INGEST_BUCKET)
source_bucket = s3.Bucket(source)

keys_to_copy = [
str(obj.key) for obj in source_bucket.objects.all() if pid in obj.key
]

images = []
ocr = []
for key in keys_to_copy:
copy_source = {"Bucket": source, "Key": key}
filename = os.path.basename(key)
if pid not in filename:
filename = f"{pid}_{filename}"
if "image" in guess_type(key)[0]:
images.append(filename)
destination_bucket.copy(
copy_source, f"{settings.INGEST_STAGING_PREFIX}/{filename}"
)
else:
ocr_path = f"{settings.INGEST_OCR_PREFIX}/{pid}/{filename}"
ocr.append(ocr_path)
destination_bucket.copy(copy_source, ocr_path)

# if os.environ["DJANGO_ENV"] != "test":
# mock.stop()
images.sort()
return (images, ocr)
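moto is imported at the top of the module and the mock_aws calls are left commented out inside s3_copy, which suggests the function is exercised against mocked buckets in tests. A sketch of such a test; the bucket names, PID, and the pytest-django settings fixture are assumptions:

import boto3
from moto import mock_aws

from readux_ingest_ecds.services.file_services import s3_copy


@mock_aws
def test_s3_copy(settings):
    # Assumed values; the real test suite may configure these differently.
    settings.INGEST_BUCKET = "destination-bucket"
    settings.INGEST_STAGING_PREFIX = "staging"
    settings.INGEST_OCR_PREFIX = "ocr"

    s3 = boto3.resource("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="source-bucket")
    s3.create_bucket(Bucket="destination-bucket")

    # One image file and one OCR file for the volume "vol1".
    s3.Object("source-bucket", "vol1/00000001.tiff").put(Body=b"image bytes")
    s3.Object("source-bucket", "vol1/00000001.tsv").put(Body=b"ocr text")

    images, ocr = s3_copy("source-bucket", "vol1")

    assert images == ["vol1_00000001.tiff"]
    assert ocr == ["ocr/vol1/vol1_00000001.tsv"]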
35 changes: 26 additions & 9 deletions readux_ingest_ecds/services/iiif_services.py
@@ -1,18 +1,22 @@
""" Module of service methods for IIIF objects. """

import os
from readux_ingest_ecds.helpers import get_iiif_models
from .metadata_services import create_related_links

Manifest = get_iiif_models()['Manifest']
RelatedLink = get_iiif_models()['RelatedLink']
OCR = get_iiif_models()['OCR']
Manifest = get_iiif_models()["Manifest"]
RelatedLink = get_iiif_models()["RelatedLink"]
Canvas = get_iiif_models()["Canvas"]
OCR = get_iiif_models()["OCR"]


def create_manifest(ingest):
"""
Create or update a Manifest from supplied metadata and images.
:return: New or updated Manifest with supplied `pid`
:rtype: iiif.manifest.models.Manifest
"""
Manifest = get_iiif_models()['Manifest']
Manifest = get_iiif_models()["Manifest"]
manifest = None
# Make a copy of the metadata so we don't extract it over and over.
try:
@@ -23,13 +27,13 @@ except TypeError:
except TypeError:
metadata = None
if metadata:
if 'pid' in metadata:
manifest, _ = Manifest.objects.get_or_create(pid=metadata['pid'])
if "pid" in metadata:
manifest, _ = Manifest.objects.get_or_create(pid=metadata["pid"])
else:
manifest = Manifest.objects.create()
for (key, value) in metadata.items():
if key == 'related':
# add RelatedLinks from metadata spreadsheet key "related"
for key, value in metadata.items():
if key == "related":
# add RelatedLinks from metadata spreadsheet key "related"
create_related_links(manifest, value)
else:
# all other keys should exist as fields on Manifest (for now)
Expand All @@ -48,3 +52,16 @@ def create_manifest(ingest):
manifest.save()

return manifest


def create_manifest_from_pid(pid, image_server):
"""Create Manifest and Canvases
Args:
pid (str): PID for new Manifest
images (list[str]): List of image file names
collections (list[IIIF.Collection])
"""
Manifest = get_iiif_models()["Manifest"]
manifest, _ = Manifest.objects.get_or_create(pid=pid, image_server=image_server)
return manifest