From 21d52e7d09a54aeea9228899f65bd48689469eac Mon Sep 17 00:00:00 2001 From: Jay Varner Date: Fri, 19 Jul 2024 10:59:24 -0400 Subject: [PATCH] Rework bulk upload. --- readux_ingest_ecds/admin.py | 2 +- .../migrations/0002_local_bulk.py | 19 +++ .../migrations/0003_bulk_metadata_file.py | 20 +++ readux_ingest_ecds/models.py | 116 ++++++++++++------ .../services/metadata_services.py | 71 +++++++---- 5 files changed, 160 insertions(+), 68 deletions(-) create mode 100644 readux_ingest_ecds/migrations/0002_local_bulk.py create mode 100644 readux_ingest_ecds/migrations/0003_bulk_metadata_file.py diff --git a/readux_ingest_ecds/admin.py b/readux_ingest_ecds/admin.py index 97932c5..fa86703 100644 --- a/readux_ingest_ecds/admin.py +++ b/readux_ingest_ecds/admin.py @@ -52,7 +52,7 @@ def save_model(self, request, obj, form, change): ingest_files = request.FILES.getlist("volume_files") for ingest_file in ingest_files: - obj.upload_files(ingest_file) + obj.upload_files(ingest_file, request.user) obj.creator = request.user super().save_model(request, obj, form, change) diff --git a/readux_ingest_ecds/migrations/0002_local_bulk.py b/readux_ingest_ecds/migrations/0002_local_bulk.py new file mode 100644 index 0000000..cdf5f8e --- /dev/null +++ b/readux_ingest_ecds/migrations/0002_local_bulk.py @@ -0,0 +1,19 @@ +# Generated by Django 3.2.23 on 2024-07-19 13:55 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('readux_ingest_ecds', '0001_squashed_0005_alter_bulk_volume_files'), + ] + + operations = [ + migrations.AddField( + model_name='local', + name='bulk', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='readux_ingest_ecds.bulk'), + ), + ] diff --git a/readux_ingest_ecds/migrations/0003_bulk_metadata_file.py b/readux_ingest_ecds/migrations/0003_bulk_metadata_file.py new file mode 100644 index 0000000..d61e7f4 --- /dev/null +++ b/readux_ingest_ecds/migrations/0003_bulk_metadata_file.py @@ -0,0 +1,20 @@ +# Generated by Django 3.2.23 on 2024-07-19 14:00 + +from django.db import migrations, models +import readux_ingest_ecds.models +import readux_ingest_ecds.storages + + +class Migration(migrations.Migration): + + dependencies = [ + ('readux_ingest_ecds', '0002_local_bulk'), + ] + + operations = [ + migrations.AddField( + model_name='bulk', + name='metadata_file', + field=models.FileField(null=True, storage=readux_ingest_ecds.storages.TmpStorage, upload_to=readux_ingest_ecds.models.bulk_path), + ), + ] diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py index 502b02f..2563d06 100644 --- a/readux_ingest_ecds/models.py +++ b/readux_ingest_ecds/models.py @@ -2,10 +2,12 @@ import logging import uuid from zipfile import ZipFile +from mimetypes import guess_type from django.db import models from django.conf import settings from django.core.files.uploadedfile import InMemoryUploadedFile from django.core.files.storage import FileSystemStorage +from django.core.files.base import ContentFile from .services.file_services import ( is_image, is_ocr, @@ -84,6 +86,7 @@ class IngestAbstractModel(models.Model): help_text="Optional: Collections to attach to the volume ingested in this form.", related_name="ecds_ingest_collections", ) + bulk = models.ForeignKey("Bulk", on_delete=models.CASCADE, null=True) class Meta: # pylint: disable=too-few-public-methods, missing-class-docstring abstract = True @@ -279,28 +282,47 @@ class Bulk(models.Model): volume_files = models.FileField( blank=False, null=True, upload_to=bulk_path, storage=TmpStorage ) + metadata_file = models.FileField( + blank=False, null=True, upload_to=bulk_path, storage=TmpStorage + ) - def upload_files(self, files): + def upload_files(self, files, creator): """_summary_ :param files: _description_ :type files: _type_ """ - print(files) - print(str(files)) - if isinstance(files, InMemoryUploadedFile): - FileSystemStorage( - location=os.path.join(settings.INGEST_TMP_DIR, str(self.id)) - ).save(files.name, files) - else: - for uploaded_file in files: - with open( - os.path.join( - settings.INGEST_TMP_DIR, bulk_path(self, uploaded_file.name) - ), - "wb", - ) as out_file: - out_file.write(uploaded_file.read()) + # print(files) + # print(str(files)) + # if isinstance(files, InMemoryUploadedFile): + # FileSystemStorage( + # location=os.path.join(settings.INGEST_TMP_DIR, str(self.id)) + # ).save(files.name, files) + # else: + # for uploaded_file in files: + # with open( + # os.path.join( + # settings.INGEST_TMP_DIR, bulk_path(self, uploaded_file.name) + # ), + # "wb", + # ) as out_file: + # out_file.write(uploaded_file.read()) + for uploaded_file in files: + if ( + "metadata" in uploaded_file.name.casefold() + and "zip" not in guess_type(uploaded_file.name)[0] + ): + with ContentFile(uploaded_file.read()) as file_content: + self.metadata_file.save(uploaded_file.name, file_content) + else: + local_ingest = Local.objects.create( + bulk=self, image_server=self.image_server, creator=creator + ) + + local_ingest.collections.set(self.collections.all()) + with ContentFile(uploaded_file.read()) as file_content: + local_ingest.bundle.save(uploaded_file.name, file_content) + local_ingest.save() class Meta: """Model Meta""" @@ -310,29 +332,43 @@ class Meta: def ingest(self): """Doc""" LOGGER.info("Ingesting Bulk") - ingest_directory = os.path.join(settings.INGEST_TMP_DIR, str(self.id)) - ingest_files = os.listdir(ingest_directory) - for uploaded_file in ingest_files: - if os.path.splitext(os.path.basename(uploaded_file))[0] == "metadata": - metadata = metadata_from_file( - os.path.join(ingest_directory, uploaded_file) - ) - for volume in metadata: - bundle_filename = [ - d["value"] - for d in volume["metadata"] - if d["label"].casefold() == "filename" - ][0] - bundle = os.path.join( - settings.INGEST_TMP_DIR, str(self.id), bundle_filename + metadata = metadata_from_file( + os.path.join( + bulk_path(self, self.metadata_file.filename), + self.metadata_file.filename, ) - if os.path.exists(bundle) and bundle.endswith(".zip"): - local = Local.objects.create( - metadata=volume, - bundle_path=bundle, - image_server=self.image_server, - creator=self.creator, - ) - local.prep() - local.ingest() + ) + + for volume in metadata: + local_ingest = Local.objects.get(bundle=volume["filename"]) + local_ingest.metadata = volume + local_ingest.save() + local_ingest.prep() + local_ingest.ingest() + + # ingest_directory = os.path.join(settings.INGEST_TMP_DIR, str(self.id)) + # ingest_files = os.listdir(ingest_directory) + # for uploaded_file in ingest_files: + # if os.path.splitext(os.path.basename(uploaded_file))[0] == "metadata": + # metadata = metadata_from_file( + # os.path.join(ingest_directory, uploaded_file) + # ) + # for volume in metadata: + # bundle_filename = [ + # d["value"] + # for d in volume["metadata"] + # if d["label"].casefold() == "filename" + # ][0] + # bundle = os.path.join( + # settings.INGEST_TMP_DIR, str(self.id), bundle_filename + # ) + # if os.path.exists(bundle) and bundle.endswith(".zip"): + # local = Local.objects.create( + # metadata=volume, + # bundle_path=bundle, + # image_server=self.image_server, + # creator=self.creator, + # ) + # local.prep() + # local.ingest() # self.delete() diff --git a/readux_ingest_ecds/services/metadata_services.py b/readux_ingest_ecds/services/metadata_services.py index 44f9212..9f0e906 100644 --- a/readux_ingest_ecds/services/metadata_services.py +++ b/readux_ingest_ecds/services/metadata_services.py @@ -1,10 +1,12 @@ """ Module of service methods for ingest files. """ + from readux_ingest_ecds.helpers import get_iiif_models from mimetypes import guess_type from tablib.core import Dataset -Manifest = get_iiif_models()['Manifest'] -RelatedLink = get_iiif_models()['RelatedLink'] +Manifest = get_iiif_models()["Manifest"] +RelatedLink = get_iiif_models()["RelatedLink"] + def clean_metadata(metadata): """Remove keys that do not align with Manifest fields. @@ -15,8 +17,8 @@ def clean_metadata(metadata): :rtype: dict """ fields = [ - *(f.name for f in get_iiif_models()['Manifest']._meta.get_fields()), - 'related' + *(f.name for f in get_iiif_models()["Manifest"]._meta.get_fields()), + "related", ] metadata = { @@ -24,11 +26,12 @@ def clean_metadata(metadata): key.casefold().replace(" ", "_") if key.casefold().replace(" ", "_") in fields else key - ): value for key, value in metadata.items() + ): value + for key, value in metadata.items() } - if 'metadata' not in metadata.keys(): - metadata['metadata'] = [] + if "metadata" not in metadata.keys(): + metadata["metadata"] = [] extra_keys = [] @@ -44,22 +47,23 @@ def clean_metadata(metadata): # pass for key in metadata.keys(): - if key != 'metadata' and isinstance(metadata[key], list): + if key != "metadata" and isinstance(metadata[key], list): if isinstance(metadata[key][0], dict): for meta_key in metadata[key][0].keys(): - if 'value' in meta_key: + if "value" in meta_key: metadata[key] = metadata[key][0][meta_key] else: - metadata[key] = ', '.join(metadata[key]) + metadata[key] = ", ".join(metadata[key]) if key not in fields: extra_keys.append(key) for key in extra_keys: - metadata['metadata'].append({"label": key, "value": metadata[key]}) + metadata["metadata"].append({"label": key, "value": metadata[key]}) metadata.pop(key) return metadata + def create_related_links(manifest, related_str): """ Create RelatedLink objects from supplied related links string and associate each with supplied @@ -72,13 +76,15 @@ def create_related_links(manifest, related_str): """ for link in related_str.split(";"): (format, _) = guess_type(link) - get_iiif_models()['RelatedLink'].objects.create( + get_iiif_models()["RelatedLink"].objects.create( manifest=manifest, link=link, - format=format or "text/html", # assume web page if MIME type cannot be determined + format=format + or "text/html", # assume web page if MIME type cannot be determined is_structured_data=False, # assume this is not meant for seeAlso ) + def get_metadata_from(files): """ Find metadata file in uploaded files. @@ -89,16 +95,20 @@ def get_metadata_from(files): for file in files: if metadata is not None: continue - if 'zip' in guess_type(file.name)[0]: + if "zip" in guess_type(file.name)[0]: continue - if 'metadata' in file.name.casefold(): + if "metadata" in file.name.casefold(): stream = file.read() - if 'csv' in guess_type(file.name)[0] or 'tab-separated' in guess_type(file.name)[0]: - metadata = Dataset().load(stream.decode('utf-8-sig'), format='csv').dict + if ( + "csv" in guess_type(file.name)[0] + or "tab-separated" in guess_type(file.name)[0] + ): + metadata = Dataset().load(stream.decode("utf-8-sig"), format="csv").dict else: metadata = Dataset().load(stream).dict return metadata + def metadata_from_file(metadata_file): format = metadata_file_format(metadata_file) if format is None: @@ -107,19 +117,26 @@ def metadata_from_file(metadata_file): metadata = [] metadata_set = None - if format == 'excel': - with open(metadata_file, 'rb') as fh: - metadata_set = Dataset().load(fh.read(), format=metadata_file.split('.')[-1]) + if format == "excel": + with open(metadata_file, "rb") as fh: + metadata_set = Dataset().load( + fh.read(), format=metadata_file.split(".")[-1] + ) else: - with open(metadata_file, 'r', encoding="utf-8-sig") as fh: + with open(metadata_file, "r", encoding="utf-8-sig") as fh: metadata_set = Dataset().load(fh.read(), format=format) if metadata_set is not None: + metadata_set.headers = [ + header.casefold() if header.casefold() == "filename" else header + for header in metadata_set.headers + ] for row in metadata_set.dict: metadata.append(clean_metadata(row)) return metadata + def metadata_file_format(file_path): """Get format used to read the metadata file @@ -133,11 +150,11 @@ def metadata_file_format(file_path): file_type = guess_type(file_path)[0] - if 'csv' in file_type: - return 'csv' - elif 'tab-separated' in file_type: - return 'tsv' - elif 'officedocument' in file_type: - return 'excel' + if "csv" in file_type: + return "csv" + elif "tab-separated" in file_type: + return "tsv" + elif "officedocument" in file_type: + return "excel" return None