Rework bulk upload.
jayvarner committed Jul 19, 2024
1 parent a1c5310 commit 21d52e7
Showing 5 changed files with 160 additions and 68 deletions.
readux_ingest_ecds/admin.py (2 changes: 1 addition & 1 deletion)
@@ -52,7 +52,7 @@ def save_model(self, request, obj, form, change):
         ingest_files = request.FILES.getlist("volume_files")

         for ingest_file in ingest_files:
-            obj.upload_files(ingest_file)
+            obj.upload_files(ingest_file, request.user)

         obj.creator = request.user
         super().save_model(request, obj, form, change)
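The only functional change here is that save_model now passes the requesting user into Bulk.upload_files. Reassembled from the hunk above, the method after this commit looks roughly like the following (the surrounding ModelAdmin subclass name is an assumption, not shown in the diff):

# Sketch of admin.py after this commit, reassembled from the hunk above.
# "BulkAdmin" is a hypothetical class name used for illustration.
from django.contrib import admin

class BulkAdmin(admin.ModelAdmin):
    def save_model(self, request, obj, form, change):
        ingest_files = request.FILES.getlist("volume_files")

        for ingest_file in ingest_files:
            # The user travels with each file so upload_files can stamp
            # a creator on the Local ingests it creates.
            obj.upload_files(ingest_file, request.user)

        obj.creator = request.user
        super().save_model(request, obj, form, change)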
readux_ingest_ecds/migrations/0002_local_bulk.py (19 changes: 19 additions & 0 deletions)
@@ -0,0 +1,19 @@
+# Generated by Django 3.2.23 on 2024-07-19 13:55
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('readux_ingest_ecds', '0001_squashed_0005_alter_bulk_volume_files'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='local',
+            name='bulk',
+            field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='readux_ingest_ecds.bulk'),
+        ),
+    ]
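This migration gives each Local ingest a nullable pointer back to the Bulk that created it. A short, hypothetical shell sketch of what the new relation allows (not code from the repository):

# Hypothetical usage of the new Local.bulk foreign key.
from readux_ingest_ecds.models import Bulk, Local

bulk = Bulk.objects.first()
# Every Local ingest spawned by this bulk upload:
locals_for_bulk = Local.objects.filter(bulk=bulk)
# on_delete=CASCADE: deleting the Bulk also deletes its Local ingests.
# null=True: Local rows that predate this migration keep bulk = None.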
readux_ingest_ecds/migrations/0003_bulk_metadata_file.py (20 changes: 20 additions & 0 deletions)
@@ -0,0 +1,20 @@
+# Generated by Django 3.2.23 on 2024-07-19 14:00
+
+from django.db import migrations, models
+import readux_ingest_ecds.models
+import readux_ingest_ecds.storages
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('readux_ingest_ecds', '0002_local_bulk'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='bulk',
+            name='metadata_file',
+            field=models.FileField(null=True, storage=readux_ingest_ecds.storages.TmpStorage, upload_to=readux_ingest_ecds.models.bulk_path),
+        ),
+    ]
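The new metadata_file field reuses the same TmpStorage and bulk_path callable as volume_files. bulk_path is defined in models.py but does not appear in this diff; Django invokes upload_to callables as upload_to(instance, filename), so a plausible shape (an assumption, not the actual implementation) is:

import os

def bulk_path(instance, filename):
    # Assumed sketch of the upload_to callable: namespace each upload
    # under the owning record's id, e.g. "<id>/<filename>".
    return os.path.join(str(instance.id), filename)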
readux_ingest_ecds/models.py (116 changes: 76 additions & 40 deletions)
@@ -2,10 +2,12 @@
 import logging
 import uuid
 from zipfile import ZipFile
+from mimetypes import guess_type
 from django.db import models
 from django.conf import settings
 from django.core.files.uploadedfile import InMemoryUploadedFile
 from django.core.files.storage import FileSystemStorage
+from django.core.files.base import ContentFile
 from .services.file_services import (
     is_image,
     is_ocr,
@@ -84,6 +86,7 @@ class IngestAbstractModel(models.Model):
         help_text="Optional: Collections to attach to the volume ingested in this form.",
         related_name="ecds_ingest_collections",
     )
+    bulk = models.ForeignKey("Bulk", on_delete=models.CASCADE, null=True)

     class Meta:  # pylint: disable=too-few-public-methods, missing-class-docstring
         abstract = True
@@ -279,28 +282,47 @@ class Bulk(models.Model):
     volume_files = models.FileField(
         blank=False, null=True, upload_to=bulk_path, storage=TmpStorage
     )
+    metadata_file = models.FileField(
+        blank=False, null=True, upload_to=bulk_path, storage=TmpStorage
+    )

-    def upload_files(self, files):
+    def upload_files(self, files, creator):
         """_summary_

         :param files: _description_
         :type files: _type_
         """
-        print(files)
-        print(str(files))
-        if isinstance(files, InMemoryUploadedFile):
-            FileSystemStorage(
-                location=os.path.join(settings.INGEST_TMP_DIR, str(self.id))
-            ).save(files.name, files)
-        else:
-            for uploaded_file in files:
-                with open(
-                    os.path.join(
-                        settings.INGEST_TMP_DIR, bulk_path(self, uploaded_file.name)
-                    ),
-                    "wb",
-                ) as out_file:
-                    out_file.write(uploaded_file.read())
+        # print(files)
+        # print(str(files))
+        # if isinstance(files, InMemoryUploadedFile):
+        #     FileSystemStorage(
+        #         location=os.path.join(settings.INGEST_TMP_DIR, str(self.id))
+        #     ).save(files.name, files)
+        # else:
+        #     for uploaded_file in files:
+        #         with open(
+        #             os.path.join(
+        #                 settings.INGEST_TMP_DIR, bulk_path(self, uploaded_file.name)
+        #             ),
+        #             "wb",
+        #         ) as out_file:
+        #             out_file.write(uploaded_file.read())
+        for uploaded_file in files:
+            if (
+                "metadata" in uploaded_file.name.casefold()
+                and "zip" not in guess_type(uploaded_file.name)[0]
+            ):
+                with ContentFile(uploaded_file.read()) as file_content:
+                    self.metadata_file.save(uploaded_file.name, file_content)
+            else:
+                local_ingest = Local.objects.create(
+                    bulk=self, image_server=self.image_server, creator=creator
+                )
+
+                local_ingest.collections.set(self.collections.all())
+                with ContentFile(uploaded_file.read()) as file_content:
+                    local_ingest.bundle.save(uploaded_file.name, file_content)
+                local_ingest.save()

     class Meta:
         """Model Meta"""
@@ -310,29 +332,43 @@ class Meta:
     def ingest(self):
         """Doc"""
         LOGGER.info("Ingesting Bulk")
-        ingest_directory = os.path.join(settings.INGEST_TMP_DIR, str(self.id))
-        ingest_files = os.listdir(ingest_directory)
-        for uploaded_file in ingest_files:
-            if os.path.splitext(os.path.basename(uploaded_file))[0] == "metadata":
-                metadata = metadata_from_file(
-                    os.path.join(ingest_directory, uploaded_file)
-                )
-                for volume in metadata:
-                    bundle_filename = [
-                        d["value"]
-                        for d in volume["metadata"]
-                        if d["label"].casefold() == "filename"
-                    ][0]
-                    bundle = os.path.join(
-                        settings.INGEST_TMP_DIR, str(self.id), bundle_filename
-                    )
-                    if os.path.exists(bundle) and bundle.endswith(".zip"):
-                        local = Local.objects.create(
-                            metadata=volume,
-                            bundle_path=bundle,
-                            image_server=self.image_server,
-                            creator=self.creator,
-                        )
-                        local.prep()
-                        local.ingest()
-        self.delete()
+        metadata = metadata_from_file(
+            os.path.join(
+                bulk_path(self, self.metadata_file.filename),
+                self.metadata_file.filename,
+            )
+        )
+
+        for volume in metadata:
+            local_ingest = Local.objects.get(bundle=volume["filename"])
+            local_ingest.metadata = volume
+            local_ingest.save()
+            local_ingest.prep()
+            local_ingest.ingest()
+
+        # ingest_directory = os.path.join(settings.INGEST_TMP_DIR, str(self.id))
+        # ingest_files = os.listdir(ingest_directory)
+        # for uploaded_file in ingest_files:
+        #     if os.path.splitext(os.path.basename(uploaded_file))[0] == "metadata":
+        #         metadata = metadata_from_file(
+        #             os.path.join(ingest_directory, uploaded_file)
+        #         )
+        #         for volume in metadata:
+        #             bundle_filename = [
+        #                 d["value"]
+        #                 for d in volume["metadata"]
+        #                 if d["label"].casefold() == "filename"
+        #             ][0]
+        #             bundle = os.path.join(
+        #                 settings.INGEST_TMP_DIR, str(self.id), bundle_filename
+        #             )
+        #             if os.path.exists(bundle) and bundle.endswith(".zip"):
+        #                 local = Local.objects.create(
+        #                     metadata=volume,
+        #                     bundle_path=bundle,
+        #                     image_server=self.image_server,
+        #                     creator=self.creator,
+        #                 )
+        #                 local.prep()
+        #                 local.ingest()
+        # self.delete()
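Read together, the two hunks above split bulk ingest into an upload phase (each non-metadata file immediately becomes a Local tied back to the Bulk) and an ingest phase (rows from the saved metadata file finish the matching Locals). A hedged sketch of that flow; image_server, user, and request_files are stand-ins, and it uses FieldFile's documented .name/.path attributes rather than the .filename attribute referenced in ingest() above, which is not a standard FieldFile attribute:

# Interpretive sketch of the reworked flow; not code from the repository.
bulk = Bulk.objects.create(image_server=image_server, creator=user)

# Upload phase: metadata spreadsheets land on bulk.metadata_file,
# every other file becomes the bundle of a new Local ingest.
for uploaded in request_files:
    bulk.upload_files(uploaded, user)

# Ingest phase: parse the saved metadata file, then finish each Local
# whose bundle was stored under the row's "filename" value.
for volume in metadata_from_file(bulk.metadata_file.path):
    local_ingest = Local.objects.get(bulk=bulk, bundle=volume["filename"])
    local_ingest.metadata = volume
    local_ingest.save()
    local_ingest.prep()
    local_ingest.ingest()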
readux_ingest_ecds/services/metadata_services.py (71 changes: 44 additions & 27 deletions)
@@ -1,10 +1,12 @@
""" Module of service methods for ingest files. """

from readux_ingest_ecds.helpers import get_iiif_models
from mimetypes import guess_type
from tablib.core import Dataset

Manifest = get_iiif_models()['Manifest']
RelatedLink = get_iiif_models()['RelatedLink']
Manifest = get_iiif_models()["Manifest"]
RelatedLink = get_iiif_models()["RelatedLink"]


def clean_metadata(metadata):
"""Remove keys that do not align with Manifest fields.
@@ -15,20 +17,21 @@ def clean_metadata(metadata):
     :rtype: dict
     """
     fields = [
-        *(f.name for f in get_iiif_models()['Manifest']._meta.get_fields()),
-        'related'
+        *(f.name for f in get_iiif_models()["Manifest"]._meta.get_fields()),
+        "related",
     ]

     metadata = {
         (
             key.casefold().replace(" ", "_")
             if key.casefold().replace(" ", "_") in fields
             else key
-        ): value for key, value in metadata.items()
+        ): value
+        for key, value in metadata.items()
     }

-    if 'metadata' not in metadata.keys():
-        metadata['metadata'] = []
+    if "metadata" not in metadata.keys():
+        metadata["metadata"] = []

     extra_keys = []

@@ -44,22 +47,23 @@
    # pass

    for key in metadata.keys():
-        if key != 'metadata' and isinstance(metadata[key], list):
+        if key != "metadata" and isinstance(metadata[key], list):
             if isinstance(metadata[key][0], dict):
                 for meta_key in metadata[key][0].keys():
-                    if 'value' in meta_key:
+                    if "value" in meta_key:
                         metadata[key] = metadata[key][0][meta_key]
             else:
-                metadata[key] = ', '.join(metadata[key])
+                metadata[key] = ", ".join(metadata[key])
         if key not in fields:
             extra_keys.append(key)

    for key in extra_keys:
-        metadata['metadata'].append({"label": key, "value": metadata[key]})
+        metadata["metadata"].append({"label": key, "value": metadata[key]})
         metadata.pop(key)

    return metadata

+
 def create_related_links(manifest, related_str):
     """
     Create RelatedLink objects from supplied related links string and associate each with supplied
@@ -72,13 +76,15 @@ def create_related_links(manifest, related_str):
     """
     for link in related_str.split(";"):
         (format, _) = guess_type(link)
-        get_iiif_models()['RelatedLink'].objects.create(
+        get_iiif_models()["RelatedLink"].objects.create(
             manifest=manifest,
             link=link,
-            format=format or "text/html",  # assume web page if MIME type cannot be determined
+            format=format
+            or "text/html",  # assume web page if MIME type cannot be determined
             is_structured_data=False,  # assume this is not meant for seeAlso
         )

+
 def get_metadata_from(files):
     """
     Find metadata file in uploaded files.
@@ -89,16 +95,20 @@
     for file in files:
         if metadata is not None:
             continue
-        if 'zip' in guess_type(file.name)[0]:
+        if "zip" in guess_type(file.name)[0]:
             continue
-        if 'metadata' in file.name.casefold():
+        if "metadata" in file.name.casefold():
             stream = file.read()
-            if 'csv' in guess_type(file.name)[0] or 'tab-separated' in guess_type(file.name)[0]:
-                metadata = Dataset().load(stream.decode('utf-8-sig'), format='csv').dict
+            if (
+                "csv" in guess_type(file.name)[0]
+                or "tab-separated" in guess_type(file.name)[0]
+            ):
+                metadata = Dataset().load(stream.decode("utf-8-sig"), format="csv").dict
             else:
                 metadata = Dataset().load(stream).dict
     return metadata

+
 def metadata_from_file(metadata_file):
     format = metadata_file_format(metadata_file)
     if format is None:
@@ -107,19 +117,26 @@
     metadata = []
     metadata_set = None

-    if format == 'excel':
-        with open(metadata_file, 'rb') as fh:
-            metadata_set = Dataset().load(fh.read(), format=metadata_file.split('.')[-1])
+    if format == "excel":
+        with open(metadata_file, "rb") as fh:
+            metadata_set = Dataset().load(
+                fh.read(), format=metadata_file.split(".")[-1]
+            )
     else:
-        with open(metadata_file, 'r', encoding="utf-8-sig") as fh:
+        with open(metadata_file, "r", encoding="utf-8-sig") as fh:
             metadata_set = Dataset().load(fh.read(), format=format)

     if metadata_set is not None:
+        metadata_set.headers = [
+            header.casefold() if header.casefold() == "filename" else header
+            for header in metadata_set.headers
+        ]
         for row in metadata_set.dict:
             metadata.append(clean_metadata(row))

     return metadata

+
 def metadata_file_format(file_path):
     """Get format used to read the metadata file
@@ -133,11 +150,11 @@

     file_type = guess_type(file_path)[0]

-    if 'csv' in file_type:
-        return 'csv'
-    elif 'tab-separated' in file_type:
-        return 'tsv'
-    elif 'officedocument' in file_type:
-        return 'excel'
+    if "csv" in file_type:
+        return "csv"
+    elif "tab-separated" in file_type:
+        return "tsv"
+    elif "officedocument" in file_type:
+        return "excel"

     return None
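The header pass added to metadata_from_file casefolds only a header spelled like "Filename" or "FILENAME", which is what lets Bulk.ingest() look volumes up by volume["filename"] no matter how the spreadsheet capitalizes that column. A self-contained illustration with sample data:

# Illustration of the new header normalization; sample data only.
from tablib.core import Dataset

csv_text = "FILENAME,Title\nvol1.zip,My Volume\n"
dataset = Dataset().load(csv_text, format="csv")
dataset.headers = [
    header.casefold() if header.casefold() == "filename" else header
    for header in dataset.headers
]
print(dataset.dict[0]["filename"])  # "vol1.zip"; "Title" keeps its case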
