Skip to content

Commit

Permalink
Prevent multiple local ingest with existing manifest/volume
Browse files (browse the repository at this point in the history)
  • Loading branch information
jayvarner committed Nov 1, 2024
1 parent 04f6da7 commit fb371df
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 44 deletions.
53 changes: 29 additions & 24 deletions readux_ingest_ecds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,40 +441,45 @@ def ingest(self):

manifest.collections.set(self.collections.all())
manifest.save()
local_ingest = Local.objects.create(
local_ingest, created = Local.objects.get_or_create(
manifest=manifest, image_server=self.image_server, creator=self.creator
)

trigger_file = os.path.join(
settings.INGEST_TMP_DIR, str(local_ingest.id), f"{pid}.txt"
)
if created:

os.makedirs(
os.path.join(settings.INGEST_TMP_DIR, str(local_ingest.id)),
exist_ok=True,
)
trigger_file = os.path.join(
settings.INGEST_TMP_DIR, str(local_ingest.id), f"{pid}.txt"
)

os.makedirs(
os.path.join(settings.INGEST_OCR_DIR, str(pid)),
exist_ok=True,
)
os.makedirs(
os.path.join(settings.INGEST_TMP_DIR, str(local_ingest.id)),
exist_ok=True,
)

open(trigger_file, "a", encoding="utf-8").close()
os.makedirs(
os.path.join(settings.INGEST_OCR_DIR, str(pid)),
exist_ok=True,
)

image_files, _ = s3_copy(self.s3_bucket, pid)
open(trigger_file, "a", encoding="utf-8").close()

for image_file in image_files:
with open(trigger_file, "a", encoding="utf-8") as t_file:
t_file.write(f"{image_file}\n")
image_files, _ = s3_copy(self.s3_bucket, pid)

local_ingest.create_canvases()
LOGGER.info(f"Canvases created for {pid}")
manifest.save()
from .tasks import add_ocr_task_local
for image_file in image_files:
with open(trigger_file, "a", encoding="utf-8") as t_file:
t_file.write(f"{image_file}\n")

local_ingest.create_canvases()
LOGGER.info(f"Canvases created for {pid}")
manifest.save()
from .tasks import add_ocr_task_local

if os.environ["DJANGO_ENV"] == "test":
add_ocr_task_local(str(local_ingest.id), manifest.pid)
else:
add_ocr_task_local.delay(str(local_ingest.id), manifest.pid)

if os.environ["DJANGO_ENV"] == "test":
add_ocr_task_local(str(local_ingest.id))
else:
add_ocr_task_local.delay(str(local_ingest.id))
LOGGER.warning(f"Ingest for {manifest.pid} already exists.")

self.delete()
25 changes: 16 additions & 9 deletions readux_ingest_ecds/services/file_services.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" Module of service methods for ingest files. """

import os
import logging
from moto import mock_aws
from shutil import move
from mimetypes import guess_type
Expand All @@ -14,6 +15,8 @@
Manifest = get_iiif_models()["Manifest"]
RelatedLink = get_iiif_models()["RelatedLink"]

LOGGER = logging.getLogger(__name__)


def is_image(file_path):
"""Check if file is expected type for image files
Expand Down Expand Up @@ -176,15 +179,19 @@ def s3_copy(source, pid):
filename = os.path.basename(key)
if pid not in filename:
filename = f"{pid}_{filename}"
if "image" in guess_type(key)[0] and "images" in key.casefold():
images.append(filename)
destination_bucket.copy(
copy_source, f"{settings.INGEST_STAGING_PREFIX}/{filename}"
)
elif "ocr" in key.casefold() and is_ocr(f"ocr_{key}"):
ocr_path = f"{settings.INGEST_OCR_PREFIX}/{pid}/{filename}"
ocr.append(ocr_path)
destination_bucket.copy(copy_source, ocr_path)
try:
if "image" in guess_type(key)[0] and "images" in key.casefold():
images.append(filename)
destination_bucket.copy(
copy_source, f"{settings.INGEST_STAGING_PREFIX}/{filename}"
)
elif "ocr" in key.casefold() and is_ocr(f"ocr_{key}"):
ocr_path = f"{settings.INGEST_OCR_PREFIX}/{pid}/{filename}"
ocr.append(ocr_path)
destination_bucket.copy(copy_source, ocr_path)
except TypeError as error:
LOGGER.warning(f"Could not determine file type for {key}")
LOGGER.warning(error)

images.sort()
return (images, ocr)
8 changes: 4 additions & 4 deletions readux_ingest_ecds/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,9 @@ def local_ingest_task_ecds(ingest_id):
local_ingest = Local.objects.get(pk=ingest_id)
local_ingest.ingest()
if os.environ["DJANGO_ENV"] != "test": # pragma: no cover
add_ocr_task_local.delay(ingest_id)
add_ocr_task_local.delay(ingest_id, local_ingest.manifest.pid)
else:
add_ocr_task_local(ingest_id)
add_ocr_task_local(ingest_id, local_ingest.manifest.pid)


@app.task(
Expand All @@ -110,9 +110,9 @@ def bulk_ingest_task_ecds(ingest_id):
autoretry_for=(Manifest.DoesNotExist,),
retry_backoff=5,
)
def add_ocr_task_local(ingest_id, *args, **kwargs):
def add_ocr_task_local(ingest_id, manifest_pid, *args, **kwargs):
"""Function for parsing and adding OCR."""
LOGGER.info("ADDING OCR")
LOGGER.info(f"ADDING OCR for {manifest_pid}")
local_ingest = Local.objects.get(pk=ingest_id)
manifest = Manifest.objects.get(pk=local_ingest.manifest.pk)
warnings = add_ocr_to_canvases(manifest)
Expand Down
2 changes: 1 addition & 1 deletion test_app/tests/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_empty_xml(self):
manifest=manifest,
)
local = LocalFactory.create(manifest=manifest, creator=UserFactory.create())
add_ocr_task_local(local.id)
add_ocr_task_local(local.id, manifest.pid)
local.refresh_from_db()
local.success()
assert local.warnings.startswith(
Expand Down
6 changes: 0 additions & 6 deletions test_app/tests/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,6 @@ def create_source_images(self, pid=None, count=1, include_pid_in_file=True):
self.fs_storage.rel_path, os.path.basename(fake_image)
)
)
print("##########")
print(fake_image)
print(image_key)
print(self.fs_storage.root_path)
print(self.fs_storage.rel_path)
print("##########")
ocr_key = image_key.replace("jpg", "txt")
open(
os.path.join(self.fs_storage.root_path, ocr_key),
Expand Down

0 comments on commit fb371df

Please sign in to comment.