diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py index 9f45593..fa9e336 100644 --- a/readux_ingest_ecds/models.py +++ b/readux_ingest_ecds/models.py @@ -441,40 +441,45 @@ def ingest(self): manifest.collections.set(self.collections.all()) manifest.save() - local_ingest = Local.objects.create( + local_ingest, created = Local.objects.get_or_create( manifest=manifest, image_server=self.image_server, creator=self.creator ) - trigger_file = os.path.join( - settings.INGEST_TMP_DIR, str(local_ingest.id), f"{pid}.txt" - ) + if created: - os.makedirs( - os.path.join(settings.INGEST_TMP_DIR, str(local_ingest.id)), - exist_ok=True, - ) + trigger_file = os.path.join( + settings.INGEST_TMP_DIR, str(local_ingest.id), f"{pid}.txt" + ) - os.makedirs( - os.path.join(settings.INGEST_OCR_DIR, str(pid)), - exist_ok=True, - ) + os.makedirs( + os.path.join(settings.INGEST_TMP_DIR, str(local_ingest.id)), + exist_ok=True, + ) - open(trigger_file, "a", encoding="utf-8").close() + os.makedirs( + os.path.join(settings.INGEST_OCR_DIR, str(pid)), + exist_ok=True, + ) - image_files, _ = s3_copy(self.s3_bucket, pid) + open(trigger_file, "a", encoding="utf-8").close() - for image_file in image_files: - with open(trigger_file, "a", encoding="utf-8") as t_file: - t_file.write(f"{image_file}\n") + image_files, _ = s3_copy(self.s3_bucket, pid) - local_ingest.create_canvases() - LOGGER.info(f"Canvases created for {pid}") - manifest.save() - from .tasks import add_ocr_task_local + for image_file in image_files: + with open(trigger_file, "a", encoding="utf-8") as t_file: + t_file.write(f"{image_file}\n") + + local_ingest.create_canvases() + LOGGER.info(f"Canvases created for {pid}") + manifest.save() + from .tasks import add_ocr_task_local + + if os.environ["DJANGO_ENV"] == "test": + add_ocr_task_local(str(local_ingest.id), manifest.pid) + else: + add_ocr_task_local.delay(str(local_ingest.id), manifest.pid) - if os.environ["DJANGO_ENV"] == "test": - add_ocr_task_local(str(local_ingest.id)) else: - add_ocr_task_local.delay(str(local_ingest.id)) + LOGGER.warning(f"Ingest for {manifest.pid} already exists.") self.delete() diff --git a/readux_ingest_ecds/services/file_services.py b/readux_ingest_ecds/services/file_services.py index 1909107..43bc353 100644 --- a/readux_ingest_ecds/services/file_services.py +++ b/readux_ingest_ecds/services/file_services.py @@ -1,6 +1,7 @@ """ Module of service methods for ingest files. """ import os +import logging from moto import mock_aws from shutil import move from mimetypes import guess_type @@ -14,6 +15,8 @@ Manifest = get_iiif_models()["Manifest"] RelatedLink = get_iiif_models()["RelatedLink"] +LOGGER = logging.getLogger(__name__) + def is_image(file_path): """Check if file is expected type for image files @@ -176,15 +179,19 @@ def s3_copy(source, pid): filename = os.path.basename(key) if pid not in filename: filename = f"{pid}_{filename}" - if "image" in guess_type(key)[0] and "images" in key.casefold(): - images.append(filename) - destination_bucket.copy( - copy_source, f"{settings.INGEST_STAGING_PREFIX}/{filename}" - ) - elif "ocr" in key.casefold() and is_ocr(f"ocr_{key}"): - ocr_path = f"{settings.INGEST_OCR_PREFIX}/{pid}/{filename}" - ocr.append(ocr_path) - destination_bucket.copy(copy_source, ocr_path) + try: + if "image" in guess_type(key)[0] and "images" in key.casefold(): + images.append(filename) + destination_bucket.copy( + copy_source, f"{settings.INGEST_STAGING_PREFIX}/{filename}" + ) + elif "ocr" in key.casefold() and is_ocr(f"ocr_{key}"): + ocr_path = f"{settings.INGEST_OCR_PREFIX}/{pid}/{filename}" + ocr.append(ocr_path) + destination_bucket.copy(copy_source, ocr_path) + except TypeError as error: + LOGGER.warning(f"Could not determine file type for {key}") + LOGGER.warning(error) images.sort() return (images, ocr) diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py index 99a9d65..74bfbef 100644 --- a/readux_ingest_ecds/tasks.py +++ b/readux_ingest_ecds/tasks.py @@ -82,9 +82,9 @@ def local_ingest_task_ecds(ingest_id): local_ingest = Local.objects.get(pk=ingest_id) local_ingest.ingest() if os.environ["DJANGO_ENV"] != "test": # pragma: no cover - add_ocr_task_local.delay(ingest_id) + add_ocr_task_local.delay(ingest_id, local_ingest.manifest.pid) else: - add_ocr_task_local(ingest_id) + add_ocr_task_local(ingest_id, local_ingest.manifest.pid) @app.task( @@ -110,9 +110,9 @@ def bulk_ingest_task_ecds(ingest_id): autoretry_for=(Manifest.DoesNotExist,), retry_backoff=5, ) -def add_ocr_task_local(ingest_id, *args, **kwargs): +def add_ocr_task_local(ingest_id, manifest_pid, *args, **kwargs): """Function for parsing and adding OCR.""" - LOGGER.info("ADDING OCR") + LOGGER.info(f"ADDING OCR for {manifest_pid}") local_ingest = Local.objects.get(pk=ingest_id) manifest = Manifest.objects.get(pk=local_ingest.manifest.pk) warnings = add_ocr_to_canvases(manifest) diff --git a/test_app/tests/test_ocr.py b/test_app/tests/test_ocr.py index 2c94194..dc951d1 100644 --- a/test_app/tests/test_ocr.py +++ b/test_app/tests/test_ocr.py @@ -34,7 +34,7 @@ def test_empty_xml(self): manifest=manifest, ) local = LocalFactory.create(manifest=manifest, creator=UserFactory.create()) - add_ocr_task_local(local.id) + add_ocr_task_local(local.id, manifest.pid) local.refresh_from_db() local.success() assert local.warnings.startswith( diff --git a/test_app/tests/test_s3.py b/test_app/tests/test_s3.py index 45f7bf5..b73a0d3 100644 --- a/test_app/tests/test_s3.py +++ b/test_app/tests/test_s3.py @@ -77,12 +77,6 @@ def create_source_images(self, pid=None, count=1, include_pid_in_file=True): self.fs_storage.rel_path, os.path.basename(fake_image) ) ) - print("##########") - print(fake_image) - print(image_key) - print(self.fs_storage.root_path) - print(self.fs_storage.rel_path) - print("##########") ocr_key = image_key.replace("jpg", "txt") open( os.path.join(self.fs_storage.root_path, ocr_key),