From d3caba23f18b2dbdce9f52427516e29ed980cb7b Mon Sep 17 00:00:00 2001 From: Jay Varner Date: Mon, 21 Oct 2024 09:33:25 -0400 Subject: [PATCH] Minor changes to flow. --- readux_ingest_ecds/models.py | 4 +++- readux_ingest_ecds/services/file_services.py | 10 +++++----- readux_ingest_ecds/services/ocr_services.py | 14 +++++++++----- readux_ingest_ecds/tasks.py | 2 -- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py index bdacf06..2705697 100644 --- a/readux_ingest_ecds/models.py +++ b/readux_ingest_ecds/models.py @@ -423,13 +423,15 @@ class S3Ingest(models.Model): ) class Meta: - verbose_name_plural = "Amazon S3 Ingests" + verbose_name_plural = "S3 Ingests" def ingest(self): rows = metadata_from_file(self.metadata_spreadsheet.path) for row in rows: pid = row["pid"] + if pid is None: + continue manifest = create_manifest_from_pid(pid, self.image_server) metadata = dict(row) for key, value in metadata.items(): diff --git a/readux_ingest_ecds/services/file_services.py b/readux_ingest_ecds/services/file_services.py index e267525..bc866af 100644 --- a/readux_ingest_ecds/services/file_services.py +++ b/readux_ingest_ecds/services/file_services.py @@ -96,15 +96,15 @@ def move_ocr_file(ingest, file_path): move(file_path, os.path.join(ingest.ocr_directory, base_name)) -def divide_chunks(file_list): +def divide_chunks(item_list, chunk_size=10): """ Divide list of files into smaller chunks for processing. :param file_list: List of images to ingest. - :type file_list: list of str + :param chunk_size: Number of items in each chunk. Defaults to 10. + :type file_list: list """ - chunk_size = 10 - for filename in range(0, len(file_list), chunk_size): - yield file_list[filename : filename + chunk_size] + for item in range(0, len(item_list), chunk_size): + yield item_list[item : item + chunk_size] def upload_trigger_file(trigger_file): diff --git a/readux_ingest_ecds/services/ocr_services.py b/readux_ingest_ecds/services/ocr_services.py index 421112e..0d9d908 100644 --- a/readux_ingest_ecds/services/ocr_services.py +++ b/readux_ingest_ecds/services/ocr_services.py @@ -13,6 +13,7 @@ from django.core.serializers import deserialize from readux_ingest_ecds.helpers import get_iiif_models from .services import fetch_url +from ..services.file_services import divide_chunks LOGGER = logging.getLogger(__name__) OCR = get_iiif_models()["OCR"] @@ -538,14 +539,17 @@ def add_ocr_to_canvases(manifest): ocr = get_ocr(canvas) if isinstance(ocr, etree.XMLSyntaxError): warnings.append(f"Canvas {canvas.pid} - {ocr.__class__.__name__}: {ocr}") - elif canvas.ocr_file_path is not None and not os.path.exists( - canvas.ocr_file_path - ): - warnings.append(f"No OCR file for {canvas.pid}.") + # elif canvas.ocr_file_path is not None and not os.path.exists( + # canvas.ocr_file_path + # ): + # warnings.append(f"No OCR file for {canvas.pid}.") elif ocr is not None: new_ocr_annotations += add_ocr_annotations(canvas, ocr) else: warnings.append(f"No OCR for {canvas.pid}") - OCR.objects.bulk_create(new_ocr_annotations) + chunks = divide_chunks(new_ocr_annotations, 100) + for chunk in list(chunks): + OCR.objects.bulk_create(chunk) + return warnings diff --git a/readux_ingest_ecds/tasks.py b/readux_ingest_ecds/tasks.py index 8eb96e9..99a9d65 100644 --- a/readux_ingest_ecds/tasks.py +++ b/readux_ingest_ecds/tasks.py @@ -8,7 +8,6 @@ from django.conf import settings from .helpers import get_iiif_models from .services.ocr_services import add_ocr_to_canvases -from .mail import send_email_on_success, send_email_on_failure # Use `apps.get_model` to avoid circular import error. Because the parameters used to # create a background task have to be serializable, we can't just pass in the model object. @@ -130,7 +129,6 @@ def add_ocr_task_local(ingest_id, *args, **kwargs): def s3_ingest_task(ingest_id, *args, **kwargs): """S3 Ingest Task""" LOGGER.info("Starting ingest from S3") - print(ingest_id) s3_ingest = S3Ingest.objects.get(pk=ingest_id) s3_ingest.ingest()