Minor changes to flow.
jayvarner committed Oct 21, 2024
1 parent 61e948e commit d3caba2
Showing 4 changed files with 17 additions and 13 deletions.
4 changes: 3 additions & 1 deletion readux_ingest_ecds/models.py
@@ -423,13 +423,15 @@ class S3Ingest(models.Model):
     )
 
     class Meta:
-        verbose_name_plural = "Amazon S3 Ingests"
+        verbose_name_plural = "S3 Ingests"
 
     def ingest(self):
         rows = metadata_from_file(self.metadata_spreadsheet.path)
 
         for row in rows:
             pid = row["pid"]
+            if pid is None:
+                continue
             manifest = create_manifest_from_pid(pid, self.image_server)
             metadata = dict(row)
             for key, value in metadata.items():
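For reference, a minimal stand-alone sketch of what the new guard does (the rows literal below is invented, and it assumes metadata_from_file returns None for a blank pid cell): rows without a pid are now skipped instead of being handed to create_manifest_from_pid.

# Illustration only: how rows with an empty "pid" value are handled after this change.
rows = [
    {"pid": "abc123", "title": "Volume 1"},
    {"pid": None, "title": "Row with a blank pid cell"},
    {"pid": "def456", "title": "Volume 2"},
]

ingested = []
for row in rows:
    pid = row["pid"]
    if pid is None:
        continue  # skip rows with no pid rather than failing later in the ingest
    ingested.append(pid)

assert ingested == ["abc123", "def456"]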
10 changes: 5 additions & 5 deletions readux_ingest_ecds/services/file_services.py
@@ -96,15 +96,15 @@ def move_ocr_file(ingest, file_path):
     move(file_path, os.path.join(ingest.ocr_directory, base_name))
 
 
-def divide_chunks(file_list):
+def divide_chunks(item_list, chunk_size=10):
     """
     Divide list of files into smaller chunks for processing.
     :param file_list: List of images to ingest.
-    :type file_list: list of str
+    :param chunk_size: Number of items in each chunk. Defaults to 10.
+    :type file_list: list
     """
-    chunk_size = 10
-    for filename in range(0, len(file_list), chunk_size):
-        yield file_list[filename : filename + chunk_size]
+    for item in range(0, len(item_list), chunk_size):
+        yield item_list[item : item + chunk_size]
 
 
 def upload_trigger_file(trigger_file):
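A short usage sketch of the generalized generator (the function is reproduced from the new version above; the sample data is invented): divide_chunks now accepts any list and an optional chunk_size instead of being hard-coded to chunks of 10.

# The updated generator, reproduced from the diff above, with example input.
def divide_chunks(item_list, chunk_size=10):
    for item in range(0, len(item_list), chunk_size):
        yield item_list[item : item + chunk_size]

items = list(range(25))
chunks = list(divide_chunks(items, chunk_size=10))
assert len(chunks) == 3                    # 10 + 10 + 5 items
assert chunks[-1] == [20, 21, 22, 23, 24]  # the final chunk holds the remainder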
14 changes: 9 additions & 5 deletions readux_ingest_ecds/services/ocr_services.py
@@ -13,6 +13,7 @@
 from django.core.serializers import deserialize
 from readux_ingest_ecds.helpers import get_iiif_models
 from .services import fetch_url
+from ..services.file_services import divide_chunks
 
 LOGGER = logging.getLogger(__name__)
 OCR = get_iiif_models()["OCR"]
@@ -538,14 +539,17 @@ def add_ocr_to_canvases(manifest):
         ocr = get_ocr(canvas)
         if isinstance(ocr, etree.XMLSyntaxError):
             warnings.append(f"Canvas {canvas.pid} - {ocr.__class__.__name__}: {ocr}")
-        elif canvas.ocr_file_path is not None and not os.path.exists(
-            canvas.ocr_file_path
-        ):
-            warnings.append(f"No OCR file for {canvas.pid}.")
+        # elif canvas.ocr_file_path is not None and not os.path.exists(
+        #     canvas.ocr_file_path
+        # ):
+        #     warnings.append(f"No OCR file for {canvas.pid}.")
         elif ocr is not None:
             new_ocr_annotations += add_ocr_annotations(canvas, ocr)
         else:
             warnings.append(f"No OCR for {canvas.pid}")
 
-    OCR.objects.bulk_create(new_ocr_annotations)
+    chunks = divide_chunks(new_ocr_annotations, 100)
+    for chunk in list(chunks):
+        OCR.objects.bulk_create(chunk)
 
     return warnings
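For illustration, a stand-alone sketch of the new save pattern (fake_bulk_create and the annotation strings are stand-ins, not project code): the OCR annotations are written in batches of 100 rather than in a single bulk_create call. Django's bulk_create also accepts a batch_size argument that splits the insert internally, so chunking by hand is one of two workable ways to bound the size of each query.

# Stand-in for OCR.objects.bulk_create so the sketch runs without Django.
saved_batches = []

def fake_bulk_create(objs):
    saved_batches.append(len(objs))

def divide_chunks(item_list, chunk_size=10):
    for item in range(0, len(item_list), chunk_size):
        yield item_list[item : item + chunk_size]

new_ocr_annotations = [f"annotation-{n}" for n in range(250)]
for chunk in divide_chunks(new_ocr_annotations, 100):
    fake_bulk_create(chunk)

assert saved_batches == [100, 100, 50]  # three batched inserts instead of one large one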
2 changes: 0 additions & 2 deletions readux_ingest_ecds/tasks.py
@@ -8,7 +8,6 @@
 from django.conf import settings
 from .helpers import get_iiif_models
 from .services.ocr_services import add_ocr_to_canvases
-from .mail import send_email_on_success, send_email_on_failure
 
 # Use `apps.get_model` to avoid circular import error. Because the parameters used to
 # create a background task have to be serializable, we can't just pass in the model object.
@@ -130,7 +129,6 @@ def add_ocr_task_local(ingest_id, *args, **kwargs):
 def s3_ingest_task(ingest_id, *args, **kwargs):
     """S3 Ingest Task"""
     LOGGER.info("Starting ingest from S3")
-    print(ingest_id)
     s3_ingest = S3Ingest.objects.get(pk=ingest_id)
     s3_ingest.ingest()