From b5f7bda2c4a70dd216ec9e2ff4fe238dfaf68f87 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 1 Nov 2024 22:51:15 -0400 Subject: [PATCH] fix: Stop creating collection_file rows for file errors, #366 --- docs/database.rst | 4 +-- process/management/commands/api_loader.py | 31 ++++++++++------------- process/processors/loader.py | 13 +++------- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/docs/database.rst b/docs/database.rst index 50811387..d13273da 100644 --- a/docs/database.rst +++ b/docs/database.rst @@ -6,7 +6,7 @@ Database tables .. To update the diagram, see https://ocp-software-handbook.readthedocs.io/en/latest/services/postgresql.html#generate-entity-relationship-diagram - java -jar schemaspy.jar -t pgsql -dp postgresql.jar -host localhost -db kingfisher_process -o schemaspy -norows -I '(django|auth).*' + java -jar schemaspy.jar -t pgsql -dp postgresql.jar -o schemaspy -norows -I '(django|auth).*' -host localhost -db kingfisher_process -u MYUSER .. list-table:: :header-rows: 1 @@ -81,7 +81,7 @@ collection table Use this column to find the crawl log. * - ``expected_files_count`` - - The number of messages to expect from Kingfisher Collect. + - The number of non-error messages to expect from Kingfisher Collect. * - ``store_start_at`` - The time at which the collection was added. * - ``store_end_at`` diff --git a/process/management/commands/api_loader.py b/process/management/commands/api_loader.py index 0db67a52..4a346d71 100644 --- a/process/management/commands/api_loader.py +++ b/process/management/commands/api_loader.py @@ -1,3 +1,4 @@ +import json import os.path from django.conf import settings @@ -6,9 +7,9 @@ from django.utils.translation import gettext as t from yapw.methods import ack, publish -from process.models import Collection +from process.models import Collection, CollectionNote from process.processors.loader import create_collection_file -from process.util import consume, decorator +from process.util import consume, create_note, decorator from process.util import wrap as w # Other applications use this routing key. @@ -27,27 +28,23 @@ def handle(self, *args, **options): def callback(client_state, channel, method, properties, input_message): collection_id = input_message["collection_id"] - collection_file_url = input_message["url"] + url = input_message["url"] + path = input_message.get("path") + errors = input_message.get("errors") collection = Collection.objects.get(pk=collection_id) if collection.deleted_at: ack(client_state, channel, method.delivery_tag) return - # In Kingfisher Collect, `path` is set if and only if `errors` isn't set. - if "path" in input_message: - filename = os.path.join(settings.KINGFISHER_COLLECT_FILES_STORE, input_message["path"]) - else: - filename = collection_file_url - with transaction.atomic(): - collection_file = create_collection_file( - collection, filename=filename, url=collection_file_url, errors=input_message.get("errors") - ) - - # FileError items from Kingfisher Collect are not processed further. - if "errors" not in input_message: - message = {"collection_id": collection_id, "collection_file_id": collection_file.pk} - publish(client_state, channel, message, routing_key) + if errors: + create_note(collection, CollectionNote.Level.ERROR, f"Couldn't download {url}", data=json.loads(errors)) + # FileError items from Kingfisher Collect are not processed further. + else: + filename = os.path.join(settings.KINGFISHER_COLLECT_FILES_STORE, path) + collection_file = create_collection_file(collection, filename=filename, url=url) + message = {"collection_id": collection_id, "collection_file_id": collection_file.pk} + publish(client_state, channel, message, routing_key) ack(client_state, channel, method.delivery_tag) diff --git a/process/processors/loader.py b/process/processors/loader.py index f18370a5..ec40963b 100644 --- a/process/processors/loader.py +++ b/process/processors/loader.py @@ -1,6 +1,5 @@ import argparse import copy -import json import logging import os @@ -9,7 +8,7 @@ from process.exceptions import InvalidFormError from process.forms import CollectionFileForm, CollectionForm, CollectionNote, CollectionNoteForm from process.models import Collection, CollectionFile, ProcessingStep -from process.util import create_note, create_step +from process.util import create_step logger = logging.getLogger(__name__) @@ -21,13 +20,12 @@ def file_or_directory(path): return path -def create_collection_file(collection, filename=None, url=None, errors=None) -> CollectionFile: +def create_collection_file(collection, filename=None, url=None) -> CollectionFile: """ Create file for a collection and steps for this file. :param Collection collection: collection :param str filename: path to file data - :param json errors: errors to be stored :returns: created collection file :raises InvalidFormError: if there is a validation error """ @@ -35,12 +33,7 @@ def create_collection_file(collection, filename=None, url=None, errors=None) -> if form.is_valid(): collection_file = form.save() - if not errors: - create_step(ProcessingStep.Name.LOAD, collection.pk, collection_file=collection_file) - else: - note = f"Couldn't download {collection_file}" # path is set to url in api_loader - create_note(collection, CollectionNote.Level.ERROR, note, data=json.loads(errors)) - + create_step(ProcessingStep.Name.LOAD, collection.pk, collection_file=collection_file) return collection_file raise InvalidFormError(form.error_messages)