From d4ff4ca488f0105c5cc78f1c0d08e6445d3be12f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 17 Jan 2024 14:19:35 +0100 Subject: [PATCH 1/2] Fix default publisher value + add missing publisher flags in some offliners --- .../backend/src/common/schemas/offliners/freecodecamp.py | 2 +- .../backend/src/common/schemas/offliners/kolibri.py | 2 +- .../backend/src/common/schemas/offliners/mwoffliner.py | 2 +- .../backend/src/common/schemas/offliners/nautilus.py | 8 ++++++++ .../backend/src/common/schemas/offliners/openedx.py | 7 +++++++ dispatcher/backend/src/common/schemas/offliners/sotoki.py | 2 +- dispatcher/backend/src/common/schemas/offliners/ted.py | 7 +++++++ .../backend/src/common/schemas/offliners/youtube.py | 7 +++++++ dispatcher/backend/src/common/schemas/offliners/zimit.py | 7 +++++++ 9 files changed, 40 insertions(+), 4 deletions(-) diff --git a/dispatcher/backend/src/common/schemas/offliners/freecodecamp.py b/dispatcher/backend/src/common/schemas/offliners/freecodecamp.py index c50949b8..2f398e9b 100644 --- a/dispatcher/backend/src/common/schemas/offliners/freecodecamp.py +++ b/dispatcher/backend/src/common/schemas/offliners/freecodecamp.py @@ -89,7 +89,7 @@ class Meta: publisher = String( metadata={ "label": "Publisher", - "description": "Custom publisher name (ZIM metadata). “OpenZIM” otherwise", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", } ) diff --git a/dispatcher/backend/src/common/schemas/offliners/kolibri.py b/dispatcher/backend/src/common/schemas/offliners/kolibri.py index c73b4ce9..7e64a9af 100644 --- a/dispatcher/backend/src/common/schemas/offliners/kolibri.py +++ b/dispatcher/backend/src/common/schemas/offliners/kolibri.py @@ -109,7 +109,7 @@ class Meta: publisher = String( metadata={ "label": "Publisher", - "description": "Custom publisher name (ZIM metadata). “OpenZIM” otherwise", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", } ) diff --git a/dispatcher/backend/src/common/schemas/offliners/mwoffliner.py b/dispatcher/backend/src/common/schemas/offliners/mwoffliner.py index c0ffe873..aa69209f 100644 --- a/dispatcher/backend/src/common/schemas/offliners/mwoffliner.py +++ b/dispatcher/backend/src/common/schemas/offliners/mwoffliner.py @@ -94,7 +94,7 @@ class Meta: publisher = String( metadata={ "label": "Publisher", - "description": "ZIM publisher metadata. `Kiwix` otherwise.", + "description": "ZIM publisher metadata. `openZIM` otherwise.", } ) filenamePrefix = String( diff --git a/dispatcher/backend/src/common/schemas/offliners/nautilus.py b/dispatcher/backend/src/common/schemas/offliners/nautilus.py index c24611b3..d2aa1130 100644 --- a/dispatcher/backend/src/common/schemas/offliners/nautilus.py +++ b/dispatcher/backend/src/common/schemas/offliners/nautilus.py @@ -113,6 +113,14 @@ class Meta: "description": "Name of content creator.", } ) + + publisher = String( + metadata={ + "label": "Publisher", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", + } + ) + tags = String( metadata={ "label": "ZIM Tags", diff --git a/dispatcher/backend/src/common/schemas/offliners/openedx.py b/dispatcher/backend/src/common/schemas/offliners/openedx.py index 0be71e56..a19a02cd 100644 --- a/dispatcher/backend/src/common/schemas/offliners/openedx.py +++ b/dispatcher/backend/src/common/schemas/offliners/openedx.py @@ -193,6 +193,13 @@ class Meta: data_key="creator", ) + publisher = String( + metadata={ + "label": "Publisher", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", + } + ) + tags = String( metadata={ "label": "ZIM Tags", diff --git a/dispatcher/backend/src/common/schemas/offliners/sotoki.py b/dispatcher/backend/src/common/schemas/offliners/sotoki.py index 8c6af97c..f1b56a9b 100644 --- a/dispatcher/backend/src/common/schemas/offliners/sotoki.py +++ b/dispatcher/backend/src/common/schemas/offliners/sotoki.py @@ -60,7 +60,7 @@ class Meta: publisher = String( metadata={ "label": "Publisher", - "description": "Custom publisher name (ZIM metadata). “OpenZIM” otherwise", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", }, ) diff --git a/dispatcher/backend/src/common/schemas/offliners/ted.py b/dispatcher/backend/src/common/schemas/offliners/ted.py index 13c1d247..d1ca8707 100644 --- a/dispatcher/backend/src/common/schemas/offliners/ted.py +++ b/dispatcher/backend/src/common/schemas/offliners/ted.py @@ -165,6 +165,13 @@ class Meta: } ) + publisher = String( + metadata={ + "label": "Publisher", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", + } + ) + tags = String( metadata={ "label": "ZIM Tags", diff --git a/dispatcher/backend/src/common/schemas/offliners/youtube.py b/dispatcher/backend/src/common/schemas/offliners/youtube.py index b16e28b9..83002c55 100644 --- a/dispatcher/backend/src/common/schemas/offliners/youtube.py +++ b/dispatcher/backend/src/common/schemas/offliners/youtube.py @@ -152,6 +152,13 @@ class Meta: } ) + publisher = String( + metadata={ + "label": "Publisher", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", + } + ) + tags = String( metadata={ "label": "ZIM Tags", diff --git a/dispatcher/backend/src/common/schemas/offliners/zimit.py b/dispatcher/backend/src/common/schemas/offliners/zimit.py index ce63d38a..b59187b5 100644 --- a/dispatcher/backend/src/common/schemas/offliners/zimit.py +++ b/dispatcher/backend/src/common/schemas/offliners/zimit.py @@ -207,6 +207,13 @@ class Meta: } ) + publisher = String( + metadata={ + "label": "Publisher", + "description": "Custom publisher name (ZIM metadata). “openZIM” otherwise", + } + ) + source = String( metadata={ "label": "Content Source", From 64867e21888a4d95146223d094a3bab914c58ba8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 17 Jan 2024 14:23:03 +0100 Subject: [PATCH 2/2] Publisher flag might be set automatically when not provided in configuration --- dispatcher/backend/src/common/constants.py | 3 + dispatcher/backend/src/tests/conftest.py | 12 +++- .../src/tests/unit/utils/test_offliners.py | 56 +++++++++++++++++++ dispatcher/backend/src/utils/offliners.py | 55 +++++++++++++----- 4 files changed, 112 insertions(+), 14 deletions(-) create mode 100644 dispatcher/backend/src/tests/unit/utils/test_offliners.py diff --git a/dispatcher/backend/src/common/constants.py b/dispatcher/backend/src/common/constants.py index 5f4ee187..918b6355 100644 --- a/dispatcher/backend/src/common/constants.py +++ b/dispatcher/backend/src/common/constants.py @@ -37,6 +37,9 @@ # empty ZIMCHECK_OPTION means no zimcheck ZIMCHECK_OPTION = os.getenv("ZIMCHECK_OPTION", "") +# Publisher value to "force" in all scrapers if not set in the recipe +DEFAULT_PUBLISHER = os.getenv("DEFAULT_PUBLISHER") + # NOTIFICATIONS # in-notification URLs diff --git a/dispatcher/backend/src/tests/conftest.py b/dispatcher/backend/src/tests/conftest.py index 230be5da..051f26e5 100644 --- a/dispatcher/backend/src/tests/conftest.py +++ b/dispatcher/backend/src/tests/conftest.py @@ -1,8 +1,9 @@ -from typing import Generator +from typing import Callable, Generator import pytest from sqlalchemy.orm import Session as OrmSession +from common import constants from db import Session @@ -10,3 +11,12 @@ def dbsession() -> Generator[OrmSession, None, None]: with Session.begin() as session: yield session + + +@pytest.fixture +def set_default_publisher() -> Generator[Callable, None, None]: + def _set_default_publisher(publisher: str): + constants.DEFAULT_PUBLISHER = publisher + + yield _set_default_publisher + constants.DEFAULT_PUBLISHER = None # Reset to default after test diff --git a/dispatcher/backend/src/tests/unit/utils/test_offliners.py b/dispatcher/backend/src/tests/unit/utils/test_offliners.py new file mode 100644 index 00000000..8708ab71 --- /dev/null +++ b/dispatcher/backend/src/tests/unit/utils/test_offliners.py @@ -0,0 +1,56 @@ +import pytest + +from common.enum import Offliner +from utils.offliners import command_for + + +@pytest.mark.parametrize( + "offliner, flags, default_publisher, expected_result", + [ + ( + Offliner.freecodecamp, + {}, + None, + ["fcc2zim", '--output="/"'], + ), # no default publisher + ( + Offliner.freecodecamp, + {}, + "openZIM", + ["fcc2zim", '--output="/"', '--publisher="openZIM"'], + ), # default publisher is "openZIM" + ( + Offliner.freecodecamp, + {}, + "Kiwix", + ["fcc2zim", '--output="/"', '--publisher="Kiwix"'], + ), # default publisher is "Kiwix" + ( + Offliner.freecodecamp, + {"publisher": "Kiwix"}, + "openZIM", + ["fcc2zim", '--output="/"', '--publisher="Kiwix"'], + ), # publisher is already set "manually" in the configuration + (Offliner.gutenberg, {}, None, ["gutenberg2zim"]), + ( + Offliner.gutenberg, + {}, + "openZIM", + ["gutenberg2zim"], + ), # offliner does not support the publisher flag + ], +) +def test_command_for( + offliner, flags, default_publisher, expected_result, set_default_publisher +): + set_default_publisher(default_publisher) + command = command_for(offliner=offliner, flags=flags, mount_point="/") + assert ( + command[0] == expected_result[0] + ) # first item is the executable, it must match + assert set(command[1:]) == set( + expected_result[1:] + ) # other flags order does not matter + assert len(command) == len( + expected_result + ) # but we must not have duplicate flags, so length must match diff --git a/dispatcher/backend/src/utils/offliners.py b/dispatcher/backend/src/utils/offliners.py index 20ad3984..abaa2c3c 100644 --- a/dispatcher/backend/src/utils/offliners.py +++ b/dispatcher/backend/src/utils/offliners.py @@ -8,22 +8,25 @@ # from common.constants import DISALLOW_CAPABILITIES from typing import List +from common import constants from common.enum import Offliner -od = collections.namedtuple("OfflinerDef", ["cmd", "std_output", "std_stats"]) +od = collections.namedtuple( + "OfflinerDef", ["cmd", "std_output", "std_stats", "publisher_flag"] +) OFFLINER_DEFS = { - Offliner.freecodecamp: od("fcc2zim", True, False), - Offliner.gutenberg: od("gutenberg2zim", False, False), - Offliner.sotoki: od("sotoki", True, True), - Offliner.wikihow: od("wikihow2zim", True, True), - Offliner.ifixit: od("ifixit2zim", True, True), - Offliner.mwoffliner: od("mwoffliner", "outputDirectory", False), - Offliner.youtube: od("youtube2zim-playlists", True, False), - Offliner.ted: od("ted2zim-multi", True, False), - Offliner.openedx: od("openedx2zim", True, False), - Offliner.nautilus: od("nautiluszim", True, False), - Offliner.zimit: od("zimit", True, "statsFilename"), - Offliner.kolibri: od("kolibri2zim", True, False), + Offliner.freecodecamp: od("fcc2zim", True, False, True), + Offliner.gutenberg: od("gutenberg2zim", False, False, False), + Offliner.sotoki: od("sotoki", True, True, True), + Offliner.wikihow: od("wikihow2zim", True, True, True), + Offliner.ifixit: od("ifixit2zim", True, True, True), + Offliner.mwoffliner: od("mwoffliner", "outputDirectory", False, True), + Offliner.youtube: od("youtube2zim-playlists", True, False, True), + Offliner.ted: od("ted2zim-multi", True, False, True), + Offliner.openedx: od("openedx2zim", True, False, True), + Offliner.nautilus: od("nautiluszim", True, False, True), + Offliner.zimit: od("zimit", True, "statsFilename", True), + Offliner.kolibri: od("kolibri2zim", True, False, True), } @@ -73,9 +76,35 @@ def command_for(offliner, flags, mount_point): if offliner == Offliner.zimit: if "adminEmail" not in flags: flags["adminEmail"] = "contact+zimfarm@kiwix.org" + + _command_for_set_default_publisher(flags, offliner_def) + return [cmd] + compute_flags(flags) +def _command_for_set_default_publisher(flags, offliner_def): + """Set a default publisher in the command + + The "publisher" flag is set if a default is provided in the local environment, if + the scraper supports it, and if it is not already set manually. + + The "publisher" flag might have a different name, configured in the offliner + definition. + """ + + flag_name = ( + offliner_def.publisher_flag + if isinstance(offliner_def.publisher_flag, str) + else "publisher" + ) + if ( + constants.DEFAULT_PUBLISHER + and offliner_def.publisher_flag + and flag_name not in flags + ): + flags[flag_name] = constants.DEFAULT_PUBLISHER + + def docker_config_for(offliner): # Note: in docker, --shm-size sets the size of /dev/shm # it is taken out of --memory (if set)