From 7da983752e6b7301a269c64dde62968287d1bd68 Mon Sep 17 00:00:00 2001 From: agmorev Date: Wed, 29 Nov 2023 02:30:23 +0200 Subject: [PATCH 01/11] SXDEDPCXZIC-241_DATAVIC-622 / DELWP harvest restricted records --- ckanext/datavic_harvester/harvesters/delwp.py | 91 +++++++++++++++---- 1 file changed, 75 insertions(+), 16 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index 26cf548..869624f 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -275,6 +275,12 @@ def import_stage(self, harvest_object: HarvestObject) -> bool | str: pkg_dict = self._get_pkg_dict(harvest_object) + if not pkg_dict["notes"] or not pkg_dict["owner_org"]: + log.info( + f"Description or organization field is missing for object {harvest_object.id}, skipping..." + ) + return False + if status not in ["new", "change"]: return True @@ -350,6 +356,14 @@ def _get_pkg_dict(self, harvest_object): else "" ) + access_notes = """ + Aerial imagery and elevation datasets\n + You can access high-resolution aerial imagery and elevation (LiDAR point cloud) datasets by contacting a business that holds a commercial license.\n + We have two types of commercial licensing:\n + Data Service Providers (DSPs) provide access to the source imagery or elevation data.\n + Value Added Retailers (VARs ) use the imagery and elevation data to create new products and services. This includes advisory services and new knowledge products. + """ + pkg_dict = {} pkg_dict["personal_information"] = "no" @@ -357,11 +371,6 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["access"] = "yes" pkg_dict["organization_visibility"] = "all" pkg_dict["workflow_status"] = "published" - pkg_dict["license_id"] = self.config.get("license_id", "cc-by") - pkg_dict["private"] = self._is_pkg_private( - metashare_dict - ) - pkg_dict["title"] = metashare_dict.get("title") pkg_dict["notes"] = metashare_dict.get("abstract", "") pkg_dict["tags"] = helpers.get_tags(metashare_dict.get("topiccat")) @@ -369,16 +378,13 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["extract"] = f"{pkg_dict['notes'].split('.')[0]}..." 
pkg_dict["owner_org"] = self._get_organisation( self.config.get("organisation_mapping"), - metashare_dict.get("resowner").split(";")[0], + metashare_dict.get("resowner", "").split(";")[0], harvest_object, ) if not pkg_dict.get("name"): pkg_dict["name"] = self._get_package_name(harvest_object, pkg_dict["title"]) - if full_metadata_url: - pkg_dict["full_metadata_url"] = full_metadata_url - if uuid: pkg_dict["primary_purpose_of_collection"] = uuid @@ -412,6 +418,22 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["resources"] = self._fetch_resources(metashare_dict) + pkg_dict["private"] = self._is_pkg_private( + metashare_dict, + pkg_dict["resources"] + ) + + pkg_dict["license_id"] = self.config.get("license_id", "cc-by") + + if pkg_dict["private"]: + pkg_dict["license_id"] = "other-closed" + + if self._is_delwp_raster_data(pkg_dict["resources"]): + pkg_dict["full_metadata_url"] = f"https://metashare.maps.vic.gov.au/geonetwork/srv/api/records/{uuid}/formatters/cip-pdf?root=export&output=pdf" + pkg_dict["access_description"] = access_notes + elif full_metadata_url: + pkg_dict["full_metadata_url"] = full_metadata_url + for key, value in [ ("harvest_source_id", harvest_object.source.id), ("harvest_source_title", harvest_object.source.title), @@ -431,13 +453,50 @@ def _create_custom_package_create_schema(self) -> dict[str, Any]: return package_schema - def _is_pkg_private(self, remote_dict: dict[str, Any]) -> bool: - """Check if the dataset should be private by `resclassification` field - value""" - return remote_dict.get("resclassification") in ( - "limitedDistribution", - "restricted", - ) + def _is_delwp_vector_data(self, resources: list[dict[str, Any]]) -> bool: + for res in resources: + if res["format"].lower() in [ + "dwg", + "dxf", + "gdb", + "shp", + "mif", + "tab", + "extended tab", + "mapinfo", + ]: + return True + + return False + + def _is_delwp_raster_data(self, resources: list[dict[str, Any]]) -> bool: + for res in resources: + if res["format"].lower() in [ + "ecw", + "geotiff", + "jpeg", + "jp2", + "jpeg 2000", + "tiff", + "lass", + "xyz", + ]: + return True + + return False + + def _is_pkg_private( + self, + remote_dict: dict[str, Any], + resources: list[dict[str, Any]] + ) -> bool: + """Check if the dataset should be private""" + if (self._is_delwp_vector_data(resources) and + remote_dict.get("mdclassification") == "unclassified" and + remote_dict.get("resclassification") == "unclassified"): + return False + + return True def _get_organisation( self, From fa98db27c4472cf11f8287dbd232c81e92488460 Mon Sep 17 00:00:00 2001 From: Yan Rudenko Date: Thu, 30 May 2024 13:32:37 +0200 Subject: [PATCH 02/11] SXDEDPCXZIC-308_DATAVIC-622 --- ckanext/datavic_harvester/harvesters/delwp.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index 869624f..e4d0247 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -281,6 +281,13 @@ def import_stage(self, harvest_object: HarvestObject) -> bool | str: ) return False + # Remove restricted Datasets + if pkg_dict["private"]: + log.info( + f"Dataset is Restricted for object {harvest_object.id}, skipping..." 
+ ) + return False + if status not in ["new", "change"]: return True From 9b3a64d1039edd393a874af2318580633025939f Mon Sep 17 00:00:00 2001 From: agmorev Date: Wed, 19 Jun 2024 00:29:38 +0300 Subject: [PATCH 03/11] SXDEDPCXZIC-315 / auto detect size for some data resources --- ckanext/datavic_harvester/harvesters/base.py | 84 +++++++++++++++++++ ckanext/datavic_harvester/harvesters/delwp.py | 5 +- ckanext/datavic_harvester/harvesters/ods.py | 5 ++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 9b15cc4..98ca716 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -15,6 +15,10 @@ log = logging.getLogger(__name__) +MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 1e+8) +CHUNK_SIZE = 16 * 1024 +DOWNLOAD_TIMEOUT = 30 + class DataVicBaseHarvester(HarvesterBase): def __init__(self, **kwargs): @@ -139,3 +143,83 @@ def _make_context(self) -> dict[str, Any]: "model": model, "session": model.Session, } + + +class DataTooBigWarning(Exception): + pass + + +def get_resource_size(resource_url: str) -> int: + """Return external resource size in bytes + + Args: + resource_url (str): a URL for the resource’s source + + Returns: + int: resource size in bytes + """ + + length = 0 + cl = None + + try: + headers = {} + + response = _get_response(resource_url, headers) + cl = response.headers.get('content-length') + + if cl: + response.close() + log.info(f"Resource from url <{resource_url}> length is {cl} bytes.") + return int(cl) + + for chunk in response.iter_content(CHUNK_SIZE): + length += len(chunk) + if length > MAX_CONTENT_LENGTH: + response.close + raise DataTooBigWarning() + + response.close() + + except DataTooBigWarning: + message = f"Resource from url <{resource_url}> is more " \ + f"than {MAX_CONTENT_LENGTH} bytes. Skip its size calculation." 
+ log.warning(message) + length = -1 # for the purpose of search possibility in the db + return length + + except requests.exceptions.HTTPError as error: + log.debug(f"HTTP error: {error}") + + except requests.exceptions.Timeout: + log.warning(f"URL time out after {DOWNLOAD_TIMEOUT}s") + + except requests.exceptions.RequestException as error: + log.warning(f"URL error: {error}") + + log.info(f"Resource from url <{resource_url}> length is {length} bytes.") + + return length + + +def _get_response(url, headers): + def get_url(): + kwargs = {"headers": headers, "timeout": 30, "stream": True} + + if "ckan.download_proxy" in tk.config: + proxy = tk.config.get("ckan.download_proxy") + kwargs["proxies"] = {"http": proxy, "https": proxy} + + return requests.get(url, **kwargs) + + response = get_url() + if response.status_code == 202: + wait = 1 + while wait < 120 and response.status_code == 202: + import time + time.sleep(wait) + response = get_url() + wait *= 3 + response.raise_for_status() + + return response diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index db2d4fb..8af4131 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -17,7 +17,7 @@ from ckanext.harvest.model import HarvestObject, HarvestObjectExtra import ckanext.datavic_harvester.helpers as helpers -from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester +from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester, get_resource_size log = logging.getLogger(__name__) @@ -650,6 +650,9 @@ def _get_resources_by_formats( res["name"] = f"{res['name']} {res_format}".replace("_", "") + res["size"] = get_resource_size(res_url) + res["filesize"] = res["size"] + if attribution: res["attribution"] = attribution diff --git a/ckanext/datavic_harvester/harvesters/ods.py b/ckanext/datavic_harvester/harvesters/ods.py index 21c65bc..cb074aa 100644 --- a/ckanext/datavic_harvester/harvesters/ods.py +++ b/ckanext/datavic_harvester/harvesters/ods.py @@ -15,6 +15,7 @@ import ckan.plugins.toolkit as tk from ckanext.harvest_basket.harvesters import ODSHarvester +from .base import get_resource_size class DataVicODSHarvester(ODSHarvester): @@ -38,4 +39,8 @@ def _fetch_resources(self, source_url, resource_urls, pkg_data): for res in resources: if res["format"] == "CSV": res["url"] = f'{res["url"]}?delimiter=%2C' + + res["size"] = get_resource_size(res["url"]) + res["filesize"] = res["size"] + return resources From 4029b53bcb4fc665e192dbbfb6370ffbed496dfc Mon Sep 17 00:00:00 2001 From: agmorev Date: Thu, 20 Jun 2024 23:46:45 +0300 Subject: [PATCH 04/11] SXDEDPCXZIC-321_DATAVIC-699 / set dcat harvester default visibility --- ckanext/datavic_harvester/harvesters/dcat_json.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..6ffebe6 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -184,7 +184,12 @@ def _set_required_fields_defaults( if not self._get_extra(pkg_dict, "protective_marking"): pkg_dict["protective_marking"] = "official" - if not self._get_extra(pkg_dict, "organization_visibility"): + if not self._get_extra(pkg_dict, "organization_visibility") \ + and "default_visibility" in self.config: + pkg_dict["organization_visibility"] = self.config["default_visibility"][ + "organization_visibility" + ] 
+ else: pkg_dict["organization_visibility"] = "current" pkg_dict["workflow_status"] = "published" From d9256e69b5ab5dcfd88e63d8d57afa6f81cac0af Mon Sep 17 00:00:00 2001 From: agmorev Date: Fri, 21 Jun 2024 00:38:15 +0300 Subject: [PATCH 05/11] SXDEDPCXZIC-321_DATAVIC-699 / fix the logic of default value --- ckanext/datavic_harvester/harvesters/dcat_json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 6ffebe6..54627a6 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -190,7 +190,9 @@ def _set_required_fields_defaults( "organization_visibility" ] else: - pkg_dict["organization_visibility"] = "current" + pkg_dict["organization_visibility"] = self._get_extra( + pkg_dict, "organization_visibility" + ) or "current" pkg_dict["workflow_status"] = "published" From 9397321ce7a4c7486ee11b2fd3697f0b18ba6644 Mon Sep 17 00:00:00 2001 From: alexmorev Date: Mon, 24 Jun 2024 22:58:49 +0300 Subject: [PATCH 06/11] SXDEDPCXZIC-322_DATAVIC-703 / prevent records being updated unnecessarily --- .../datavic_harvester/harvesters/dcat_json.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..d873d1f 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -34,6 +34,25 @@ def gather_stage(self, harvest_job): def import_stage(self, harvest_object): self._set_config(harvest_object.source.config) + + package_dict, dcat_dict = self._get_package_dict(harvest_object) + dcat_modified = dcat_dict.get("modified") + existing_dataset = self._get_existing_dataset(harvest_object.guid) + + if dcat_modified and existing_dataset: + dcat_modified = helpers.convert_date_to_isoformat( + dcat_modified, "modified", dcat_dict["title"] + ).lower().split("t")[0] + + pkg_modified = existing_dataset['date_modified_data_asset'] + + if pkg_modified and pkg_modified == dcat_modified: + log.info( + f"Dataset with id {existing_dataset['id']} wasn't modified " + "from the last harvest. Skipping this dataset..." 
+ ) + return False + return super().import_stage(harvest_object) def _get_package_dict( @@ -43,7 +62,7 @@ def _get_package_dict( conversions of the data""" dcat_dict: dict[str, Any] = json.loads(harvest_object.content) - pkg_dict = converters.dcat_to_ckan(dcat_dict) + pkg_dict = converters.dcat_to_ckan(dcat_dict) soup: BeautifulSoup = BeautifulSoup(pkg_dict["notes"], "html.parser") From d49eed159ac37fba90643fd54f89c4cd6f335298 Mon Sep 17 00:00:00 2001 From: alexmorev Date: Thu, 27 Jun 2024 20:04:07 +0300 Subject: [PATCH 07/11] SXDEDPCXZIC-315 / additional changes for autocalculation --- ckanext/datavic_harvester/harvesters/base.py | 23 +++++++++++++------ .../datavic_harvester/harvesters/dcat_json.py | 13 ++++++++++- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 98ca716..7481dd6 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -15,7 +15,7 @@ log = logging.getLogger(__name__) -MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 1e+8) +MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 104857600) CHUNK_SIZE = 16 * 1024 DOWNLOAD_TIMEOUT = 30 @@ -161,17 +161,26 @@ def get_resource_size(resource_url: str) -> int: length = 0 cl = None + + if not resource_url or MAX_CONTENT_LENGTH <= 0: + return length try: headers = {} response = _get_response(resource_url, headers) - cl = response.headers.get('content-length') - - if cl: - response.close() - log.info(f"Resource from url <{resource_url}> length is {cl} bytes.") - return int(cl) + ct = response.headers.get("content-type") + cl = response.headers.get("content-length") + + if ct and "text/html" in ct: + message = f"Resource from url <{resource_url}> is of HTML type. " \ + "Skip its size calculation." + log.warning(message) + return length + + if cl and int(cl) > MAX_CONTENT_LENGTH: + response.close + raise DataTooBigWarning() for chunk in response.iter_content(CHUNK_SIZE): length += len(chunk) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..88529a8 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -14,7 +14,7 @@ from ckanext.harvest.model import HarvestObject from ckanext.datavic_harvester import helpers -from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester +from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester, get_resource_size log = logging.getLogger(__name__) @@ -246,3 +246,14 @@ def _get_mocked_full_metadata(self): here: str = path.abspath(path.dirname(__file__)) with open(path.join(here, "../data/dcat_json_full_metadata.txt")) as f: return f.read() + + def modify_package_dict(self, package_dict, dcat_dict, harvest_object): + ''' + Allows custom harvesters to modify the package dict before + creating or updating the actual package. 
+ ''' + resources = package_dict["resources"] + for resource in resources: + resource["size"] = get_resource_size(resource["url"]) + resource["filesize"] = resource["size"] + return package_dict From c5ef1d7956b9ee81e5a9ffff4877063cd00a17db Mon Sep 17 00:00:00 2001 From: alexmorev Date: Fri, 28 Jun 2024 22:22:27 +0300 Subject: [PATCH 08/11] SXDEDPCXZIC-315_DATAVIC-691 / add optional use of content-length header --- ckanext/datavic_harvester/harvesters/base.py | 39 ++++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 7481dd6..d35fb06 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -4,6 +4,7 @@ from typing import Optional, Any import requests +import time from ckan import model from ckan.plugins import toolkit as tk @@ -15,7 +16,9 @@ log = logging.getLogger(__name__) -MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 104857600) +MAX_CONTENT_LENGTH = int( + tk.config.get("ckanext.datavic_harvester.max_content_length") or 104857600 +) CHUNK_SIZE = 16 * 1024 DOWNLOAD_TIMEOUT = 30 @@ -161,7 +164,7 @@ def get_resource_size(resource_url: str) -> int: length = 0 cl = None - + if not resource_url or MAX_CONTENT_LENGTH <= 0: return length @@ -171,30 +174,45 @@ def get_resource_size(resource_url: str) -> int: response = _get_response(resource_url, headers) ct = response.headers.get("content-type") cl = response.headers.get("content-length") + cl_enabled = tk.asbool(tk.config.get( + "ckanext.datavic_harvester.content_length_enabled", False) + ) if ct and "text/html" in ct: - message = f"Resource from url <{resource_url}> is of HTML type. " \ - "Skip its size calculation." + message = ( + f"Resource from url <{resource_url}> is of HTML type. " + "Skip its size calculation." + ) log.warning(message) return length - if cl and int(cl) > MAX_CONTENT_LENGTH: - response.close - raise DataTooBigWarning() + if cl: + if int(cl) > MAX_CONTENT_LENGTH: + response.close() + raise DataTooBigWarning() + + if cl_enabled: + response.close() + log.info( + f"Resource from url <{resource_url}> content-length is {int(cl)} bytes." + ) + return int(cl) for chunk in response.iter_content(CHUNK_SIZE): length += len(chunk) if length > MAX_CONTENT_LENGTH: - response.close + response.close() raise DataTooBigWarning() response.close() except DataTooBigWarning: - message = f"Resource from url <{resource_url}> is more " \ + message = ( + f"Resource from url <{resource_url}> is more " f"than {MAX_CONTENT_LENGTH} bytes. Skip its size calculation." 
+ ) log.warning(message) - length = -1 # for the purpose of search possibility in the db + length = -1 # for the purpose of search possibility in the db return length except requests.exceptions.HTTPError as error: @@ -225,7 +243,6 @@ def get_url(): if response.status_code == 202: wait = 1 while wait < 120 and response.status_code == 202: - import time time.sleep(wait) response = get_url() wait *= 3 From 79cd3d769c123d90c7b5392ad6ae8c6009902edd Mon Sep 17 00:00:00 2001 From: alexmorev Date: Sat, 29 Jun 2024 01:27:20 +0300 Subject: [PATCH 09/11] SXDEDPCXZIC-315 / fix the logic --- ckanext/datavic_harvester/harvesters/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index d35fb06..a7e706d 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -165,7 +165,7 @@ def get_resource_size(resource_url: str) -> int: length = 0 cl = None - if not resource_url or MAX_CONTENT_LENGTH <= 0: + if not resource_url or MAX_CONTENT_LENGTH < 0: return length try: @@ -187,7 +187,7 @@ def get_resource_size(resource_url: str) -> int: return length if cl: - if int(cl) > MAX_CONTENT_LENGTH: + if int(cl) > MAX_CONTENT_LENGTH and MAX_CONTENT_LENGTH > 0: response.close() raise DataTooBigWarning() @@ -208,8 +208,8 @@ def get_resource_size(resource_url: str) -> int: except DataTooBigWarning: message = ( - f"Resource from url <{resource_url}> is more " - f"than {MAX_CONTENT_LENGTH} bytes. Skip its size calculation." + f"Resource from url <{resource_url}> is more than the set limit " + f"{MAX_CONTENT_LENGTH} bytes. Skip its size calculation." ) log.warning(message) length = -1 # for the purpose of search possibility in the db From ac1a8e0fec2bd14e7b27b1ab396541c7bf07936f Mon Sep 17 00:00:00 2001 From: alexmorev Date: Thu, 11 Jul 2024 20:49:51 +0300 Subject: [PATCH 10/11] SXDEDPCXZIC-340 / fix harvester error --- ckanext/datavic_harvester/harvesters/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 9b15cc4..5b7cfce 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -128,8 +128,11 @@ def fetch_stage(self, harvest_object: HarvestObject) -> bool: return True def _delete_package(self, package_id: str, guid: str): - tk.get_action("package_delete")(self._make_context(), {"id": package_id}) - log.info(f"Deleted package {package_id} with guid {guid}") + try: + tk.get_action("package_delete")(self._make_context(), {"id": package_id}) + log.info(f"Deleted package {package_id} with guid {guid}") + except tk.ObjectNotFound: + log.error(f"Package {package_id} not found") def _make_context(self) -> dict[str, Any]: return { From 3893473ba76d8a7eca27817f7903ce288c4676d1 Mon Sep 17 00:00:00 2001 From: alexmorev Date: Fri, 1 Nov 2024 17:36:53 +0200 Subject: [PATCH 11/11] SXDEDPCXZIC-393 / exclude domains from size calculations --- ckanext/datavic_harvester/harvesters/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index a7e706d..a340be0 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -2,6 +2,7 @@ import logging from typing import Optional, Any +from urllib.parse import urlparse import requests import time 
@@ -21,6 +22,9 @@ ) CHUNK_SIZE = 16 * 1024 DOWNLOAD_TIMEOUT = 30 +CONFIG_FSC_EXCLUDED_DOMAINS = tk.aslist( + tk.config.get("ckanext.datavic_harvester.filesize_excluded_domains", "") +) class DataVicBaseHarvester(HarvesterBase): @@ -168,6 +172,10 @@ def get_resource_size(resource_url: str) -> int: if not resource_url or MAX_CONTENT_LENGTH < 0: return length + hostname = urlparse(resource_url).hostname + if hostname in CONFIG_FSC_EXCLUDED_DOMAINS: + return length + try: headers = {}
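
Configuration introduced across this patch series, as a minimal illustrative .ini sketch: the option names are taken from the diffs above, while the values and hostnames are examples only.

    # base.py: hard cap, in bytes, on how much of a resource is streamed when
    # measuring its size (defaults to 104857600); resources over the cap get size -1
    ckanext.datavic_harvester.max_content_length = 104857600

    # base.py: trust the Content-Length response header instead of streaming the body
    # (defaults to false)
    ckanext.datavic_harvester.content_length_enabled = true

    # base.py: hostnames skipped entirely during size calculation (parsed with tk.aslist,
    # compared against urlparse(resource_url).hostname)
    ckanext.datavic_harvester.filesize_excluded_domains = example.com data.example.org

    # optional core CKAN proxy picked up by _get_response() for the size-probing requests
    ckan.download_proxy = http://proxy.example.internal:3128

For the dcat_json harvester, the default visibility added in patches 04-05 is read from the harvest source config rather than the .ini, e.g. {"default_visibility": {"organization_visibility": "current"}}; when that key is absent the harvester keeps the harvested organization_visibility value or falls back to "current".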