From 7da983752e6b7301a269c64dde62968287d1bd68 Mon Sep 17 00:00:00 2001 From: agmorev Date: Wed, 29 Nov 2023 02:30:23 +0200 Subject: [PATCH 01/11] SXDEDPCXZIC-241_DATAVIC-622 / DELWP harvest restricted records --- ckanext/datavic_harvester/harvesters/delwp.py | 91 +++++++++++++++---- 1 file changed, 75 insertions(+), 16 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index 26cf548..869624f 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -275,6 +275,12 @@ def import_stage(self, harvest_object: HarvestObject) -> bool | str: pkg_dict = self._get_pkg_dict(harvest_object) + if not pkg_dict["notes"] or not pkg_dict["owner_org"]: + log.info( + f"Description or organization field is missing for object {harvest_object.id}, skipping..." + ) + return False + if status not in ["new", "change"]: return True @@ -350,6 +356,14 @@ def _get_pkg_dict(self, harvest_object): else "" ) + access_notes = """ + Aerial imagery and elevation datasets\n + You can access high-resolution aerial imagery and elevation (LiDAR point cloud) datasets by contacting a business that holds a commercial license.\n + We have two types of commercial licensing:\n + Data Service Providers (DSPs) provide access to the source imagery or elevation data.\n + Value Added Retailers (VARs ) use the imagery and elevation data to create new products and services. This includes advisory services and new knowledge products. + """ + pkg_dict = {} pkg_dict["personal_information"] = "no" @@ -357,11 +371,6 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["access"] = "yes" pkg_dict["organization_visibility"] = "all" pkg_dict["workflow_status"] = "published" - pkg_dict["license_id"] = self.config.get("license_id", "cc-by") - pkg_dict["private"] = self._is_pkg_private( - metashare_dict - ) - pkg_dict["title"] = metashare_dict.get("title") pkg_dict["notes"] = metashare_dict.get("abstract", "") pkg_dict["tags"] = helpers.get_tags(metashare_dict.get("topiccat")) @@ -369,16 +378,13 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["extract"] = f"{pkg_dict['notes'].split('.')[0]}..." 
pkg_dict["owner_org"] = self._get_organisation( self.config.get("organisation_mapping"), - metashare_dict.get("resowner").split(";")[0], + metashare_dict.get("resowner", "").split(";")[0], harvest_object, ) if not pkg_dict.get("name"): pkg_dict["name"] = self._get_package_name(harvest_object, pkg_dict["title"]) - if full_metadata_url: - pkg_dict["full_metadata_url"] = full_metadata_url - if uuid: pkg_dict["primary_purpose_of_collection"] = uuid @@ -412,6 +418,22 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["resources"] = self._fetch_resources(metashare_dict) + pkg_dict["private"] = self._is_pkg_private( + metashare_dict, + pkg_dict["resources"] + ) + + pkg_dict["license_id"] = self.config.get("license_id", "cc-by") + + if pkg_dict["private"]: + pkg_dict["license_id"] = "other-closed" + + if self._is_delwp_raster_data(pkg_dict["resources"]): + pkg_dict["full_metadata_url"] = f"https://metashare.maps.vic.gov.au/geonetwork/srv/api/records/{uuid}/formatters/cip-pdf?root=export&output=pdf" + pkg_dict["access_description"] = access_notes + elif full_metadata_url: + pkg_dict["full_metadata_url"] = full_metadata_url + for key, value in [ ("harvest_source_id", harvest_object.source.id), ("harvest_source_title", harvest_object.source.title), @@ -431,13 +453,50 @@ def _create_custom_package_create_schema(self) -> dict[str, Any]: return package_schema - def _is_pkg_private(self, remote_dict: dict[str, Any]) -> bool: - """Check if the dataset should be private by `resclassification` field - value""" - return remote_dict.get("resclassification") in ( - "limitedDistribution", - "restricted", - ) + def _is_delwp_vector_data(self, resources: list[dict[str, Any]]) -> bool: + for res in resources: + if res["format"].lower() in [ + "dwg", + "dxf", + "gdb", + "shp", + "mif", + "tab", + "extended tab", + "mapinfo", + ]: + return True + + return False + + def _is_delwp_raster_data(self, resources: list[dict[str, Any]]) -> bool: + for res in resources: + if res["format"].lower() in [ + "ecw", + "geotiff", + "jpeg", + "jp2", + "jpeg 2000", + "tiff", + "lass", + "xyz", + ]: + return True + + return False + + def _is_pkg_private( + self, + remote_dict: dict[str, Any], + resources: list[dict[str, Any]] + ) -> bool: + """Check if the dataset should be private""" + if (self._is_delwp_vector_data(resources) and + remote_dict.get("mdclassification") == "unclassified" and + remote_dict.get("resclassification") == "unclassified"): + return False + + return True def _get_organisation( self, From fa98db27c4472cf11f8287dbd232c81e92488460 Mon Sep 17 00:00:00 2001 From: Yan Rudenko Date: Thu, 30 May 2024 13:32:37 +0200 Subject: [PATCH 02/11] SXDEDPCXZIC-308_DATAVIC-622 --- ckanext/datavic_harvester/harvesters/delwp.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index 869624f..e4d0247 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -281,6 +281,13 @@ def import_stage(self, harvest_object: HarvestObject) -> bool | str: ) return False + # Remove restricted Datasets + if pkg_dict["private"]: + log.info( + f"Dataset is Restricted for object {harvest_object.id}, skipping..." 
+ ) + return False + if status not in ["new", "change"]: return True From 9b3a64d1039edd393a874af2318580633025939f Mon Sep 17 00:00:00 2001 From: agmorev Date: Wed, 19 Jun 2024 00:29:38 +0300 Subject: [PATCH 03/11] SXDEDPCXZIC-315 / auto detect size for some data resources --- ckanext/datavic_harvester/harvesters/base.py | 84 +++++++++++++++++++ ckanext/datavic_harvester/harvesters/delwp.py | 5 +- ckanext/datavic_harvester/harvesters/ods.py | 5 ++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 9b15cc4..98ca716 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -15,6 +15,10 @@ log = logging.getLogger(__name__) +MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 1e+8) +CHUNK_SIZE = 16 * 1024 +DOWNLOAD_TIMEOUT = 30 + class DataVicBaseHarvester(HarvesterBase): def __init__(self, **kwargs): @@ -139,3 +143,83 @@ def _make_context(self) -> dict[str, Any]: "model": model, "session": model.Session, } + + +class DataTooBigWarning(Exception): + pass + + +def get_resource_size(resource_url: str) -> int: + """Return external resource size in bytes + + Args: + resource_url (str): a URL for the resource’s source + + Returns: + int: resource size in bytes + """ + + length = 0 + cl = None + + try: + headers = {} + + response = _get_response(resource_url, headers) + cl = response.headers.get('content-length') + + if cl: + response.close() + log.info(f"Resource from url <{resource_url}> length is {cl} bytes.") + return int(cl) + + for chunk in response.iter_content(CHUNK_SIZE): + length += len(chunk) + if length > MAX_CONTENT_LENGTH: + response.close + raise DataTooBigWarning() + + response.close() + + except DataTooBigWarning: + message = f"Resource from url <{resource_url}> is more " \ + f"than {MAX_CONTENT_LENGTH} bytes. Skip its size calculation." 
+ log.warning(message) + length = -1 # for the purpose of search possibility in the db + return length + + except requests.exceptions.HTTPError as error: + log.debug(f"HTTP error: {error}") + + except requests.exceptions.Timeout: + log.warning(f"URL time out after {DOWNLOAD_TIMEOUT}s") + + except requests.exceptions.RequestException as error: + log.warning(f"URL error: {error}") + + log.info(f"Resource from url <{resource_url}> length is {length} bytes.") + + return length + + +def _get_response(url, headers): + def get_url(): + kwargs = {"headers": headers, "timeout": 30, "stream": True} + + if "ckan.download_proxy" in tk.config: + proxy = tk.config.get("ckan.download_proxy") + kwargs["proxies"] = {"http": proxy, "https": proxy} + + return requests.get(url, **kwargs) + + response = get_url() + if response.status_code == 202: + wait = 1 + while wait < 120 and response.status_code == 202: + import time + time.sleep(wait) + response = get_url() + wait *= 3 + response.raise_for_status() + + return response diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index db2d4fb..8af4131 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -17,7 +17,7 @@ from ckanext.harvest.model import HarvestObject, HarvestObjectExtra import ckanext.datavic_harvester.helpers as helpers -from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester +from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester, get_resource_size log = logging.getLogger(__name__) @@ -650,6 +650,9 @@ def _get_resources_by_formats( res["name"] = f"{res['name']} {res_format}".replace("_", "") + res["size"] = get_resource_size(res_url) + res["filesize"] = res["size"] + if attribution: res["attribution"] = attribution diff --git a/ckanext/datavic_harvester/harvesters/ods.py b/ckanext/datavic_harvester/harvesters/ods.py index 21c65bc..cb074aa 100644 --- a/ckanext/datavic_harvester/harvesters/ods.py +++ b/ckanext/datavic_harvester/harvesters/ods.py @@ -15,6 +15,7 @@ import ckan.plugins.toolkit as tk from ckanext.harvest_basket.harvesters import ODSHarvester +from .base import get_resource_size class DataVicODSHarvester(ODSHarvester): @@ -38,4 +39,8 @@ def _fetch_resources(self, source_url, resource_urls, pkg_data): for res in resources: if res["format"] == "CSV": res["url"] = f'{res["url"]}?delimiter=%2C' + + res["size"] = get_resource_size(res["url"]) + res["filesize"] = res["size"] + return resources From 4029b53bcb4fc665e192dbbfb6370ffbed496dfc Mon Sep 17 00:00:00 2001 From: agmorev Date: Thu, 20 Jun 2024 23:46:45 +0300 Subject: [PATCH 04/11] SXDEDPCXZIC-321_DATAVIC-699 / set dcat harvester default visibility --- ckanext/datavic_harvester/harvesters/dcat_json.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..6ffebe6 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -184,7 +184,12 @@ def _set_required_fields_defaults( if not self._get_extra(pkg_dict, "protective_marking"): pkg_dict["protective_marking"] = "official" - if not self._get_extra(pkg_dict, "organization_visibility"): + if not self._get_extra(pkg_dict, "organization_visibility") \ + and "default_visibility" in self.config: + pkg_dict["organization_visibility"] = self.config["default_visibility"][ + "organization_visibility" + ] 
+ else: pkg_dict["organization_visibility"] = "current" pkg_dict["workflow_status"] = "published" From d9256e69b5ab5dcfd88e63d8d57afa6f81cac0af Mon Sep 17 00:00:00 2001 From: agmorev Date: Fri, 21 Jun 2024 00:38:15 +0300 Subject: [PATCH 05/11] SXDEDPCXZIC-321_DATAVIC-699 / fix the logic of default value --- ckanext/datavic_harvester/harvesters/dcat_json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 6ffebe6..54627a6 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -190,7 +190,9 @@ def _set_required_fields_defaults( "organization_visibility" ] else: - pkg_dict["organization_visibility"] = "current" + pkg_dict["organization_visibility"] = self._get_extra( + pkg_dict, "organization_visibility" + ) or "current" pkg_dict["workflow_status"] = "published" From 9397321ce7a4c7486ee11b2fd3697f0b18ba6644 Mon Sep 17 00:00:00 2001 From: alexmorev Date: Mon, 24 Jun 2024 22:58:49 +0300 Subject: [PATCH 06/11] SXDEDPCXZIC-322_DATAVIC-703 / prevent records being updated unnecessarily --- .../datavic_harvester/harvesters/dcat_json.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..d873d1f 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -34,6 +34,25 @@ def gather_stage(self, harvest_job): def import_stage(self, harvest_object): self._set_config(harvest_object.source.config) + + package_dict, dcat_dict = self._get_package_dict(harvest_object) + dcat_modified = dcat_dict.get("modified") + existing_dataset = self._get_existing_dataset(harvest_object.guid) + + if dcat_modified and existing_dataset: + dcat_modified = helpers.convert_date_to_isoformat( + dcat_modified, "modified", dcat_dict["title"] + ).lower().split("t")[0] + + pkg_modified = existing_dataset['date_modified_data_asset'] + + if pkg_modified and pkg_modified == dcat_modified: + log.info( + f"Dataset with id {existing_dataset['id']} wasn't modified " + "from the last harvest. Skipping this dataset..." 
+ ) + return False + return super().import_stage(harvest_object) def _get_package_dict( @@ -43,7 +62,7 @@ def _get_package_dict( conversions of the data""" dcat_dict: dict[str, Any] = json.loads(harvest_object.content) - pkg_dict = converters.dcat_to_ckan(dcat_dict) + pkg_dict = converters.dcat_to_ckan(dcat_dict) soup: BeautifulSoup = BeautifulSoup(pkg_dict["notes"], "html.parser") From d49eed159ac37fba90643fd54f89c4cd6f335298 Mon Sep 17 00:00:00 2001 From: alexmorev Date: Thu, 27 Jun 2024 20:04:07 +0300 Subject: [PATCH 07/11] SXDEDPCXZIC-315 / additional changes for autocalculation --- ckanext/datavic_harvester/harvesters/base.py | 23 +++++++++++++------ .../datavic_harvester/harvesters/dcat_json.py | 13 ++++++++++- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 98ca716..7481dd6 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -15,7 +15,7 @@ log = logging.getLogger(__name__) -MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 1e+8) +MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 104857600) CHUNK_SIZE = 16 * 1024 DOWNLOAD_TIMEOUT = 30 @@ -161,17 +161,26 @@ def get_resource_size(resource_url: str) -> int: length = 0 cl = None + + if not resource_url or MAX_CONTENT_LENGTH <= 0: + return length try: headers = {} response = _get_response(resource_url, headers) - cl = response.headers.get('content-length') - - if cl: - response.close() - log.info(f"Resource from url <{resource_url}> length is {cl} bytes.") - return int(cl) + ct = response.headers.get("content-type") + cl = response.headers.get("content-length") + + if ct and "text/html" in ct: + message = f"Resource from url <{resource_url}> is of HTML type. " \ + "Skip its size calculation." + log.warning(message) + return length + + if cl and int(cl) > MAX_CONTENT_LENGTH: + response.close + raise DataTooBigWarning() for chunk in response.iter_content(CHUNK_SIZE): length += len(chunk) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..88529a8 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -14,7 +14,7 @@ from ckanext.harvest.model import HarvestObject from ckanext.datavic_harvester import helpers -from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester +from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester, get_resource_size log = logging.getLogger(__name__) @@ -246,3 +246,14 @@ def _get_mocked_full_metadata(self): here: str = path.abspath(path.dirname(__file__)) with open(path.join(here, "../data/dcat_json_full_metadata.txt")) as f: return f.read() + + def modify_package_dict(self, package_dict, dcat_dict, harvest_object): + ''' + Allows custom harvesters to modify the package dict before + creating or updating the actual package. 
+ ''' + resources = package_dict["resources"] + for resource in resources: + resource["size"] = get_resource_size(resource["url"]) + resource["filesize"] = resource["size"] + return package_dict From c5ef1d7956b9ee81e5a9ffff4877063cd00a17db Mon Sep 17 00:00:00 2001 From: alexmorev Date: Fri, 28 Jun 2024 22:22:27 +0300 Subject: [PATCH 08/11] SXDEDPCXZIC-315_DATAVIC-691 / add optional use of content-length header --- ckanext/datavic_harvester/harvesters/base.py | 39 ++++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 7481dd6..d35fb06 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -4,6 +4,7 @@ from typing import Optional, Any import requests +import time from ckan import model from ckan.plugins import toolkit as tk @@ -15,7 +16,9 @@ log = logging.getLogger(__name__) -MAX_CONTENT_LENGTH = int(tk.config.get('ckanext.datavic_harvester.max_content_length') or 104857600) +MAX_CONTENT_LENGTH = int( + tk.config.get("ckanext.datavic_harvester.max_content_length") or 104857600 +) CHUNK_SIZE = 16 * 1024 DOWNLOAD_TIMEOUT = 30 @@ -161,7 +164,7 @@ def get_resource_size(resource_url: str) -> int: length = 0 cl = None - + if not resource_url or MAX_CONTENT_LENGTH <= 0: return length @@ -171,30 +174,45 @@ def get_resource_size(resource_url: str) -> int: response = _get_response(resource_url, headers) ct = response.headers.get("content-type") cl = response.headers.get("content-length") + cl_enabled = tk.asbool(tk.config.get( + "ckanext.datavic_harvester.content_length_enabled", False) + ) if ct and "text/html" in ct: - message = f"Resource from url <{resource_url}> is of HTML type. " \ - "Skip its size calculation." + message = ( + f"Resource from url <{resource_url}> is of HTML type. " + "Skip its size calculation." + ) log.warning(message) return length - if cl and int(cl) > MAX_CONTENT_LENGTH: - response.close - raise DataTooBigWarning() + if cl: + if int(cl) > MAX_CONTENT_LENGTH: + response.close() + raise DataTooBigWarning() + + if cl_enabled: + response.close() + log.info( + f"Resource from url <{resource_url}> content-length is {int(cl)} bytes." + ) + return int(cl) for chunk in response.iter_content(CHUNK_SIZE): length += len(chunk) if length > MAX_CONTENT_LENGTH: - response.close + response.close() raise DataTooBigWarning() response.close() except DataTooBigWarning: - message = f"Resource from url <{resource_url}> is more " \ + message = ( + f"Resource from url <{resource_url}> is more " f"than {MAX_CONTENT_LENGTH} bytes. Skip its size calculation." 
+ ) log.warning(message) - length = -1 # for the purpose of search possibility in the db + length = -1 # for the purpose of search possibility in the db return length except requests.exceptions.HTTPError as error: @@ -225,7 +243,6 @@ def get_url(): if response.status_code == 202: wait = 1 while wait < 120 and response.status_code == 202: - import time time.sleep(wait) response = get_url() wait *= 3 From 79cd3d769c123d90c7b5392ad6ae8c6009902edd Mon Sep 17 00:00:00 2001 From: alexmorev Date: Sat, 29 Jun 2024 01:27:20 +0300 Subject: [PATCH 09/11] SXDEDPCXZIC-315 / fix the logic --- ckanext/datavic_harvester/harvesters/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index d35fb06..a7e706d 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -165,7 +165,7 @@ def get_resource_size(resource_url: str) -> int: length = 0 cl = None - if not resource_url or MAX_CONTENT_LENGTH <= 0: + if not resource_url or MAX_CONTENT_LENGTH < 0: return length try: @@ -187,7 +187,7 @@ def get_resource_size(resource_url: str) -> int: return length if cl: - if int(cl) > MAX_CONTENT_LENGTH: + if int(cl) > MAX_CONTENT_LENGTH and MAX_CONTENT_LENGTH > 0: response.close() raise DataTooBigWarning() @@ -208,8 +208,8 @@ def get_resource_size(resource_url: str) -> int: except DataTooBigWarning: message = ( - f"Resource from url <{resource_url}> is more " - f"than {MAX_CONTENT_LENGTH} bytes. Skip its size calculation." + f"Resource from url <{resource_url}> is more than the set limit " + f"{MAX_CONTENT_LENGTH} bytes. Skip its size calculation." ) log.warning(message) length = -1 # for the purpose of search possibility in the db From ac1a8e0fec2bd14e7b27b1ab396541c7bf07936f Mon Sep 17 00:00:00 2001 From: alexmorev Date: Thu, 11 Jul 2024 20:49:51 +0300 Subject: [PATCH 10/11] SXDEDPCXZIC-340 / fix harvester error --- ckanext/datavic_harvester/harvesters/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 9b15cc4..5b7cfce 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -128,8 +128,11 @@ def fetch_stage(self, harvest_object: HarvestObject) -> bool: return True def _delete_package(self, package_id: str, guid: str): - tk.get_action("package_delete")(self._make_context(), {"id": package_id}) - log.info(f"Deleted package {package_id} with guid {guid}") + try: + tk.get_action("package_delete")(self._make_context(), {"id": package_id}) + log.info(f"Deleted package {package_id} with guid {guid}") + except tk.ObjectNotFound: + log.error(f"Package {package_id} not found") def _make_context(self) -> dict[str, Any]: return { From 3893473ba76d8a7eca27817f7903ce288c4676d1 Mon Sep 17 00:00:00 2001 From: alexmorev Date: Fri, 1 Nov 2024 17:36:53 +0200 Subject: [PATCH 11/11] SXDEDPCXZIC-393 / exclude domains from size calculations --- ckanext/datavic_harvester/harvesters/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index a7e706d..a340be0 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -2,6 +2,7 @@ import logging from typing import Optional, Any +from urllib.parse import urlparse import requests import time 
@@ -21,6 +22,9 @@ ) CHUNK_SIZE = 16 * 1024 DOWNLOAD_TIMEOUT = 30 +CONFIG_FSC_EXCLUDED_DOMAINS = tk.aslist( + tk.config.get("ckanext.datavic_harvester.filesize_excluded_domains", "") +) class DataVicBaseHarvester(HarvesterBase): @@ -168,6 +172,10 @@ def get_resource_size(resource_url: str) -> int: if not resource_url or MAX_CONTENT_LENGTH < 0: return length + hostname = urlparse(resource_url).hostname + if hostname in CONFIG_FSC_EXCLUDED_DOMAINS: + return length + try: headers = {}
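
Configuration introduced across this patch series, as a minimal illustrative .ini sketch: the option names are taken from the diffs above, while the values and hostnames are examples only.

    # base.py: hard cap, in bytes, on how much of a resource is streamed when
    # measuring its size (defaults to 104857600); resources over the cap get size -1
    ckanext.datavic_harvester.max_content_length = 104857600

    # base.py: trust the Content-Length response header instead of streaming the body
    # (defaults to false)
    ckanext.datavic_harvester.content_length_enabled = true

    # base.py: hostnames skipped entirely during size calculation (parsed with tk.aslist,
    # compared against urlparse(resource_url).hostname)
    ckanext.datavic_harvester.filesize_excluded_domains = example.com data.example.org

    # optional core CKAN proxy picked up by _get_response() for the size-probing requests
    ckan.download_proxy = http://proxy.example.internal:3128

For the dcat_json harvester, the default visibility added in patches 04-05 is read from the harvest source config rather than the .ini, e.g. {"default_visibility": {"organization_visibility": "current"}}; when that key is absent the harvester keeps the harvested organization_visibility value or falls back to "current".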