Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 2.9.0 [DEV] #79

Open
wants to merge 29 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7da9837
SXDEDPCXZIC-241_DATAVIC-622 / DELWP harvest restricted records
agmorev Nov 29, 2023
c898e82
Merge pull request #56 from dpc-sdp/SXDEDPCXZIC-228
joelwigley Feb 2, 2024
c34dd51
Merge pull request #60 from dpc-sdp/SXDEDPCXZIC-280_DATAVIC-665
joelwigley Apr 15, 2024
4d19bf4
Merge remote-tracking branch 'origin/release-2.6.0' into uat
mutantsan Apr 18, 2024
394f428
Merge pull request #62 from dpc-sdp/SXDEDPCXZIC-302
joelwigley May 17, 2024
fce3e1a
Merge pull request #63 from dpc-sdp/SXDEDPCXZIC-302
joelwigley May 30, 2024
cea5599
Merge pull request #53 from dpc-sdp/SXDEDPCXZIC-241
alexmorev May 30, 2024
fa98db2
SXDEDPCXZIC-308_DATAVIC-622
May 30, 2024
1c0fdcd
Merge pull request #65 from dpc-sdp/SXDEDPCXZIC-308
Engerrs May 30, 2024
ff34999
Merge pull request #66 from dpc-sdp/release-2.6.0
iaroslav13 May 31, 2024
96c93d3
Merge pull request #67 from dpc-sdp/uat
joelwigley Jun 12, 2024
9b3a64d
SXDEDPCXZIC-315 / auto detect size for some data resources
agmorev Jun 18, 2024
4029b53
SXDEDPCXZIC-321_DATAVIC-699 / set dcat harvester default visibility
agmorev Jun 20, 2024
d9256e6
SXDEDPCXZIC-321_DATAVIC-699 / fix the logic of default value
agmorev Jun 20, 2024
16ed3fd
Merge pull request #71 from dpc-sdp/SXDEDPCXZIC-321
joelwigley Jun 24, 2024
9397321
SXDEDPCXZIC-322_DATAVIC-703 / prevent records being updated unnecessa…
alexmorev Jun 24, 2024
d49eed1
SXDEDPCXZIC-315 / additional changes for autocalculation
alexmorev Jun 27, 2024
c5ef1d7
SXDEDPCXZIC-315_DATAVIC-691 / add optional use of content-length header
alexmorev Jun 28, 2024
79cd3d7
SXDEDPCXZIC-315 / fix the logic
alexmorev Jun 28, 2024
ac1a8e0
SXDEDPCXZIC-340 / fix harvester error
alexmorev Jul 11, 2024
a770a14
Merge pull request #75 from dpc-sdp/SXDEDPCXZIC-340
joelwigley Jul 17, 2024
de977f4
Merge pull request #76 from dpc-sdp/uat
iaroslav13 Jul 31, 2024
e9a0587
Merge pull request #73 from dpc-sdp/SXDEDPCXZIC-322
joelwigley Oct 16, 2024
74d5d38
Merge pull request #77 from dpc-sdp/master
joelwigley Oct 17, 2024
24b3970
Merge pull request #69 from dpc-sdp/SXDEDPCXZIC-315
iaroslav13 Oct 30, 2024
3893473
SXDEDPCXZIC-393 / exclude domains from size calculations
alexmorev Nov 1, 2024
2361033
Merge pull request #83 from dpc-sdp/SXDEDPCXZIC-393
iaroslav13 Nov 5, 2024
5efdb20
Merge pull request #84 from dpc-sdp/uat
joelwigley Nov 14, 2024
43426cc
Merge pull request #85 from dpc-sdp/master
joelwigley Nov 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 123 additions & 2 deletions ckanext/datavic_harvester/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import logging
from typing import Optional, Any
from urllib.parse import urlparse

import requests
import time

from ckan import model
from ckan.plugins import toolkit as tk
Expand All @@ -15,6 +17,15 @@

log = logging.getLogger(__name__)

# Upper bound (bytes) on how much of a remote resource is downloaded while
# measuring its size; default 104857600 = 100 MiB. A negative configured
# value disables size calculation entirely (see get_resource_size).
MAX_CONTENT_LENGTH = int(
    tk.config.get("ckanext.datavic_harvester.max_content_length") or 104857600
)
# Read size used when streaming a response body to count its bytes.
CHUNK_SIZE = 16 * 1024
# Request timeout in seconds, reported in the timeout warning log.
DOWNLOAD_TIMEOUT = 30
# Hostnames excluded from filesize calculation ("FSC"), from config.
CONFIG_FSC_EXCLUDED_DOMAINS = tk.aslist(
    tk.config.get("ckanext.datavic_harvester.filesize_excluded_domains", "")
)


class DataVicBaseHarvester(HarvesterBase):
def __init__(self, **kwargs):
Expand Down Expand Up @@ -128,8 +139,11 @@ def fetch_stage(self, harvest_object: HarvestObject) -> bool:
return True

def _delete_package(self, package_id: str, guid: str):
    """Delete a package, tolerating packages that are already gone.

    Args:
        package_id: id of the package to delete
        guid: harvest guid of the package, used only for logging
    """
    try:
        tk.get_action("package_delete")(self._make_context(), {"id": package_id})
        log.info(f"Deleted package {package_id} with guid {guid}")
    except tk.ObjectNotFound:
        # The package may have been removed outside the harvester;
        # log it instead of aborting the harvest run.
        log.error(f"Package {package_id} not found")

def _make_context(self) -> dict[str, Any]:
return {
Expand All @@ -139,3 +153,110 @@ def _make_context(self) -> dict[str, Any]:
"model": model,
"session": model.Session,
}


class DataTooBigWarning(Exception):
    """Raised when a remote resource exceeds the configured size limit."""


def get_resource_size(resource_url: str) -> int:
    """Return external resource size in bytes.

    Args:
        resource_url (str): a URL for the resource's source

    Returns:
        int: resource size in bytes; 0 when the size cannot or should not
        be calculated, or -1 when the resource exceeds MAX_CONTENT_LENGTH
        (a marker value kept searchable in the db).
    """

    length = 0

    # A negative MAX_CONTENT_LENGTH disables size calculation entirely.
    if not resource_url or MAX_CONTENT_LENGTH < 0:
        return length

    # Skip hosts explicitly excluded from filesize calculation.
    if urlparse(resource_url).hostname in CONFIG_FSC_EXCLUDED_DOMAINS:
        return length

    response = None

    try:
        response = _get_response(resource_url, {})

        content_type = response.headers.get("content-type")
        content_length = response.headers.get("content-length")
        cl_enabled = tk.asbool(tk.config.get(
            "ckanext.datavic_harvester.content_length_enabled", False)
        )

        if content_type and "text/html" in content_type:
            message = (
                f"Resource from url <{resource_url}> is of HTML type. "
                "Skip its size calculation."
            )
            log.warning(message)
            return length

        if content_length:
            cl = int(content_length)

            if cl > MAX_CONTENT_LENGTH and MAX_CONTENT_LENGTH > 0:
                raise DataTooBigWarning()

            # Trust the content-length header only when explicitly enabled.
            if cl_enabled:
                log.info(
                    f"Resource from url <{resource_url}> content-length is {cl} bytes."
                )
                return cl

        # No usable header (or header-based sizing disabled): stream the
        # body and count the bytes, bailing out once over the limit.
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise DataTooBigWarning()

    except DataTooBigWarning:
        message = (
            f"Resource from url <{resource_url}> is more than the set limit "
            f"{MAX_CONTENT_LENGTH} bytes. Skip its size calculation."
        )
        log.warning(message)
        length = -1  # for the purpose of search possibility in the db
        return length

    except requests.exceptions.HTTPError as error:
        log.debug(f"HTTP error: {error}")

    except requests.exceptions.Timeout:
        log.warning(f"URL time out after {DOWNLOAD_TIMEOUT}s")

    except requests.exceptions.RequestException as error:
        log.warning(f"URL error: {error}")

    finally:
        # Always release the connection — the previous version leaked the
        # response on error paths (e.g. a timeout raised mid-stream).
        if response is not None:
            response.close()

    log.info(f"Resource from url <{resource_url}> length is {length} bytes.")

    return length


def _get_response(url, headers):
    """GET *url* with streaming enabled, retrying while the server answers
    202 Accepted (resource still being prepared).

    Honours CKAN's ``ckan.download_proxy`` setting and uses the module-wide
    DOWNLOAD_TIMEOUT (the previous version hardcoded 30 here while the
    timeout log message reported DOWNLOAD_TIMEOUT).

    Raises:
        requests.exceptions.HTTPError: for 4xx/5xx final responses.
    """
    def get_url():
        kwargs = {"headers": headers, "timeout": DOWNLOAD_TIMEOUT, "stream": True}

        if "ckan.download_proxy" in tk.config:
            proxy = tk.config.get("ckan.download_proxy")
            kwargs["proxies"] = {"http": proxy, "https": proxy}

        return requests.get(url, **kwargs)

    response = get_url()
    if response.status_code == 202:
        # Exponential backoff: sleep 1s, 3s, 9s, 27s, 81s at most.
        wait = 1
        while wait < 120 and response.status_code == 202:
            time.sleep(wait)
            response = get_url()
            wait *= 3
    response.raise_for_status()

    return response
50 changes: 44 additions & 6 deletions ckanext/datavic_harvester/harvesters/dcat_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ckanext.harvest.model import HarvestObject

from ckanext.datavic_harvester import helpers
from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester
from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester, get_resource_size


log = logging.getLogger(__name__)
Expand All @@ -34,6 +34,25 @@ def gather_stage(self, harvest_job):

def import_stage(self, harvest_object):
    """Skip re-import of datasets unchanged since the last harvest.

    Compares the remote DCAT ``modified`` date (normalised to YYYY-MM-DD)
    with the stored ``date_modified_data_asset`` of the already-harvested
    dataset; returns False (skip) on a match, otherwise defers to the
    parent harvester's import_stage.
    """
    self._set_config(harvest_object.source.config)

    package_dict, dcat_dict = self._get_package_dict(harvest_object)
    dcat_modified = dcat_dict.get("modified")
    existing_dataset = self._get_existing_dataset(harvest_object.guid)

    if dcat_modified and existing_dataset:
        # Normalise e.g. "2024-05-30T12:00:00" to "2024-05-30"; guard
        # against the helper returning nothing for an unparsable date
        # (the previous version would crash on .lower() of None).
        iso_date = helpers.convert_date_to_isoformat(
            dcat_modified, "modified", dcat_dict["title"]
        )
        dcat_modified = iso_date.lower().split("t")[0] if iso_date else None

        # .get(): the extra may be absent on datasets created before this
        # field existed — previously a KeyError.
        pkg_modified = existing_dataset.get("date_modified_data_asset")

        if pkg_modified and pkg_modified == dcat_modified:
            log.info(
                f"Dataset with id {existing_dataset['id']} wasn't modified "
                "from the last harvest. Skipping this dataset..."
            )
            return False

    return super().import_stage(harvest_object)

def _get_package_dict(
Expand All @@ -43,7 +62,7 @@ def _get_package_dict(
conversions of the data"""

dcat_dict: dict[str, Any] = json.loads(harvest_object.content)
pkg_dict = converters.dcat_to_ckan(dcat_dict)
pkg_dict = converters.dcat_to_ckan(dcat_dict)

soup: BeautifulSoup = BeautifulSoup(pkg_dict["notes"], "html.parser")

Expand Down Expand Up @@ -184,11 +203,17 @@ def _set_required_fields_defaults(
if not self._get_extra(pkg_dict, "protective_marking"):
pkg_dict["protective_marking"] = "official"

if not self._get_extra(pkg_dict, "organization_visibility"):
pkg_dict["organization_visibility"] = "current"
if not self._get_extra(pkg_dict, "organization_visibility") \
and "default_visibility" in self.config:
pkg_dict["organization_visibility"] = self.config["default_visibility"][
"organization_visibility"
]
else:
pkg_dict["organization_visibility"] = self._get_extra(
pkg_dict, "organization_visibility"
) or "current"

if not self._get_extra(pkg_dict, "workflow_status"):
pkg_dict["workflow_status"] = "draft"
pkg_dict["workflow_status"] = "published"

issued: Optional[str] = dcat_dict.get("issued")
if issued and not self._get_extra(pkg_dict, "date_created_data_asset"):
Expand All @@ -212,6 +237,8 @@ def _set_required_fields_defaults(

pkg_dict["tag_string"] = dcat_dict.get("keyword", [])

pkg_dict.setdefault("update_frequency", "unknown")

def _get_existing_dataset(self, guid: str) -> Optional[dict[str, Any]]:
"""Return a package with specific guid extra if exists"""

Expand Down Expand Up @@ -245,3 +272,14 @@ def _get_mocked_full_metadata(self):
here: str = path.abspath(path.dirname(__file__))
with open(path.join(here, "../data/dcat_json_full_metadata.txt")) as f:
return f.read()

def modify_package_dict(self, package_dict, dcat_dict, harvest_object):
    """Hook that lets the harvester adjust the package dict before the
    actual package is created or updated.

    Stamps each resource with its externally measured size; ``size`` and
    ``filesize`` both carry the same value in bytes.
    """
    for resource in package_dict["resources"]:
        measured = get_resource_size(resource["url"])
        resource["size"] = measured
        resource["filesize"] = measured
    return package_dict
Loading