Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 2.9.0 [DEV] #79

Open
wants to merge 29 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7da9837
SXDEDPCXZIC-241_DATAVIC-622 / DELWP harvest restricted records
agmorev Nov 29, 2023
c898e82
Merge pull request #56 from dpc-sdp/SXDEDPCXZIC-228
joelwigley Feb 2, 2024
c34dd51
Merge pull request #60 from dpc-sdp/SXDEDPCXZIC-280_DATAVIC-665
joelwigley Apr 15, 2024
4d19bf4
Merge remote-tracking branch 'origin/release-2.6.0' into uat
mutantsan Apr 18, 2024
394f428
Merge pull request #62 from dpc-sdp/SXDEDPCXZIC-302
joelwigley May 17, 2024
fce3e1a
Merge pull request #63 from dpc-sdp/SXDEDPCXZIC-302
joelwigley May 30, 2024
cea5599
Merge pull request #53 from dpc-sdp/SXDEDPCXZIC-241
alexmorev May 30, 2024
fa98db2
SXDEDPCXZIC-308_DATAVIC-622
May 30, 2024
1c0fdcd
Merge pull request #65 from dpc-sdp/SXDEDPCXZIC-308
Engerrs May 30, 2024
ff34999
Merge pull request #66 from dpc-sdp/release-2.6.0
iaroslav13 May 31, 2024
96c93d3
Merge pull request #67 from dpc-sdp/uat
joelwigley Jun 12, 2024
9b3a64d
SXDEDPCXZIC-315 / auto detect size for some data resources
agmorev Jun 18, 2024
4029b53
SXDEDPCXZIC-321_DATAVIC-699 / set dcat harvester default visibility
agmorev Jun 20, 2024
d9256e6
SXDEDPCXZIC-321_DATAVIC-699 / fix the logic of default value
agmorev Jun 20, 2024
16ed3fd
Merge pull request #71 from dpc-sdp/SXDEDPCXZIC-321
joelwigley Jun 24, 2024
9397321
SXDEDPCXZIC-322_DATAVIC-703 / prevent records being updated unnecessa…
alexmorev Jun 24, 2024
d49eed1
SXDEDPCXZIC-315 / additional changes for autocalculation
alexmorev Jun 27, 2024
c5ef1d7
SXDEDPCXZIC-315_DATAVIC-691 / add optional use of content-length header
alexmorev Jun 28, 2024
79cd3d7
SXDEDPCXZIC-315 / fix the logic
alexmorev Jun 28, 2024
ac1a8e0
SXDEDPCXZIC-340 / fix harvester error
alexmorev Jul 11, 2024
a770a14
Merge pull request #75 from dpc-sdp/SXDEDPCXZIC-340
joelwigley Jul 17, 2024
de977f4
Merge pull request #76 from dpc-sdp/uat
iaroslav13 Jul 31, 2024
e9a0587
Merge pull request #73 from dpc-sdp/SXDEDPCXZIC-322
joelwigley Oct 16, 2024
74d5d38
Merge pull request #77 from dpc-sdp/master
joelwigley Oct 17, 2024
24b3970
Merge pull request #69 from dpc-sdp/SXDEDPCXZIC-315
iaroslav13 Oct 30, 2024
3893473
SXDEDPCXZIC-393 / exclude domains from size calculations
alexmorev Nov 1, 2024
2361033
Merge pull request #83 from dpc-sdp/SXDEDPCXZIC-393
iaroslav13 Nov 5, 2024
5efdb20
Merge pull request #84 from dpc-sdp/uat
joelwigley Nov 14, 2024
43426cc
Merge pull request #85 from dpc-sdp/master
joelwigley Nov 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 123 additions & 2 deletions ckanext/datavic_harvester/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import logging
from typing import Optional, Any
from urllib.parse import urlparse

import requests
import time

from ckan import model
from ckan.plugins import toolkit as tk
Expand All @@ -15,6 +17,15 @@

log = logging.getLogger(__name__)

# Upper bound (bytes) on how much of a remote resource is downloaded while
# measuring its size; default 104857600 = 100 MiB. A negative configured
# value disables size calculation entirely (see get_resource_size).
MAX_CONTENT_LENGTH = int(
    tk.config.get("ckanext.datavic_harvester.max_content_length") or 104857600
)
# Read size used when streaming a response body to count its bytes.
CHUNK_SIZE = 16 * 1024
# Request timeout in seconds, reported in the timeout warning log.
DOWNLOAD_TIMEOUT = 30
# Hostnames excluded from filesize calculation ("FSC"), from config.
CONFIG_FSC_EXCLUDED_DOMAINS = tk.aslist(
    tk.config.get("ckanext.datavic_harvester.filesize_excluded_domains", "")
)


class DataVicBaseHarvester(HarvesterBase):
def __init__(self, **kwargs):
Expand Down Expand Up @@ -128,8 +139,11 @@ def fetch_stage(self, harvest_object: HarvestObject) -> bool:
return True

def _delete_package(self, package_id: str, guid: str):
    """Delete a package, tolerating packages that are already gone.

    Args:
        package_id: id of the package to delete
        guid: harvest guid of the package, used only for logging
    """
    try:
        tk.get_action("package_delete")(self._make_context(), {"id": package_id})
        log.info(f"Deleted package {package_id} with guid {guid}")
    except tk.ObjectNotFound:
        # The package may have been removed outside the harvester;
        # log it instead of aborting the harvest run.
        log.error(f"Package {package_id} not found")

def _make_context(self) -> dict[str, Any]:
return {
Expand All @@ -139,3 +153,110 @@ def _make_context(self) -> dict[str, Any]:
"model": model,
"session": model.Session,
}


class DataTooBigWarning(Exception):
    """Raised when a remote resource exceeds the configured size limit."""


def get_resource_size(resource_url: str) -> int:
    """Return external resource size in bytes.

    Args:
        resource_url (str): a URL for the resource's source

    Returns:
        int: resource size in bytes; 0 when the size cannot or should not
        be calculated, or -1 when the resource exceeds MAX_CONTENT_LENGTH
        (a marker value kept searchable in the db).
    """

    length = 0

    # A negative MAX_CONTENT_LENGTH disables size calculation entirely.
    if not resource_url or MAX_CONTENT_LENGTH < 0:
        return length

    # Skip hosts explicitly excluded from filesize calculation.
    if urlparse(resource_url).hostname in CONFIG_FSC_EXCLUDED_DOMAINS:
        return length

    response = None

    try:
        response = _get_response(resource_url, {})

        content_type = response.headers.get("content-type")
        content_length = response.headers.get("content-length")
        cl_enabled = tk.asbool(tk.config.get(
            "ckanext.datavic_harvester.content_length_enabled", False)
        )

        if content_type and "text/html" in content_type:
            message = (
                f"Resource from url <{resource_url}> is of HTML type. "
                "Skip its size calculation."
            )
            log.warning(message)
            return length

        if content_length:
            cl = int(content_length)

            if cl > MAX_CONTENT_LENGTH and MAX_CONTENT_LENGTH > 0:
                raise DataTooBigWarning()

            # Trust the content-length header only when explicitly enabled.
            if cl_enabled:
                log.info(
                    f"Resource from url <{resource_url}> content-length is {cl} bytes."
                )
                return cl

        # No usable header (or header-based sizing disabled): stream the
        # body and count the bytes, bailing out once over the limit.
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise DataTooBigWarning()

    except DataTooBigWarning:
        message = (
            f"Resource from url <{resource_url}> is more than the set limit "
            f"{MAX_CONTENT_LENGTH} bytes. Skip its size calculation."
        )
        log.warning(message)
        length = -1  # for the purpose of search possibility in the db
        return length

    except requests.exceptions.HTTPError as error:
        log.debug(f"HTTP error: {error}")

    except requests.exceptions.Timeout:
        log.warning(f"URL time out after {DOWNLOAD_TIMEOUT}s")

    except requests.exceptions.RequestException as error:
        log.warning(f"URL error: {error}")

    finally:
        # Always release the connection — the previous version leaked the
        # response on error paths (e.g. a timeout raised mid-stream).
        if response is not None:
            response.close()

    log.info(f"Resource from url <{resource_url}> length is {length} bytes.")

    return length


def _get_response(url, headers):
    """GET *url* with streaming enabled, retrying while the server answers
    202 Accepted (resource still being prepared).

    Honours CKAN's ``ckan.download_proxy`` setting and uses the module-wide
    DOWNLOAD_TIMEOUT (the previous version hardcoded 30 here while the
    timeout log message reported DOWNLOAD_TIMEOUT).

    Raises:
        requests.exceptions.HTTPError: for 4xx/5xx final responses.
    """
    def get_url():
        kwargs = {"headers": headers, "timeout": DOWNLOAD_TIMEOUT, "stream": True}

        if "ckan.download_proxy" in tk.config:
            proxy = tk.config.get("ckan.download_proxy")
            kwargs["proxies"] = {"http": proxy, "https": proxy}

        return requests.get(url, **kwargs)

    response = get_url()
    if response.status_code == 202:
        # Exponential backoff: sleep 1s, 3s, 9s, 27s, 81s at most.
        wait = 1
        while wait < 120 and response.status_code == 202:
            time.sleep(wait)
            response = get_url()
            wait *= 3
    response.raise_for_status()

    return response
50 changes: 44 additions & 6 deletions ckanext/datavic_harvester/harvesters/dcat_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ckanext.harvest.model import HarvestObject

from ckanext.datavic_harvester import helpers
from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester
from ckanext.datavic_harvester.harvesters.base import DataVicBaseHarvester, get_resource_size


log = logging.getLogger(__name__)
Expand All @@ -34,6 +34,25 @@ def gather_stage(self, harvest_job):

def import_stage(self, harvest_object):
    """Skip re-import of datasets unchanged since the last harvest.

    Compares the remote DCAT ``modified`` date (normalised to YYYY-MM-DD)
    with the stored ``date_modified_data_asset`` of the already-harvested
    dataset; returns False (skip) on a match, otherwise defers to the
    parent harvester's import_stage.
    """
    self._set_config(harvest_object.source.config)

    package_dict, dcat_dict = self._get_package_dict(harvest_object)
    dcat_modified = dcat_dict.get("modified")
    existing_dataset = self._get_existing_dataset(harvest_object.guid)

    if dcat_modified and existing_dataset:
        # Normalise e.g. "2024-05-30T12:00:00" to "2024-05-30"; guard
        # against the helper returning nothing for an unparsable date
        # (the previous version would crash on .lower() of None).
        iso_date = helpers.convert_date_to_isoformat(
            dcat_modified, "modified", dcat_dict["title"]
        )
        dcat_modified = iso_date.lower().split("t")[0] if iso_date else None

        # .get(): the extra may be absent on datasets created before this
        # field existed — previously a KeyError.
        pkg_modified = existing_dataset.get("date_modified_data_asset")

        if pkg_modified and pkg_modified == dcat_modified:
            log.info(
                f"Dataset with id {existing_dataset['id']} wasn't modified "
                "from the last harvest. Skipping this dataset..."
            )
            return False

    return super().import_stage(harvest_object)

def _get_package_dict(
Expand All @@ -43,7 +62,7 @@ def _get_package_dict(
conversions of the data"""

dcat_dict: dict[str, Any] = json.loads(harvest_object.content)
pkg_dict = converters.dcat_to_ckan(dcat_dict)
pkg_dict = converters.dcat_to_ckan(dcat_dict)

soup: BeautifulSoup = BeautifulSoup(pkg_dict["notes"], "html.parser")

Expand Down Expand Up @@ -184,11 +203,17 @@ def _set_required_fields_defaults(
if not self._get_extra(pkg_dict, "protective_marking"):
pkg_dict["protective_marking"] = "official"

if not self._get_extra(pkg_dict, "organization_visibility"):
pkg_dict["organization_visibility"] = "current"
if not self._get_extra(pkg_dict, "organization_visibility") \
and "default_visibility" in self.config:
pkg_dict["organization_visibility"] = self.config["default_visibility"][
"organization_visibility"
]
else:
pkg_dict["organization_visibility"] = self._get_extra(
pkg_dict, "organization_visibility"
) or "current"

if not self._get_extra(pkg_dict, "workflow_status"):
pkg_dict["workflow_status"] = "draft"
pkg_dict["workflow_status"] = "published"

issued: Optional[str] = dcat_dict.get("issued")
if issued and not self._get_extra(pkg_dict, "date_created_data_asset"):
Expand All @@ -212,6 +237,8 @@ def _set_required_fields_defaults(

pkg_dict["tag_string"] = dcat_dict.get("keyword", [])

pkg_dict.setdefault("update_frequency", "unknown")

def _get_existing_dataset(self, guid: str) -> Optional[dict[str, Any]]:
"""Return a package with specific guid extra if exists"""

Expand Down Expand Up @@ -245,3 +272,14 @@ def _get_mocked_full_metadata(self):
here: str = path.abspath(path.dirname(__file__))
with open(path.join(here, "../data/dcat_json_full_metadata.txt")) as f:
return f.read()

def modify_package_dict(self, package_dict, dcat_dict, harvest_object):
    """Hook that lets the harvester adjust the package dict before the
    actual package is created or updated.

    Stamps each resource with its externally measured size; ``size`` and
    ``filesize`` both carry the same value in bytes.
    """
    for resource in package_dict["resources"]:
        measured = get_resource_size(resource["url"])
        resource["size"] = measured
        resource["filesize"] = measured
    return package_dict
Loading