diff --git a/.circleci/config.yml b/.circleci/config.yml index 9659574..35ed04d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: - image: mongo:3.2 - image: redis - image: udata/elasticsearch:2.4.5 - - image: postgres:alpine + - image: postgres:11-alpine name: db environment: POSTGRES_DB: ckan diff --git a/CHANGELOG.md b/CHANGELOG.md index 75e0032..fe480e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## Current (in progress) -- Nothing yet +- DKAN support [#129](https://github.com/opendatateam/udata-ckan/pull/129) ## 1.2.3 (2019-05-29) diff --git a/docker-compose.yml b/docker-compose.yml index f394a58..027f02d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,7 +22,7 @@ services: - POSTGRES_PASSWORD=ckan - DS_RO_PASS=datastore db: - image: postgres:alpine + image: postgres:11-alpine environment: - POSTGRES_DB=ckan - POSTGRES_USER=ckan diff --git a/requirements/install.pip b/requirements/install.pip index 9f037ca..6886822 100644 --- a/requirements/install.pip +++ b/requirements/install.pip @@ -1,2 +1,3 @@ udata>=1.6.0 requests==2.21.0 +humanfriendly==4.18 diff --git a/setup.py b/setup.py index c11d433..4e15518 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,7 @@ def pip(filename): entry_points={ 'udata.harvesters': [ 'ckan = udata_ckan.harvesters:CkanBackend', + 'dkan = udata_ckan.harvesters:DkanBackend', ], 'udata.models': [ 'ckan = udata_ckan.models', diff --git a/tests/data/dkan-french-w-license.json b/tests/data/dkan-french-w-license.json new file mode 100644 index 0000000..ac94b08 --- /dev/null +++ b/tests/data/dkan-french-w-license.json @@ -0,0 +1,226 @@ +{ + "help": "Return the metadata of a dataset (package) and its resources. :param id: the id or name of the dataset :type id: string", + "success": true, + "result": [ + { + "id": "04be6288-696d-4331-850d-a144871a7e3a", + "name": "antennes-regionales-de-la-region-hauts-de-france-au-01102019-0", + "title": "Antennes régionales de la Région Hauts-de-France (au 01/10/2019)", + "author_email": "sig@hautsdefrance.fr", + "maintainer": "Opendata de la Région Hauts-de-France", + "maintainer_email": "opendata@hautsdefrance.fr", + "license_title": "http://www.etalab.gouv.fr/pages/Licence_ouverte_Open_licence-5899923.html", + "notes": "

Liste et coordonnées des antennes de proximité du conseil régional Hauts-de-France.

\n", + "url": "https://opendata.hautsdefrance.fr/?q=dataset/antennes-regionales-de-la-region-hauts-de-france-au-01102019-0", + "state": "Active", + "private": true, + "revision_timestamp": "jeu, 19/12/2019 - 03:00", + "metadata_created": "mar, 10/12/2019 - 09:23", + "metadata_modified": "2019-09-30 22:00:00", + "creator_user_id": "235f2695-89bd-4a0d-8bcf-b6e26b7b3981", + "type": "Dataset", + "resources": [ + { + "id": "33f30271-cd5c-49ae-b44b-595caae16126", + "revision_id": "", + "url": "https://geocatalogue.hautsdefrance.fr/geonetwork/srv/api/records/4b5f8e1b-de37-47cd-9203-37a59f318b09/attachments/coordonnees_antennes.xlsx", + "description": "

Tableau des données

\n", + "format": "xlsx", + "state": "Active", + "revision_timestamp": "jeu, 19/12/2019 - 03:00", + "name": "coordonnees_antennes.xlsx", + "mimetype": "xlsx", + "size": "42 octets", + "created": "jeu, 19/12/2019 - 03:00", + "resource_group_id": "b72cd25d-1cec-49f6-8c71-297bd373fa01", + "last_modified": "Date changed jeu, 19/12/2019 - 03:00" + }, + { + "id": "ab5948b1-95be-4806-ad8d-efaa9ffe43dc", + "revision_id": "", + "url": "https://sig.hautsdefrance.fr/ext/mv/?config=antenne_regionale.xml#", + "description": "

Visionneuse cartographique avec une représentation simplifiée des données

\n", + "format": "", + "state": "Active", + "revision_timestamp": "jeu, 19/12/2019 - 03:00", + "name": "Visionneuse mviewer", + "mimetype": "", + "size": "", + "created": "jeu, 19/12/2019 - 03:00", + "resource_group_id": "b72cd25d-1cec-49f6-8c71-297bd373fa01", + "last_modified": "Date changed jeu, 19/12/2019 - 03:00" + } + ], + "tags": [ + { + "id": "0800bf74-0728-48ef-b6bb-6e458feff785", + "vocabulary_id": "2", + "name": "ADMINISTRATION" + }, + { + "id": "3e213764-c884-402b-88f5-097a5de38876", + "vocabulary_id": "2", + "name": "ADRESSE" + }, + { + "id": "8da693f4-e4d3-432e-8192-fa4d12ee21e8", + "vocabulary_id": "2", + "name": "AISNE" + }, + { + "id": "40577117-4987-4588-b08c-e1636fa0865b", + "vocabulary_id": "2", + "name": "ANTENNE REGIONALE" + }, + { + "id": "46af567e-d862-4d65-8912-0f7eff3f94a8", + "vocabulary_id": "2", + "name": "DONNEES OUVERTES" + }, + { + "id": "713c6627-5caa-4434-9583-159b8f9fbfea", + "vocabulary_id": "2", + "name": "HAUTS-DE-FRANCE" + }, + { + "id": "1e14a78d-127f-4401-ac4a-09159f5d22b2", + "vocabulary_id": "2", + "name": "NORD" + }, + { + "id": "08f08873-ce41-465b-a055-826f6cee0ca4", + "vocabulary_id": "2", + "name": "OISE" + }, + { + "id": "de6a5ac7-c9cb-42c8-b6a9-506740c54be3", + "vocabulary_id": "2", + "name": "PAS-DE-CALAIS" + }, + { + "id": "b428bef2-9a4c-4545-9a70-1549ae2384a7", + "vocabulary_id": "2", + "name": "POLITIQUE REGIONALE" + }, + { + "id": "f8676e82-0864-4b8f-afd2-2ca3368ccb53", + "vocabulary_id": "2", + "name": "PROXIMITE" + }, + { + "id": "c18de608-ae9b-4043-81e1-27e367461a0d", + "vocabulary_id": "2", + "name": "SOMME" + }, + { + "id": "33efc6a3-86b4-4ca3-b9b1-301d8ce7a379", + "vocabulary_id": "2", + "name": "Services d utilité publique et services publics" + } + ], + "groups": [ + { + "description": "

Conseil régional des Hauts-de-France

\n", + "id": "b72cd25d-1cec-49f6-8c71-297bd373fa01", + "image_display_url": "https://opendata.hautsdefrance.fr/sites/default/files/Logo%20R%C3%A9gion%20HDF-pourleweb.jpg", + "title": "Région Hauts-de-France", + "name": "group/region-hauts-de-france" + } + ], + "extras": [ + { + "key": "access_constraints", + "value": "[]" + }, + { + "key": "bbox-east-long", + "value": "4.65820313" + }, + { + "key": "bbox-north-lat", + "value": "51.16423318" + }, + { + "key": "bbox-south-lat", + "value": "48.80546301" + }, + { + "key": "bbox-west-long", + "value": "1.18652344" + }, + { + "key": "contact-email", + "value": "sig@hautsdefrance.fr" + }, + { + "key": "coupled-resource", + "value": "[]" + }, + { + "key": "dataset-reference-date", + "value": "[{"type": "revision", "value": "2019-10-01"}]" + }, + { + "key": "frequency-of-update", + "value": "asNeeded" + }, + { + "key": "graphic-preview-description", + "value": "logohdf.png" + }, + { + "key": "graphic-preview-file", + "value": "https://geocatalogue.hautsdefrance.fr/geonetwork/srv/api/records/4b5f8e1b-de37-47cd-9203-37a59f318b09/attachments/logohdf.png" + }, + { + "key": "guid", + "value": "4b5f8e1b-de37-47cd-9203-37a59f318b09" + }, + { + "key": "licence", + "value": "[]" + }, + { + "key": "metadata-date", + "value": "2019-11-13T11:24:54" + }, + { + "key": "metadata-language", + "value": "fre" + }, + { + "key": "metadata_created", + "value": "2019-11-13T11:24:54" + }, + { + "key": "metadata_modified", + "value": "2019-11-13T11:24:54" + }, + { + "key": "resource-type", + "value": "dataset" + }, + { + "key": "responsible-party", + "value": "[{"name": "R\\u00e9gion Hauts-de-France", "roles": ["pointOfContact"]}]" + }, + { + "key": "spatial_harvester", + "value": "true" + }, + { + "key": "harvest_object_id", + "value": "281dfceb-ed90-46da-aef6-4ad5e1348150" + }, + { + "key": "harvest_source_id", + "value": "c6a6feb7-fc9a-4a26-ab6a-cd13024c4fdd" + }, + { + "key": "harvest_source_title", + "value": "Données ouvertes publiées par la région Hauts-de-France" + } + ] + } + ] + } diff --git a/tests/test_dkan_backend.py b/tests/test_dkan_backend.py new file mode 100644 index 0000000..90a5165 --- /dev/null +++ b/tests/test_dkan_backend.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import json +import pytest +import os + +from datetime import datetime + +from udata.app import create_app +from udata.core.organization.factories import OrganizationFactory +from udata.harvest import actions +from udata.harvest.tests.factories import HarvestSourceFactory +from udata.models import Dataset +from udata.settings import Defaults, Testing +from udata.tests.plugin import drop_db + + +DKAN_TEST_INSTANCE = 'http://demo.getdkan.com' + + +def data_path(filename): + '''Get a test data path''' + return os.path.join(os.path.dirname(__file__), 'data', filename) + + +class DkanSettings(Testing): + PLUGINS = ['dkan'] + + +@pytest.fixture(scope='module') +def app(request): + '''Create an udata app once for the module. ''' + app = create_app(Defaults, override=DkanSettings) + with app.app_context(): + drop_db(app) + yield app + with app.app_context(): + drop_db(app) + + +@pytest.fixture(scope='module') +def source(app): + ''' + Create an harvest source for an organization. + The source is created once for the module. + ''' + with app.app_context(): + org = OrganizationFactory() + return HarvestSourceFactory(backend='dkan', + url=DKAN_TEST_INSTANCE, + organization=org) + + +def test_dkan_demo_harvest(source, app): + ''' + Harvest DKAN_TEST_INSTANCE and check some datasets are created + ''' + with app.app_context(): + actions.run(source.slug) + source.reload() + job = source.get_last_job() + + assert len(job.items) > 0 + datasets = Dataset.objects.filter(organization=source.organization) + assert len(job.items) == datasets.count() + + for dataset in datasets: + assert len(dataset.resources) > 0 + + assert job.status == 'done' + + +def test_dkan_french_w_license(app, rmock): + '''CKAN Harvester should accept the minimum dataset payload''' + DKAN_URL = 'https://harvest.me/' + API_URL = '{}api/3/action/'.format(DKAN_URL) + PACKAGE_LIST_URL = '{}package_list'.format(API_URL) + PACKAGE_SHOW_URL = '{}package_show'.format(API_URL) + + with open(data_path('dkan-french-w-license.json')) as ifile: + data = json.loads(ifile.read()) + + org = OrganizationFactory() + source = HarvestSourceFactory(backend='dkan', url=DKAN_URL, organization=org) + rmock.get(PACKAGE_LIST_URL, json={'success': True, 'result': ['fake-name']}, status_code=200, + headers={'Content-Type': 'application/json'}) + rmock.get(PACKAGE_SHOW_URL, json=data, status_code=200, + headers={'Content-Type': 'application/json'}) + actions.run(source.slug) + source.reload() + assert source.get_last_job().status == 'done' + + datasets = Dataset.objects.filter(organization=org) + assert len(datasets) > 0 + + q = {'extras__harvest:remote_id': '04be6288-696d-4331-850d-a144871a7e3a'} + dataset = datasets.get(**q) + assert dataset.created_at == datetime(2019, 12, 10, 0, 0) + assert dataset.last_modified == datetime(2019, 9, 30, 0, 0) + assert len(dataset.resources) == 2 + assert 'xlsx' in [r.format for r in dataset.resources] diff --git a/udata_ckan/harvesters.py b/udata_ckan/harvesters.py index f12c4ae..5951834 100644 --- a/udata_ckan/harvesters.py +++ b/udata_ckan/harvesters.py @@ -8,13 +8,10 @@ from uuid import UUID from urlparse import urljoin -from voluptuous import ( - Schema, All, Any, Lower, Coerce, DefaultTo, Optional -) - from udata import uris from udata.i18n import lazy_gettext as _ from udata.core.dataset.rdf import frequency_from_rdf +from udata.frontend.markdown import parse_html from udata.models import ( db, Resource, License, SpatialCoverage, GeoZone, UPDATE_FREQUENCIES, @@ -23,86 +20,14 @@ from udata.harvest.backends.base import BaseBackend, HarvestFilter from udata.harvest.exceptions import HarvestException, HarvestSkipException -from udata.harvest.filters import ( - boolean, email, to_date, slug, normalize_tag, normalize_string, - is_url, empty_none, hash -) + +from .schemas.ckan import schema as ckan_schema +from .schemas.dkan import schema as dkan_schema log = logging.getLogger(__name__) -RESOURCE_TYPES = ('file', 'file.upload', 'api', 'documentation', - 'image', 'visualization') - -ALLOWED_RESOURCE_TYPES = ('file', 'file.upload', 'api', 'metadata') - -resource = { - 'id': basestring, - 'position': int, - 'name': All(DefaultTo(''), basestring), - 'description': All(basestring, normalize_string), - 'format': All(basestring, Lower), - 'mimetype': Any(All(basestring, Lower), None), - 'size': Any(Coerce(int), None), - 'hash': Any(All(basestring, hash), None), - 'created': All(basestring, to_date), - 'last_modified': Any(All(basestring, to_date), None), - 'url': All(basestring, is_url()), - 'resource_type': All(empty_none, - DefaultTo('file'), - basestring, - Any(*RESOURCE_TYPES) - ), -} - -tag = { - 'id': basestring, - 'vocabulary_id': Any(basestring, None), - 'display_name': basestring, - 'name': All(basestring, normalize_tag), - 'state': basestring, -} - -organization = { - 'id': basestring, - 'description': basestring, - 'created': All(basestring, to_date), - 'title': basestring, - 'name': All(basestring, slug), - 'revision_timestamp': All(basestring, to_date), - 'is_organization': boolean, - 'state': basestring, - 'image_url': basestring, - 'revision_id': basestring, - 'type': 'organization', - 'approval_status': 'approved' -} - -schema = Schema({ - 'id': basestring, - 'name': basestring, - 'title': basestring, - 'notes': Any(All(basestring, normalize_string), None), - 'license_id': All(DefaultTo('not-specified'), basestring), - 'license_title': Any(basestring, None), - 'tags': [tag], - - 'metadata_created': All(basestring, to_date), - 'metadata_modified': All(basestring, to_date), - 'organization': Any(organization, None), - 'resources': [resource], - 'revision_id': basestring, - Optional('extras', default=list): [{ - 'key': basestring, - 'value': Any(basestring, int, float, boolean, dict, list), - }], - 'private': boolean, - 'type': 'dataset', - 'author': Any(basestring, None), - 'author_email': All(empty_none, Any(All(basestring, email), None)), - 'maintainer': Any(basestring, None), - 'maintainer_email': All(empty_none, Any(All(basestring, email), None)), - 'state': Any(basestring, None), -}, required=True, extra=True) +# dkan is a dummy value for dkan that does not provide resource_type +ALLOWED_RESOURCE_TYPES = ('dkan', 'file', 'file.upload', 'api', 'metadata') class CkanBackend(BaseBackend): @@ -112,6 +37,7 @@ class CkanBackend(BaseBackend): _('A CKAN Organization name')), HarvestFilter(_('Tag'), 'tags', str, _('A CKAN tag name')), ) + schema = ckan_schema def get_headers(self): headers = super(CkanBackend, self).get_headers() @@ -199,7 +125,10 @@ def initialize(self): def process(self, item): response = self.get_action('package_show', id=item.remote_id) - data = self.validate(response['result'], schema) + data = self.validate(response['result'], self.schema) + + if type(data) == list: + data = data[0] # Fix the remote_id: use real ID instead of not stable name item.remote_id = data['id'] @@ -215,7 +144,7 @@ def process(self, item): if not dataset.slug: dataset.slug = data['name'] dataset.title = data['title'] - dataset.description = data['notes'] + dataset.description = parse_html(data['notes']) # Detect license default_license = dataset.license or License.default() @@ -324,7 +253,7 @@ def process(self, item): resource = Resource(id=res['id']) dataset.resources.append(resource) resource.title = res.get('name', '') or '' - resource.description = res.get('description') + resource.description = parse_html(res.get('description')) resource.url = res['url'] resource.filetype = 'remote' resource.format = res.get('format') @@ -335,3 +264,8 @@ def process(self, item): resource.published = resource.published or resource.created return dataset + + +class DkanBackend(CkanBackend): + schema = dkan_schema + filters = [] diff --git a/udata_ckan/schemas/__init__.py b/udata_ckan/schemas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/udata_ckan/schemas/ckan.py b/udata_ckan/schemas/ckan.py new file mode 100644 index 0000000..333037d --- /dev/null +++ b/udata_ckan/schemas/ckan.py @@ -0,0 +1,80 @@ +from voluptuous import ( + Schema, All, Any, Lower, Coerce, DefaultTo, Optional +) +from udata.harvest.filters import ( + boolean, email, to_date, slug, normalize_tag, normalize_string, + is_url, empty_none, hash +) + +RESOURCE_TYPES = ('file', 'file.upload', 'api', 'documentation', + 'image', 'visualization') + + +resource = { + 'id': basestring, + 'position': int, + 'name': All(DefaultTo(''), basestring), + 'description': All(basestring, normalize_string), + 'format': All(basestring, Lower), + 'mimetype': Any(All(basestring, Lower), None), + 'size': Any(Coerce(int), None), + 'hash': Any(All(basestring, hash), None), + 'created': All(basestring, to_date), + 'last_modified': Any(All(basestring, to_date), None), + 'url': All(basestring, is_url()), + 'resource_type': All(empty_none, + DefaultTo('file'), + basestring, + Any(*RESOURCE_TYPES) + ), +} + +tag = { + 'id': basestring, + Optional('vocabulary_id'): Any(basestring, None), + Optional('display_name'): basestring, + 'name': All(basestring, normalize_tag), + Optional('state'): basestring, +} + +organization = { + 'id': basestring, + 'description': basestring, + 'created': All(basestring, to_date), + 'title': basestring, + 'name': All(basestring, slug), + 'revision_timestamp': All(basestring, to_date), + 'is_organization': boolean, + 'state': basestring, + 'image_url': basestring, + 'revision_id': basestring, + 'type': 'organization', + 'approval_status': 'approved' +} + +schema = Schema({ + 'id': basestring, + 'name': basestring, + 'title': basestring, + 'notes': Any(All(basestring, normalize_string), None), + 'license_id': All(DefaultTo('not-specified'), basestring), + 'license_title': Any(basestring, None), + 'tags': [tag], + + 'metadata_created': All(basestring, to_date), + 'metadata_modified': All(basestring, to_date), + 'organization': Any(organization, None), + 'resources': [resource], + 'revision_id': basestring, + Optional('extras', default=list): [{ + 'key': basestring, + 'value': Any(basestring, int, float, boolean, dict, list), + }], + 'private': boolean, + 'type': 'dataset', + 'author': Any(basestring, None), + 'author_email': All(empty_none, Any(All(basestring, email), None)), + 'maintainer': Any(basestring, None), + 'maintainer_email': All(empty_none, Any(All(basestring, email), None)), + 'state': Any(basestring, None), +}, required=True, extra=True) diff --git a/udata_ckan/schemas/dkan.py b/udata_ckan/schemas/dkan.py new file mode 100644 index 0000000..2b03675 --- /dev/null +++ b/udata_ckan/schemas/dkan.py @@ -0,0 +1,97 @@ +import dateutil.parser +from humanfriendly import parse_size +from voluptuous import ( + Schema, All, Any, Lower, DefaultTo, Optional +) + +from udata.harvest.filters import ( + boolean, email, slug, normalize_string, + is_url, empty_none, hash +) + +from .ckan import RESOURCE_TYPES, tag + + +class FrenchParserInfo(dateutil.parser.parserinfo): + WEEKDAYS = [('Lun', 'Lundi'), + ('Mar', 'Mardi'), + ('Mer', 'Mercredi'), + ('Jeu', 'Jeudi'), + ('Ven', 'Vendredi'), + ('Sam', 'Samedi'), + ('Dim', 'Dimanche')] + + +def parse_date(value, **kwargs): + return dateutil.parser.parse(value, **kwargs).date() + + +def to_date(value): + ''' + Try w/ french weekdays then dateutil's default + `fuzzy` is used when 'Date changed' is in the value + ''' + try: + return parse_date(value, fuzzy=True, parserinfo=FrenchParserInfo(), dayfirst=True) + except ValueError: + return parse_date(value, fuzzy=True) + + +def dkan_parse_size(value): + if value: + # not strictly true but should be enough + value = value.replace('octets', 'bytes') + return parse_size(value) + + +resource = { + 'id': basestring, + 'name': All(DefaultTo(''), basestring), + 'description': All(basestring, normalize_string), + 'format': All(basestring, Lower), + 'mimetype': Any(All(basestring, Lower), None), + 'size': All(basestring, dkan_parse_size), + Optional('hash'): Any(All(basestring, hash), None), + 'created': All(basestring, to_date), + 'last_modified': Any(All(basestring, to_date), None), + 'url': All(basestring, is_url()), + Optional('resource_type', default='dkan'): All( + empty_none, + DefaultTo('file'), + basestring, + Any(*RESOURCE_TYPES) + ), +} + +group = { + 'id': basestring, + 'description': basestring, + 'image_display_url': basestring, + 'title': basestring, + 'name': All(basestring, slug), +} + +schema = Schema([{ + 'id': basestring, + 'name': basestring, + 'title': basestring, + 'notes': Any(All(basestring, normalize_string), None), + Optional('license_id', default=None): All(DefaultTo('not-specified'), basestring), + Optional('license_title', default=None): Any(basestring, None), + Optional('tags', default=list): [tag], + 'metadata_created': All(basestring, to_date), + 'metadata_modified': All(basestring, to_date), + Optional('groups'): [Any(group, None)], + 'resources': [resource], + Optional('extras', default=list): [{ + 'key': basestring, + 'value': Any(basestring, int, float, boolean, dict, list), + }], + 'private': boolean, + 'type': 'Dataset', + Optional('author'): Any(basestring, None), + Optional('author_email'): All(empty_none, Any(All(basestring, email), None)), + 'maintainer': Any(basestring, None), + 'maintainer_email': All(empty_none, Any(All(basestring, email), None)), + 'state': Any(basestring, None), +}], required=True, extra=True)