From 7c99d46f673d291b94cbed49dbbc84e1f328e412 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 20 Nov 2024 13:16:11 +0100 Subject: [PATCH] add support for oereblex api version 1.2.5 --- CHANGELOG | 6 +- geolink_formatter/entity.py | 17 +- geolink_formatter/parser.py | 203 ++++++++++------ geolink_formatter/schema/v1.2.5.xsd | 113 +++++++++ tests/resources/geolink_v1.2.5.xml | 107 ++++++++ tests/resources/geolink_v1.2.5_ml.xml | 336 ++++++++++++++++++++++++++ tests/test_parser.py | 167 ++++++++++++- 7 files changed, 876 insertions(+), 73 deletions(-) create mode 100644 geolink_formatter/schema/v1.2.5.xsd create mode 100644 tests/resources/geolink_v1.2.5.xml create mode 100644 tests/resources/geolink_v1.2.5_ml.xml diff --git a/CHANGELOG b/CHANGELOG index d43968a7..03604aa1 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,10 +1,10 @@ -Changelog -========= +Change log +========== unreleased ---------- -Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4 (default) +Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4, v1.2.5 (default) - Drop support for Python 3.8 - Dependency updates diff --git a/geolink_formatter/entity.py b/geolink_formatter/entity.py index 939a0a79..6e1d0ba0 100644 --- a/geolink_formatter/entity.py +++ b/geolink_formatter/entity.py @@ -12,7 +12,8 @@ class Document(object): def __init__(self, files, id=None, category=None, doctype=None, federal_level=None, authority=None, authority_url=None, title=None, number=None, abbreviation=None, instance=None, type=None, subtype=None, decree_date=None, enactment_date=None, abrogation_date=None, cycle=None, - municipality=None, index=None, status=None, status_start_date=None, status_end_date=None): + municipality=None, index=None, status=None, status_start_date=None, status_end_date=None, + language=None, language_link=None): """Creates a new document instance. 
Args: @@ -38,6 +39,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No status (str or None): The status of the prebublication. status_start_date (datetime.date or None): Start date of the status. status_end_date (datetime.date or None): End date of the status. + language (str or None): Language of the document. + language_link (str or None): Language of the geolink/prepublink collection. Raises: TypeError: Raised on missing argument or invalid argument type. @@ -109,6 +112,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No self._status = status self._status_start_date = status_start_date self._status_end_date = status_end_date + self._language = language + self._language_link = language_link @property def files(self): @@ -220,6 +225,16 @@ def status_end_date(self): """datetime.date: End date of the status (since v1.2.2).""" return self._status_end_date + @property + def language(self): + """str: Language of the document (since v1.2.5).""" + return self._language + + @property + def language_link(self): + """str: Language of the geolink or prepublink (since v1.2.5).""" + return self._language_link + class File(object): def __init__(self, category=None, href=None, title=None, description=None): diff --git a/geolink_formatter/parser.py b/geolink_formatter/parser.py index d11b763e..65949728 100644 --- a/geolink_formatter/parser.py +++ b/geolink_formatter/parser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- import datetime -import pkg_resources +from importlib import resources import requests from lxml.etree import DTD, DocumentInvalid, fromstring from xmlschema import XMLSchema11 @@ -32,7 +32,10 @@ class SCHEMA(object): """str: geoLink schema version 1.2.3""" V1_2_4 = '1.2.4' - """str: geoLink schema version 1.2.3""" + """str: geoLink schema version 1.2.4""" + + V1_2_5 = '1.2.5' + """str: geoLink schema version 1.2.5""" class XML(object): @@ -40,7 +43,7 @@ class XML(object): _date_format = '%Y-%m-%d' 
"""str: Format of date values in XML.""" - def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_validation=True): + def __init__(self, host_url=None, version='1.2.5', dtd_validation=False, xsd_validation=True): """Create a new XML parser instance containing the geoLink XSD for validation. Args: @@ -57,9 +60,9 @@ def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_val self._version = version self._dtd_validation = dtd_validation self._xsd_validation = xsd_validation - xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version)) + xsd = resources.files('geolink_formatter') / 'schema' / 'v{0}.xsd'.format(version) if self._xsd_validation: - with open(xsd, encoding='utf-8') as f: + with xsd.open(mode='r', encoding='utf-8') as f: self._schema = XMLSchema11(f.read()) @property @@ -94,6 +97,119 @@ def _parse_xml(self, xml): raise DocumentInvalid('Missing DTD in parsed content') return content
+    def _process_single_document(self, document_el, language_link):
+        """
+        Processes a single document element.
+
+        Args:
+            document_el (lxml.etree._Element): element 'document'
+            language_link (str or None): language of the enclosing 'geolinks' or 'prepublinks'
+                element
+
+        Returns:
+            geolink_formatter.entity.Document: document
+
+        """
+        attrib = document_el.attrib
+        doc_id = attrib.get('id')
+        doctype = attrib.get('doctype')
+
+        # Mangle doc_id for notices. While IDs are unique between decrees
+        # and edicts, this is not the case when adding notices to the mix.
+        if doctype == 'notice':
+            doc_id += doctype
+
+        files = list()
+        for file_el in document_el.iter('file'):
+            href = file_el.attrib.get('href')
+            # Prefix relative links with the host URL; a missing href is passed on as None.
+            if href is not None and self.host_url and not href.startswith((u'http://', u'https://')):
+                href = u'{host}{href}'.format(host=self.host_url, href=href)
+            files.append(File(
+                title=file_el.attrib.get('title'),
+                description=file_el.attrib.get('description'),
+                href=href,
+                category=file_el.attrib.get('category')
+            ))
+
+        # Parse the date attributes; missing or empty values are passed on unchanged.
+        dates = dict()
+        for name in ('enactment_date', 'decree_date', 'abrogation_date',
+                     'status_start_date', 'status_end_date'):
+            value = attrib.get(name)
+            if value:
+                value = datetime.datetime.strptime(value, self._date_format).date()
+            dates[name] = value
+
+        document = Document(
+            files=files,
+            id=doc_id,
+            category=attrib.get('category'),
+            doctype=doctype,
+            federal_level=attrib.get('federal_level'),
+            authority=attrib.get('authority'),
+            authority_url=attrib.get('authority_url'),
+            title=attrib.get('title'),
+            number=attrib.get('number'),
+            abbreviation=attrib.get('abbreviation'),
+            instance=attrib.get('instance'),
+            type=attrib.get('type'),
+            subtype=attrib.get('subtype'),
+            decree_date=dates['decree_date'],
+            enactment_date=dates['enactment_date'],
+            abrogation_date=dates['abrogation_date'],
+            cycle=attrib.get('cycle'),
+            municipality=attrib.get('municipality'),
+            index=attrib.get('index'),
+            status=attrib.get('status'),
+            status_start_date=dates['status_start_date'],
+            status_end_date=dates['status_end_date'],
+            language=attrib.get('language'),
+            language_link=language_link
+        )
+
+        assert document.id is not None
+
+        return document
+
+    def _process_geolinks_prepublinks(self, geolink_prepublink_el):
+        """
+        Processes a 'geolinks' or 'prepublinks' element.
+
+        Args:
+            geolink_prepublink_el (lxml.etree._Element): element 'geolinks' or 'prepublinks'
+
+        Returns:
+            list[geolink_formatter.entity.Document]: list of documents
+        """
+        language_link = geolink_prepublink_el.get('language')
+        return [
+            self._process_single_document(document_el, language_link)
+            for document_el in geolink_prepublink_el.iter('document')
+        ]
+
+    def _filter_duplicated_documents(self, documents):
+        """
+        Filters duplicated documents.
+
+        Two documents are duplicates if they share the same ID and the same link language. The
+        first occurrence is kept and the input order is preserved.
+
+        Args:
+            documents (list[geolink_formatter.entity.Document]): list of documents
+
+        Returns:
+            list[geolink_formatter.entity.Document]: filtered list of documents
+        """
+        seen = set()
+        documents_filtered = list()
+        for document in documents:
+            key = (document.id, document.language_link)
+            if key not in seen:
+                seen.add(key)
+                documents_filtered.append(document)
+        return documents_filtered
+
 def from_string(self, xml): """Parses XML into internal structure. @@ -111,70 +227,17 @@ def from_string(self, xml): root = self._parse_xml(xml) documents = list() - for document_el in root.iter('document'): - doc_id = document_el.attrib.get('id') - doctype = document_el.attrib.get('doctype') - - # Mangle doc_id for notices. While IDs are unique between decrees - # and edicts, this is not the case when adding notices to the mix. 
- if doctype == 'notice': - doc_id += doctype - - if doc_id and doc_id not in [doc.id for doc in documents]: - files = list() - for file_el in document_el.iter('file'): - href = file_el.attrib.get('href') - if self.host_url and not href.startswith(u'http://') and not href.startswith(u'https://'): - href = u'{host}{href}'.format(host=self.host_url, href=href) - files.append(File( - title=file_el.attrib.get('title'), - description=file_el.attrib.get('description'), - href=href, - category=file_el.attrib.get('category') - )) - enactment_date = document_el.attrib.get('enactment_date') - if enactment_date: - enactment_date = datetime.datetime.strptime(enactment_date, self._date_format).date() - decree_date = document_el.attrib.get('decree_date') - if decree_date: - decree_date = datetime.datetime.strptime(decree_date, self._date_format).date() - abrogation_date = document_el.attrib.get('abrogation_date') - if abrogation_date: - abrogation_date = datetime.datetime.strptime(abrogation_date, self._date_format).date() - status_start_date = document_el.attrib.get('status_start_date') - if status_start_date: - status_start_date = datetime.datetime.strptime(status_start_date, self._date_format)\ - .date() - status_end_date = document_el.attrib.get('status_end_date') - if status_end_date: - status_end_date = datetime.datetime.strptime(status_end_date, self._date_format)\ - .date() - - documents.append(Document( - files=files, - id=doc_id, - category=document_el.attrib.get('category'), - doctype=document_el.attrib.get('doctype'), - federal_level=document_el.attrib.get('federal_level'), - authority=document_el.attrib.get('authority'), - authority_url=document_el.attrib.get('authority_url'), - title=document_el.attrib.get('title'), - number=document_el.attrib.get('number'), - abbreviation=document_el.attrib.get('abbreviation'), - instance=document_el.attrib.get('instance'), - type=document_el.attrib.get('type'), - subtype=document_el.attrib.get('subtype'), - 
decree_date=decree_date, - enactment_date=enactment_date, - abrogation_date=abrogation_date, - cycle=document_el.attrib.get('cycle'), - municipality=document_el.attrib.get('municipality'), - index=document_el.attrib.get('index'), - status=document_el.attrib.get('status'), - status_start_date=status_start_date, - status_end_date=status_end_date - )) + # evaluate root element's tag + if root.tag == 'multilang_geolinks': + for el in root.iter('geolinks', 'prepublinks'): + documents.extend(self._process_geolinks_prepublinks(el)) + elif root.tag in ['geolinks', 'prepublinks']: + documents.extend(self._process_geolinks_prepublinks(root)) + else: + raise RuntimeError('Unexpected tag name: {}'.format(root.tag)) + # filter documents (remove duplicates) + documents = self._filter_duplicated_documents(documents) return documents def from_url(self, url, params=None, **kwargs): diff --git a/geolink_formatter/schema/v1.2.5.xsd b/geolink_formatter/schema/v1.2.5.xsd new file mode 100644 index 00000000..7e61ad08 --- /dev/null +++ b/geolink_formatter/schema/v1.2.5.xsd @@ -0,0 +1,113 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/geolink_v1.2.5.xml b/tests/resources/geolink_v1.2.5.xml new file mode 100644 index 00000000..d2c800a3 --- /dev/null +++ b/tests/resources/geolink_v1.2.5.xml @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/geolink_v1.2.5_ml.xml b/tests/resources/geolink_v1.2.5_ml.xml new file mode 100644 index 00000000..d6f009b0 --- /dev/null +++ b/tests/resources/geolink_v1.2.5_ml.xml @@ -0,0 +1,336 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py index 24255af3..066cee5b 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,11 +1,13 @@ # -*- coding: utf-8 -*- import pytest import requests_mock -from lxml.etree import DocumentInvalid, _Element import xmlschema +from unittest.mock import patch +from lxml.etree import DocumentInvalid, _Element, Element, SubElement from requests import RequestException from geolink_formatter.parser import XML, SCHEMA +from geolink_formatter.entity import Document def test_xml_init(): @@ -351,6 +353,53 @@ def test_schema_version_1_2_4_faulty_geolink(): XML(version=SCHEMA.V1_2_4).from_url('http://oereblex.test.com/api/geolinks/1500.xml') +def test_schema_version_1_2_5(): + """ + test of schema version 1.2.5 + """ + with requests_mock.mock() as mock_m: + with open('tests/resources/geolink_v1.2.5.xml', 'rb') as file_f: + mock_m.get('http://oereblex.test.com/api/geolinks/1500.xml', content=file_f.read()) + documents = XML(version=SCHEMA.V1_2_5).from_url('http://oereblex.test.com/api/geolinks/1500.xml') + assert len(documents) == 12 + assert documents[0].index is None + assert documents[0].id == '400' + assert documents[-11].id == '390' + assert documents[-10].id == '17' + assert documents[-9].id == '18' + assert documents[-8].id == '34' + assert documents[-7].id == '19' + assert documents[-6].id == '23' + assert documents[-5].id == '24' + assert documents[-4].id == '5' + assert documents[-3].id == '11' + assert documents[-2].id == '13' + assert documents[-1].id == '14' + + +def test_schema_version_1_2_5_ml(): + """ + test of schema version 1.2.5 + """ + with requests_mock.mock() as mock_m: + with open('tests/resources/geolink_v1.2.5_ml.xml', 'rb') as file_f: + 
mock_m.get('http://oereblex.test.com/api/geolinks/1500.xml', content=file_f.read()) + documents = XML(version=SCHEMA.V1_2_5).from_url('http://oereblex.test.com/api/geolinks/1500.xml') + assert len(documents) == 36 + assert documents[0].id == '400' + assert documents[-11].id == '390' + assert documents[-10].id == '17' + assert documents[-9].id == '18' + assert documents[-8].id == '34' + assert documents[-7].id == '19' + assert documents[-6].id == '23' + assert documents[-5].id == '24' + assert documents[-4].id == '5' + assert documents[-3].id == '11' + assert documents[-2].id == '13' + assert documents[-1].id == '14' + + def test_default_version_with_locale(): with requests_mock.mock() as mock_m: with open('tests/resources/geolink_v1.2.1.xml', 'rb') as file_f: @@ -377,3 +426,122 @@ def test_dtd_validation_invalid(): """ )
+
+
+@pytest.mark.parametrize('docs,exp_docs_filtered', [
+    (
+        [
+            Document([], id=1, language_link='de'),
+            Document([], id=1, language_link='fr'),
+            Document([], id=2, language_link='de')],
+        [
+            Document([], id=1, language_link='de'),
+            Document([], id=1, language_link='fr'),
+            Document([], id=2, language_link='de')]
+    ),
+    (
+        [
+            Document([], id=2, language_link='de'),
+            Document([], id=1, language_link='fr'),
+            Document([], id=1, language_link='de'),
+            Document([], id=2, language_link='fr'),
+            Document([], id=2, language_link='de')
+        ],
+        [
+            Document([], id=1, language_link='de'),
+            Document([], id=1, language_link='fr'),
+            Document([], id=2, language_link='de'),
+            Document([], id=2, language_link='fr')
+        ]
+    ),
+    (
+        [
+            Document([], id=2, language_link='de'),
+            Document([], id=1, language_link=None),
+            Document([], id=2, language_link=None),
+            Document([], id=1, language_link=None)
+        ],
+        [
+            Document([], id=2, language_link='de'),
+            Document([], id=1, language_link=None),
+            Document([], id=2, language_link=None),
+        ]
+    )
+])
+def test_filter_duplicated_documents(docs, exp_docs_filtered):
+    """Documents repeating an (id, language_link) pair are dropped; order is not compared."""
+    result = XML()._filter_duplicated_documents(docs)
+
+    result = sorted(result, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False)
+
+    exp_docs_filtered = sorted(
+        exp_docs_filtered, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False)
+
+    assert [[x.id, x.language_link] for x in result] == \
+        [[x.id, x.language_link] for x in exp_docs_filtered]
+
+
+@pytest.fixture()
+def provide_geolinks_el():
+    """Provides a 'geolinks' element (language 'de') holding two 'document' elements."""
+    geolinks_el = Element('geolinks', attrib={'language': 'de'})
+    SubElement(geolinks_el, 'document', attrib={'id': '1'})
+    SubElement(geolinks_el, 'document', attrib={'id': '2'})
+    yield geolinks_el
+
+
+def test_process_geolinks_prepublinks(provide_geolinks_el):
+    """One document is returned per 'document' child element."""
+    with patch.object(
+        XML,
+        '_process_single_document',
+        return_value=Document([], id=1, language_link='de')
+    ):
+        result = XML()._process_geolinks_prepublinks(provide_geolinks_el)
+        assert all(isinstance(x, Document) for x in result)
+        assert len(result) == 2
+
+
+@pytest.fixture()
+def provide_document_el():
+    """Provides a 'document' element with two 'file' child elements."""
+    document_el = Element(
+        'document',
+        attrib={
+            'language': 'de',
+            'authority': 'Gemeindeverwaltung',
+            'authority_url': 'https://www.domleschg.ch',
+            'category': 'main',
+            'doctype': 'decree',
+            'enactment_date': '2012-06-22',
+            'federal_level': 'Gemeinde',
+            'id': '400',
+            'municipality': 'Domleschg',
+            'number': '12.GDEf1',
+            'subtype': 'Domleschg (Paspels) 3634',
+            'title': 'Quartierplan Radiend',
+            'type': 'Nutzungsplanung - Quartierplanverfahren'
+        }
+    )
+    SubElement(document_el, 'file', attrib={
+        'category': 'main',
+        'description': '3634_B_QP_Radiend_Platzhalter.pdf',
+        'href': '/api/attachments/1123',
+        'title': '3634_B_QP_Radiend_Platzhalter.pdf'
+    })
+    SubElement(document_el, 'file', attrib={
+        'category': 'additional',
+        'description': '',
+        'href': '/api/attachments/6027',
+        'title': 'PlatzhalterFehlendeDokumente.pdf'
+    })
+
+    yield document_el
+
+
+def test_process_single_document(provide_document_el):
+    """The ID, the link language and the files are taken over into the document."""
+    document = XML()._process_single_document(provide_document_el, language_link='de')
+    assert document.id == '400'
+    assert document.language_link == 'de'
+    assert document.files[1].title == 'PlatzhalterFehlendeDokumente.pdf'