From 4313fba1400f3cddd2cb2c065ea22c2b13612643 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 20 Nov 2024 13:16:11 +0100 Subject: [PATCH 1/5] add support for oereblex api version 1.2.5 --- CHANGELOG | 6 +- geolink_formatter/entity.py | 17 +- geolink_formatter/parser.py | 203 ++++++++++------ geolink_formatter/schema/v1.2.5.xsd | 113 +++++++++ tests/resources/geolink_v1.2.5.xml | 107 ++++++++ tests/resources/geolink_v1.2.5_ml.xml | 336 ++++++++++++++++++++++++++ tests/test_parser.py | 167 ++++++++++++- 7 files changed, 876 insertions(+), 73 deletions(-) create mode 100644 geolink_formatter/schema/v1.2.5.xsd create mode 100644 tests/resources/geolink_v1.2.5.xml create mode 100644 tests/resources/geolink_v1.2.5_ml.xml diff --git a/CHANGELOG b/CHANGELOG index d43968a7..03604aa1 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,10 +1,10 @@ -Changelog -========= +Change log +========== unreleased ---------- -Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4 (default) +Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4, v1.2.5 (default) - Drop support for Python 3.8 - Dependency updates diff --git a/geolink_formatter/entity.py b/geolink_formatter/entity.py index 939a0a79..6e1d0ba0 100644 --- a/geolink_formatter/entity.py +++ b/geolink_formatter/entity.py @@ -12,7 +12,8 @@ class Document(object): def __init__(self, files, id=None, category=None, doctype=None, federal_level=None, authority=None, authority_url=None, title=None, number=None, abbreviation=None, instance=None, type=None, subtype=None, decree_date=None, enactment_date=None, abrogation_date=None, cycle=None, - municipality=None, index=None, status=None, status_start_date=None, status_end_date=None): + municipality=None, index=None, status=None, status_start_date=None, status_end_date=None, + language=None, language_link=None): """Creates a new document instance. Args: @@ -38,6 +39,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No status (str or None): The status of the prebublication. status_start_date (datetime.date or None): Start date of the status. status_end_date (datetime.date or None): End date of the status. + language (str or None): Language of the document. + language_link (str or None): Language of the geolink/prepublink collection. Raises: TypeError: Raised on missing argument or invalid argument type. 
@@ -109,6 +112,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No self._status = status self._status_start_date = status_start_date self._status_end_date = status_end_date + self._language = language + self._language_link = language_link @property def files(self): @@ -220,6 +225,16 @@ def status_end_date(self): """datetime.date: End date of the status (since v1.2.2).""" return self._status_end_date + @property + def language(self): + """str: Language of the document (since v1.2.5).""" + return self._language + + @property + def language_link(self): + """str: Language of the geolink or prepublink (since v1.2.5).""" + return self._language_link + class File(object): def __init__(self, category=None, href=None, title=None, description=None): diff --git a/geolink_formatter/parser.py b/geolink_formatter/parser.py index d11b763e..65949728 100644 --- a/geolink_formatter/parser.py +++ b/geolink_formatter/parser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- import datetime -import pkg_resources +from importlib import resources import requests from lxml.etree import DTD, DocumentInvalid, fromstring from xmlschema import XMLSchema11 @@ -32,7 +32,10 @@ class SCHEMA(object): """str: geoLink schema version 1.2.3""" V1_2_4 = '1.2.4' - """str: geoLink schema version 1.2.3""" + """str: geoLink schema version 1.2.4""" + + V1_2_5 = '1.2.5' + """str: geoLink schema version 1.2.5""" class XML(object): @@ -40,7 +43,7 @@ class XML(object): _date_format = '%Y-%m-%d' """str: Format of date values in XML.""" - def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_validation=True): + def __init__(self, host_url=None, version='1.2.5', dtd_validation=False, xsd_validation=True): """Create a new XML parser instance containing the geoLink XSD for validation. Args: @@ -57,9 +60,9 @@ def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_val self._version = version self._dtd_validation = dtd_validation self._xsd_validation = xsd_validation - xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version)) + xsd = resources.files('geolink_formatter') / 'schema' / 'v{0}.xsd'.format(version) if self._xsd_validation: - with open(xsd, encoding='utf-8') as f: + with xsd.open(mode='r', encoding='utf-8') as f: self._schema = XMLSchema11(f.read()) @property @@ -94,6 +97,123 @@ def _parse_xml(self, xml): raise DocumentInvalid('Missing DTD in parsed content') return content + def _process_single_document(self, document_el, language_link): + """ + Processes a single document element. + + Args: + document_el (lxml.etree._Element): element 'document' + language_link (str): language of the documents set + + Returns: + geolink_formatter.entity.Document: document + + """ + doc_id = document_el.attrib.get('id') + doctype = document_el.attrib.get('doctype') + + # Mangle doc_id for notices. While IDs are unique between decrees + # and edicts, this is not the case when adding notices to the mix. 
+ if doctype == 'notice': + doc_id += doctype + + files = list() + for file_el in document_el.iter('file'): + href = file_el.attrib.get('href') + if self.host_url and not href.startswith(u'http://') and not href.startswith(u'https://'): + href = u'{host}{href}'.format(host=self.host_url, href=href) + files.append(File( + title=file_el.attrib.get('title'), + description=file_el.attrib.get('description'), + href=href, + category=file_el.attrib.get('category') + )) + enactment_date = document_el.attrib.get('enactment_date') + if enactment_date: + enactment_date = datetime.datetime.strptime(enactment_date, self._date_format).date() + decree_date = document_el.attrib.get('decree_date') + if decree_date: + decree_date = datetime.datetime.strptime(decree_date, self._date_format).date() + abrogation_date = document_el.attrib.get('abrogation_date') + if abrogation_date: + abrogation_date = datetime.datetime.strptime(abrogation_date, self._date_format).date() + status_start_date = document_el.attrib.get('status_start_date') + if status_start_date: + status_start_date = datetime.datetime.strptime(status_start_date, self._date_format)\ + .date() + status_end_date = document_el.attrib.get('status_end_date') + if status_end_date: + status_end_date = datetime.datetime\ + .strptime(status_end_date, self._date_format).date() + + document = Document( + files=files, + id=doc_id, + category=document_el.attrib.get('category'), + doctype=document_el.attrib.get('doctype'), + federal_level=document_el.attrib.get('federal_level'), + authority=document_el.attrib.get('authority'), + authority_url=document_el.attrib.get('authority_url'), + title=document_el.attrib.get('title'), + number=document_el.attrib.get('number'), + abbreviation=document_el.attrib.get('abbreviation'), + instance=document_el.attrib.get('instance'), + type=document_el.attrib.get('type'), + subtype=document_el.attrib.get('subtype'), + decree_date=decree_date, + enactment_date=enactment_date, + abrogation_date=abrogation_date, + cycle=document_el.attrib.get('cycle'), + municipality=document_el.attrib.get('municipality'), + index=document_el.attrib.get('index'), + status=document_el.attrib.get('status'), + status_start_date=status_start_date, + status_end_date=status_end_date, + language=document_el.attrib.get('language'), + language_link=language_link + ) + + assert isinstance(document, Document) + assert document.id is not None + + return document + + def _process_geolinks_prepublinks(self, geolink_prepublink_el): + """ + Processes a 'geolinks' or 'prepublinks' element. + + Args: + geolink_prepublink_el (lxml.etree._Element): element 'geolinks' or 'prepublinks' + + Return: + list[geolink_formatter.entity.Document]: list of documents + """ + language_link = geolink_prepublink_el.get('language') + + documents = list() + for document_el in geolink_prepublink_el.iter('document'): + documents.append(self._process_single_document(document_el, language_link)) + return documents + + def _filter_duplicated_documents(self, documents): + """ + Filters duplicated documents. + + Args: + documents (list[geolink_formatter.entity.Document]): list of documents + + Returns: + list[geolink_formatter.entity.Document]: filtered list of documents + """ + documents_filtered = list() + for document in documents: + if ( + [document.id, document.language_link] not in + [[doc.id, doc.language_link] for doc in documents_filtered] + ): + documents_filtered.append(document) + return documents_filtered + def from_string(self, xml): """Parses XML into internal structure. 
@@ -111,70 +231,17 @@ def from_string(self, xml): root = self._parse_xml(xml) documents = list() - for document_el in root.iter('document'): - doc_id = document_el.attrib.get('id') - doctype = document_el.attrib.get('doctype') - - # Mangle doc_id for notices. While IDs are unique between decrees - # and edicts, this is not the case when adding notices to the mix. - if doctype == 'notice': - doc_id += doctype - - if doc_id and doc_id not in [doc.id for doc in documents]: - files = list() - for file_el in document_el.iter('file'): - href = file_el.attrib.get('href') - if self.host_url and not href.startswith(u'http://') and not href.startswith(u'https://'): - href = u'{host}{href}'.format(host=self.host_url, href=href) - files.append(File( - title=file_el.attrib.get('title'), - description=file_el.attrib.get('description'), - href=href, - category=file_el.attrib.get('category') - )) - enactment_date = document_el.attrib.get('enactment_date') - if enactment_date: - enactment_date = datetime.datetime.strptime(enactment_date, self._date_format).date() - decree_date = document_el.attrib.get('decree_date') - if decree_date: - decree_date = datetime.datetime.strptime(decree_date, self._date_format).date() - abrogation_date = document_el.attrib.get('abrogation_date') - if abrogation_date: - abrogation_date = datetime.datetime.strptime(abrogation_date, self._date_format).date() - status_start_date = document_el.attrib.get('status_start_date') - if status_start_date: - status_start_date = datetime.datetime.strptime(status_start_date, self._date_format)\ - .date() - status_end_date = document_el.attrib.get('status_end_date') - if status_end_date: - status_end_date = datetime.datetime.strptime(status_end_date, self._date_format)\ - .date() - - documents.append(Document( - files=files, - id=doc_id, - category=document_el.attrib.get('category'), - doctype=document_el.attrib.get('doctype'), - federal_level=document_el.attrib.get('federal_level'), - authority=document_el.attrib.get('authority'), - authority_url=document_el.attrib.get('authority_url'), - title=document_el.attrib.get('title'), - number=document_el.attrib.get('number'), - abbreviation=document_el.attrib.get('abbreviation'), - instance=document_el.attrib.get('instance'), - type=document_el.attrib.get('type'), - subtype=document_el.attrib.get('subtype'), - decree_date=decree_date, - enactment_date=enactment_date, - abrogation_date=abrogation_date, - cycle=document_el.attrib.get('cycle'), - municipality=document_el.attrib.get('municipality'), - index=document_el.attrib.get('index'), - status=document_el.attrib.get('status'), - status_start_date=status_start_date, - status_end_date=status_end_date - )) + # evaluate root element's tag + if root.tag == 'multilang_geolinks': + for el in root.iter('geolinks', 'prepublinks'): + documents.extend(self._process_geolinks_prepublinks(el)) + elif root.tag in ['geolinks', 'prepublinks']: + documents.extend(self._process_geolinks_prepublinks(root)) + else: + raise RuntimeError('Unexpected tag name: {}'.format(root.tag)) + # filter documents (remove duplicates) + documents = self._filter_duplicated_documents(documents) return documents def from_url(self, url, params=None, **kwargs): diff --git a/geolink_formatter/schema/v1.2.5.xsd b/geolink_formatter/schema/v1.2.5.xsd new file mode 100644 index 00000000..7e61ad08 --- /dev/null +++ b/geolink_formatter/schema/v1.2.5.xsd @@ -0,0 +1,113 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/geolink_v1.2.5.xml b/tests/resources/geolink_v1.2.5.xml new file mode 100644 index 00000000..d2c800a3 --- /dev/null +++ b/tests/resources/geolink_v1.2.5.xml @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/geolink_v1.2.5_ml.xml b/tests/resources/geolink_v1.2.5_ml.xml new file mode 100644 index 00000000..d6f009b0 --- /dev/null +++ b/tests/resources/geolink_v1.2.5_ml.xml @@ -0,0 +1,336 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py index 24255af3..066cee5b 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,11 +1,13 @@ # -*- coding: utf-8 -*- import pytest import requests_mock -from lxml.etree import DocumentInvalid, _Element import xmlschema +from unittest.mock import patch +from lxml.etree import DocumentInvalid, _Element, Element, SubElement from requests import RequestException from geolink_formatter.parser import XML, SCHEMA +from geolink_formatter.entity import Document def test_xml_init(): @@ -351,6 +353,53 @@ def test_schema_version_1_2_4_faulty_geolink(): XML(version=SCHEMA.V1_2_4).from_url('http://oereblex.test.com/api/geolinks/1500.xml') +def test_schema_version_1_2_5(): + """ + test of schema version 1.2.5 + """ + with requests_mock.mock() as mock_m: + with open('tests/resources/geolink_v1.2.5.xml', 'rb') as file_f: + mock_m.get('http://oereblex.test.com/api/geolinks/1500.xml', content=file_f.read()) + documents = XML(version=SCHEMA.V1_2_5).from_url('http://oereblex.test.com/api/geolinks/1500.xml') + assert len(documents) == 12 + assert documents[0].index is None + assert documents[0].id == '400' + assert documents[-11].id == '390' + assert documents[-10].id == '17' + assert documents[-9].id == '18' + assert documents[-8].id == '34' + assert documents[-7].id == '19' + assert documents[-6].id == '23' + assert documents[-5].id == '24' + assert documents[-4].id == '5' + assert documents[-3].id == '11' + assert documents[-2].id == '13' + assert documents[-1].id == '14' + + +def test_schema_version_1_2_5_ml(): + """ + test of schema version 1.2.5 + """ + with requests_mock.mock() as mock_m: + with open('tests/resources/geolink_v1.2.5_ml.xml', 'rb') as file_f: + mock_m.get('http://oereblex.test.com/api/geolinks/1500.xml', content=file_f.read()) + documents = XML(version=SCHEMA.V1_2_5).from_url('http://oereblex.test.com/api/geolinks/1500.xml') + assert len(documents) == 36 + assert documents[0].id == '400' + assert documents[-11].id == '390' + assert documents[-10].id == '17' + assert documents[-9].id == '18' + assert documents[-8].id == '34' + assert documents[-7].id == '19' + assert documents[-6].id == '23' + assert documents[-5].id == '24' + assert documents[-4].id == '5' + assert documents[-3].id == '11' + assert documents[-2].id == '13' + assert documents[-1].id == '14' + + def test_default_version_with_locale(): with requests_mock.mock() as mock_m: with open('tests/resources/geolink_v1.2.1.xml', 'rb') as file_f: @@ -377,3 +426,119 
@@ def test_dtd_validation_invalid(): """ ) + + +@pytest.mark.parametrize('docs,exp_docs_filtered', [ + ( + [ + Document([], id=1, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=2, language_link='de')], + [ + Document([], id=1, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=2, language_link='de')] + ), + ( + [ + Document([], id=2, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=1, language_link='de'), + Document([], id=2, language_link='fr'), + Document([], id=2, language_link='de') + ], + [ + Document([], id=1, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=2, language_link='de'), + Document([], id=2, language_link='fr') + ] + ), + ( + [ + Document([], id=2, language_link='de'), + Document([], id=1, language_link=None), + Document([], id=2, language_link=None), + Document([], id=1, language_link=None) + ], + [ + Document([], id=2, language_link='de'), + Document([], id=1, language_link=None), + Document([], id=2, language_link=None), + ] + ) +]) +def test_filter_duplicated_documents(docs, exp_docs_filtered): + + result = XML()._filter_duplicated_documents(docs) + + result = sorted(result, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False) + + exp_docs_filtered = sorted( + exp_docs_filtered, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False) + + assert [[x.id, x.language_link] for x in result] == \ + [[x.id, x.language_link] for x in exp_docs_filtered] + + +@pytest.fixture() +def provide_geolinks_el(): + geolinks_el = Element('geolinks', attrib={'language': 'de'}) + SubElement(geolinks_el, 'document', attrib={'id': '1'}) + SubElement(geolinks_el, 'document', attrib={'id': '2'}) + yield geolinks_el + + +def test_process_geolinks_prepublinks(provide_geolinks_el): + with patch.object( + XML, + '_process_single_document', + return_value=Document([], id=1, language_link='de') + ): + result = XML()._process_geolinks_prepublinks(provide_geolinks_el) + assert all([isinstance(x, Document) for x in result]) + assert len(result) == 2 + + +@pytest.fixture() +def provide_document_el(): + document_el = Element( + 'document', + attrib={ + 'language': 'de', + 'authority': 'Gemeindeverwaltung', + 'authority_url': 'https://www.domleschg.ch', + 'category': 'main', + 'doctype': 'decree', + 'enactment_date': '2012-06-22', + 'federal_level': 'Gemeinde', + 'id': '400', + 'language': 'de', + 'municipality': 'Domleschg', + 'number': '12.GDEf1', + 'subtype': 'Domleschg (Paspels) 3634', + 'title': 'Quartierplan Radiend', + 'type': 'Nutzungsplanung - Quartierplanverfahren' + } + ) + SubElement(document_el, 'file', attrib={ + 'category': 'main', + 'description': '3634_B_QP_Radiend_Platzhalter.pdf', + 'href': '/api/attachments/1123', + 'title': '3634_B_QP_Radiend_Platzhalter.pdf' + }) + SubElement(document_el, 'file', attrib={ + 'category': 'additional', + 'description': '', + 'href': '/api/attachments/6027', + 'title': 'PlatzhalterFehlendeDokumente.pdf' + }) + + yield document_el + + +def test_process_single_document(provide_document_el): + document = XML()._process_single_document(provide_document_el, language_link='de') + assert document.id == '400' + assert document.language_link == 'de' + assert document.files[1].title == 'PlatzhalterFehlendeDokumente.pdf' From 6cf507ab8325bd59186101ecd7366159104d5d69 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 20 Nov 2024 13:55:20 +0100 Subject: [PATCH 2/5] change name of 
attribute "language" to "language_document" --- geolink_formatter/entity.py | 10 +++++----- geolink_formatter/parser.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/geolink_formatter/entity.py b/geolink_formatter/entity.py index 6e1d0ba0..02722b63 100644 --- a/geolink_formatter/entity.py +++ b/geolink_formatter/entity.py @@ -13,7 +13,7 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No authority_url=None, title=None, number=None, abbreviation=None, instance=None, type=None, subtype=None, decree_date=None, enactment_date=None, abrogation_date=None, cycle=None, municipality=None, index=None, status=None, status_start_date=None, status_end_date=None, - language=None, language_link=None): + language_document=None, language_link=None): """Creates a new document instance. Args: @@ -39,7 +39,7 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No status (str or None): The status of the prebublication. status_start_date (datetime.date or None): Start date of the status. status_end_date (datetime.date or None): End date of the status. - language (str or None): Language of the document. + language_document (str or None): Language of the document. language_link (str or None): Language of the geolink/prepublink collection. Raises: @@ -112,7 +112,7 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No self._status = status self._status_start_date = status_start_date self._status_end_date = status_end_date - self._language = language + self._language_document = language_document self._language_link = language_link @property @@ -226,9 +226,9 @@ def status_end_date(self): return self._status_end_date @property - def language(self): + def language_document(self): """str: Language of the document (since v1.2.5).""" - return self._language + return self._language_document @property def language_link(self): diff --git a/geolink_formatter/parser.py b/geolink_formatter/parser.py index 65949728..3056c858 100644 --- a/geolink_formatter/parser.py +++ b/geolink_formatter/parser.py @@ -169,7 +169,7 @@ def _process_single_document(self, document_el, language_link): status=document_el.attrib.get('status'), status_start_date=status_start_date, status_end_date=status_end_date, - language=document_el.attrib.get('language'), + language_document=document_el.attrib.get('language'), language_link=language_link ) From 5e5e1b0dfc4fa7ff8fce58f356eede4c3d7a4030 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 20 Nov 2024 14:25:06 +0100 Subject: [PATCH 3/5] resolve pylint issues --- geolink_formatter/parser.py | 50 ++++++++++---------------------- geolink_formatter/utils.py | 20 +++++++++++++ tests/test_parser.py | 53 ---------------------------------- tests/test_utils.py | 57 +++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 88 deletions(-) create mode 100644 geolink_formatter/utils.py create mode 100644 tests/test_utils.py diff --git a/geolink_formatter/parser.py b/geolink_formatter/parser.py index 3056c858..ee9ed4ea 100644 --- a/geolink_formatter/parser.py +++ b/geolink_formatter/parser.py @@ -1,10 +1,11 @@ # -*- coding: utf-8 -*- import datetime -from importlib import resources import requests +import importlib from lxml.etree import DTD, DocumentInvalid, fromstring from xmlschema import XMLSchema11 from geolink_formatter.entity import Document, File +from geolink_formatter.utils import filter_duplicated_documents class SCHEMA(object): @@ -60,10 +61,10 @@ def __init__(self, 
host_url=None, version='1.2.5', dtd_validation=False, xsd_val self._version = version self._dtd_validation = dtd_validation self._xsd_validation = xsd_validation - xsd = resources.files('geolink_formatter') / 'schema' / 'v{0}.xsd'.format(version) + xsd = importlib.resources.files('geolink_formatter') / 'schema' / f'v{version}.xsd' if self._xsd_validation: - with xsd.open(mode='r', encoding='utf-8') as f: - self._schema = XMLSchema11(f.read()) + with xsd.open(mode='r', encoding='utf-8') as xsd_f: + self._schema = XMLSchema11(xsd_f.read()) @property def host_url(self): @@ -117,11 +118,11 @@ def _process_single_document(self, document_el, language_link): if doctype == 'notice': doc_id += doctype - files = list() + files = [] for file_el in document_el.iter('file'): href = file_el.attrib.get('href') - if self.host_url and not href.startswith(u'http://') and not href.startswith(u'https://'): - href = u'{host}{href}'.format(host=self.host_url, href=href) + if self.host_url and not href.startswith(u'http://') and not href.startswith('https://'): + href = f'{self.host_url}{href}' files.append(File( title=file_el.attrib.get('title'), description=file_el.attrib.get('description'), @@ -173,9 +174,6 @@ def _process_single_document(self, document_el, language_link): language_link=language_link ) - assert isinstance(document, Document) - assert document.id is not None - return document def _process_geolinks_prepublinks(self, geolink_prepublink_el): @@ -190,30 +188,11 @@ def _process_geolinks_prepublinks(self, geolink_prepublink_el): """ language_link = geolink_prepublink_el.get('language') - documents = list() + documents = [] for document_el in geolink_prepublink_el.iter('document'): documents.append(self._process_single_document(document_el, language_link)) return documents - def _filter_duplicated_documents(self, documents): - """ - Filters duplicated documents. - - Args: - documents (list[geolink_formatter.entity.Document]): list of documents - - Returns: - list[geolink_formatter.entity.Document]: filtered list of documents - """ - documents_filtered = list() - for document in documents: - if ( - [document.id, document.language_link] not in - [[doc.id, doc.language_link] for doc in documents_filtered] - ): - documents_filtered.append(document) - return documents_filtered - def from_string(self, xml): """Parses XML into internal structure. @@ -229,19 +208,20 @@ def from_string(self, xml): lxml.etree.XMLSyntaxError: Raised on failed validation. 
""" root = self._parse_xml(xml) - documents = list() + documents = [] # evaluate root element's tag if root.tag == 'multilang_geolinks': - for el in root.iter('geolinks', 'prepublinks'): - documents.extend(self._process_geolinks_prepublinks(el)) + for elem in root.iter('geolinks', 'prepublinks'): + documents.extend(self._process_geolinks_prepublinks(elem)) elif root.tag in ['geolinks', 'prepublinks']: documents.extend(self._process_geolinks_prepublinks(root)) else: - raise RuntimeError('Unexpected tag name: {}'.format(root.tag)) + raise RuntimeError(f'Unexpected tag name: {root.tag}') # filter documents (remove duplicates) - documents = self._filter_duplicated_documents(documents) + documents = filter_duplicated_documents(documents) + return documents def from_url(self, url, params=None, **kwargs): diff --git a/geolink_formatter/utils.py b/geolink_formatter/utils.py new file mode 100644 index 00000000..8f740ca4 --- /dev/null +++ b/geolink_formatter/utils.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +def filter_duplicated_documents(documents): + """ + Filters duplicated documents. + + Args: + documents (list[geolink_formatter.entity.Document]): list of documents + + Returns: + list[geolink_formatter.entity.Document]: filtered list of documents + """ + documents_filtered = list() + for document in documents: + if ( + [document.id, document.language_link] + not in [[doc.id, doc.language_link] for doc in documents_filtered] + ): + documents_filtered.append(document) + return documents_filtered diff --git a/tests/test_parser.py b/tests/test_parser.py index 066cee5b..ca566aac 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -428,59 +428,6 @@ def test_dtd_validation_invalid(): ) -@pytest.mark.parametrize('docs,exp_docs_filtered', [ - ( - [ - Document([], id=1, language_link='de'), - Document([], id=1, language_link='fr'), - Document([], id=2, language_link='de')], - [ - Document([], id=1, language_link='de'), - Document([], id=1, language_link='fr'), - Document([], id=2, language_link='de')] - ), - ( - [ - Document([], id=2, language_link='de'), - Document([], id=1, language_link='fr'), - Document([], id=1, language_link='de'), - Document([], id=2, language_link='fr'), - Document([], id=2, language_link='de') - ], - [ - Document([], id=1, language_link='de'), - Document([], id=1, language_link='fr'), - Document([], id=2, language_link='de'), - Document([], id=2, language_link='fr') - ] - ), - ( - [ - Document([], id=2, language_link='de'), - Document([], id=1, language_link=None), - Document([], id=2, language_link=None), - Document([], id=1, language_link=None) - ], - [ - Document([], id=2, language_link='de'), - Document([], id=1, language_link=None), - Document([], id=2, language_link=None), - ] - ) -]) -def test_filter_duplicated_documents(docs, exp_docs_filtered): - - result = XML()._filter_duplicated_documents(docs) - - result = sorted(result, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False) - - exp_docs_filtered = sorted( - exp_docs_filtered, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False) - - assert [[x.id, x.language_link] for x in result] == \ - [[x.id, x.language_link] for x in exp_docs_filtered] - - @pytest.fixture() def provide_geolinks_el(): geolinks_el = Element('geolinks', attrib={'language': 'de'}) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..d88053c1 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +import pytest +from 
geolink_formatter.utils import filter_duplicated_documents +from geolink_formatter.entity import Document + + +@pytest.mark.parametrize('docs,exp_docs_filtered', [ + ( + [ + Document([], id=1, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=2, language_link='de')], + [ + Document([], id=1, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=2, language_link='de')] + ), + ( + [ + Document([], id=2, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=1, language_link='de'), + Document([], id=2, language_link='fr'), + Document([], id=2, language_link='de') + ], + [ + Document([], id=1, language_link='de'), + Document([], id=1, language_link='fr'), + Document([], id=2, language_link='de'), + Document([], id=2, language_link='fr') + ] + ), + ( + [ + Document([], id=2, language_link='de'), + Document([], id=1, language_link=None), + Document([], id=2, language_link=None), + Document([], id=1, language_link=None) + ], + [ + Document([], id=2, language_link='de'), + Document([], id=1, language_link=None), + Document([], id=2, language_link=None), + ] + ) +]) +def test_filter_duplicated_documents(docs, exp_docs_filtered): + + result = filter_duplicated_documents(docs) + + result = sorted(result, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False) + + exp_docs_filtered = sorted( + exp_docs_filtered, key=lambda x: (x.id, (x.language_link is None, x.language_link)), reverse=False) + + assert [[x.id, x.language_link] for x in result] == \ + [[x.id, x.language_link] for x in exp_docs_filtered] From 008188927d0daf699a656704528b74fd80202c39 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 20 Nov 2024 14:58:06 +0100 Subject: [PATCH 4/5] resolve pylint issues --- geolink_formatter/entity.py | 1 + geolink_formatter/format.py | 1 + geolink_formatter/parser.py | 7 ++++--- geolink_formatter/utils.py | 4 +++- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/geolink_formatter/entity.py b/geolink_formatter/entity.py index 02722b63..c222df0d 100644 --- a/geolink_formatter/entity.py +++ b/geolink_formatter/entity.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +""" Provides the Msg, Document and File classes. """ import datetime diff --git a/geolink_formatter/format.py b/geolink_formatter/format.py index c1eb8d79..c2f1cf82 100644 --- a/geolink_formatter/format.py +++ b/geolink_formatter/format.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +""" Provides class HTML for rendering the xml document from the geolink api as html. """ from datetime import date diff --git a/geolink_formatter/parser.py b/geolink_formatter/parser.py index ee9ed4ea..3af94d59 100644 --- a/geolink_formatter/parser.py +++ b/geolink_formatter/parser.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- +""" Provides classes for parsing the xml document from the geolink api. 
""" +from importlib import resources as impresources import datetime import requests -import importlib from lxml.etree import DTD, DocumentInvalid, fromstring from xmlschema import XMLSchema11 from geolink_formatter.entity import Document, File @@ -61,7 +62,7 @@ def __init__(self, host_url=None, version='1.2.5', dtd_validation=False, xsd_val self._version = version self._dtd_validation = dtd_validation self._xsd_validation = xsd_validation - xsd = importlib.resources.files('geolink_formatter') / 'schema' / f'v{version}.xsd' + xsd = impresources.files('geolink_formatter') / 'schema' / f'v{version}.xsd' if self._xsd_validation: with xsd.open(mode='r', encoding='utf-8') as xsd_f: self._schema = XMLSchema11(xsd_f.read()) @@ -121,7 +122,7 @@ def _process_single_document(self, document_el, language_link): files = [] for file_el in document_el.iter('file'): href = file_el.attrib.get('href') - if self.host_url and not href.startswith(u'http://') and not href.startswith('https://'): + if self.host_url and not href.startswith('http://') and not href.startswith('https://'): href = f'{self.host_url}{href}' files.append(File( title=file_el.attrib.get('title'), diff --git a/geolink_formatter/utils.py b/geolink_formatter/utils.py index 8f740ca4..1d7bc083 100644 --- a/geolink_formatter/utils.py +++ b/geolink_formatter/utils.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +""" Provides additional methods usable for parsing the xml document from the geolink api. """ + def filter_duplicated_documents(documents): """ @@ -10,7 +12,7 @@ def filter_duplicated_documents(documents): Returns: list[geolink_formatter.entity.Document]: filtered list of documents """ - documents_filtered = list() + documents_filtered = [] for document in documents: if ( [document.id, document.language_link] From fc4c63f90a9fb798baec9c47de1763444366e06e Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 4 Dec 2024 09:28:20 +0100 Subject: [PATCH 5/5] add tag name / add tests for prepublinks v1.2.5 --- geolink_formatter/parser.py | 2 +- tests/resources/prepublink_v1.2.5.xml | 43 ++++++++++++++++++++ tests/resources/prepublink_v1.2.5_ml.xml | 45 ++++++++++++++++++++ tests/test_parser.py | 52 +++++++++++++++++++++++- 4 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 tests/resources/prepublink_v1.2.5.xml create mode 100644 tests/resources/prepublink_v1.2.5_ml.xml diff --git a/geolink_formatter/parser.py b/geolink_formatter/parser.py index 3af94d59..58b3f3eb 100644 --- a/geolink_formatter/parser.py +++ b/geolink_formatter/parser.py @@ -212,7 +212,7 @@ def from_string(self, xml): documents = [] # evaluate root element's tag - if root.tag == 'multilang_geolinks': + if root.tag in ['multilang_geolinks', 'multilang_prepublinks']: for elem in root.iter('geolinks', 'prepublinks'): documents.extend(self._process_geolinks_prepublinks(elem)) elif root.tag in ['geolinks', 'prepublinks']: diff --git a/tests/resources/prepublink_v1.2.5.xml b/tests/resources/prepublink_v1.2.5.xml new file mode 100644 index 00000000..daa9a353 --- /dev/null +++ b/tests/resources/prepublink_v1.2.5.xml @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/resources/prepublink_v1.2.5_ml.xml b/tests/resources/prepublink_v1.2.5_ml.xml new file mode 100644 index 00000000..4a9a02b3 --- /dev/null +++ b/tests/resources/prepublink_v1.2.5_ml.xml @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py index ca566aac..5dadb050 100644 --- 
a/tests/test_parser.py +++ b/tests/test_parser.py @@ -375,11 +375,13 @@ def test_schema_version_1_2_5(): assert documents[-3].id == '11' assert documents[-2].id == '13' assert documents[-1].id == '14' + assert documents[0].language_document == 'de' + assert documents[0].language_link == 'de' def test_schema_version_1_2_5_ml(): """ - test of schema version 1.2.5 + test of schema version 1.2.5 multilang """ with requests_mock.mock() as mock_m: with open('tests/resources/geolink_v1.2.5_ml.xml', 'rb') as file_f: @@ -398,6 +400,54 @@ def test_schema_version_1_2_5_ml(): assert documents[-3].id == '11' assert documents[-2].id == '13' assert documents[-1].id == '14' + assert documents[0].language_document == 'de' + assert documents[0].language_link == 'de' + assert documents[2].language_document == 'de' + assert documents[2].language_link == 'de' + assert documents[12].language_document == 'de' + assert documents[12].language_link == 'rm' + assert documents[14].language_document == 'rm' + assert documents[14].language_link == 'rm' + assert documents[24].language_document == 'de' + assert documents[24].language_link == 'it' + assert documents[26].language_document == 'it' + assert documents[26].language_link == 'it' + + +def test_schema_version_1_2_5_prepublink(): + """ + test of schema version 1.2.5: prepublink + """ + with requests_mock.mock() as mock_m: + with open('tests/resources/prepublink_v1.2.5.xml', 'rb') as file_f: + mock_m.get('http://oereblex.test.com/api/prepubs/1500.xml', content=file_f.read()) + documents = XML(version=SCHEMA.V1_2_5).from_url('http://oereblex.test.com/api/prepubs/1500.xml') + assert len(documents) == 5 + assert documents[0].index is None + assert documents[1].index is None + assert documents[-3].index == 731 + assert documents[-2].index == 741 + assert documents[-1].index == 10 + assert documents[0].language_document == 'de' + assert documents[0].language_link == 'de' + + +def test_schema_version_1_2_5_prepublink_ml(): + """ + test of schema version 1.2.5: prepublink multilang + """ + with requests_mock.mock() as mock_m: + with open('tests/resources/prepublink_v1.2.5_ml.xml', 'rb') as file_f: + mock_m.get('http://oereblex.test.com/api/prepubs/1500.xml', content=file_f.read()) + documents = XML(version=SCHEMA.V1_2_5).from_url('http://oereblex.test.com/api/prepubs/1500.xml') + assert len(documents) == 5 + assert documents[0].index is None + assert documents[1].index is None + assert documents[-3].index == 731 + assert documents[-2].index == 741 + assert documents[-1].index == 10 + assert documents[0].language_document == 'de' + assert documents[0].language_link == 'de' def test_default_version_with_locale():
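
A minimal usage sketch of what this series enables, for anyone trying the branch locally: fetch a v1.2.5 (possibly multilang) geolink and read the new language_document/language_link properties. The host URL and geolink ID below are placeholders; XML, SCHEMA.V1_2_5, from_url(), Document.language_document and Document.language_link are the names added or used by these patches, and the duplicate filtering is the filter_duplicated_documents() helper moved to geolink_formatter.utils in PATCH 3/5.

# Sketch only: assumes an oereblex instance that already serves API v1.2.5.
from geolink_formatter.parser import XML, SCHEMA

parser = XML(host_url='https://oereblex.example.com', version=SCHEMA.V1_2_5)

# from_string()/from_url() now accept 'geolinks', 'prepublinks' and their
# multilang wrappers; documents with the same id within the same link
# language are dropped by geolink_formatter.utils.filter_duplicated_documents().
documents = parser.from_url('https://oereblex.example.com/api/geolinks/1500.xml')

for document in documents:
    # language_document comes from the document element's 'language' attribute,
    # language_link from the enclosing geolinks/prepublinks element.
    print(document.id, document.language_document, document.language_link)
    for f in document.files:
        # relative hrefs are prefixed with host_url by the parser
        print('   ', f.title, f.href)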