Skip to content

Commit

Permalink
Merge pull request #345 from openoereb/330-oereblex-api-version-1-2-5
Browse files Browse the repository at this point in the history
add support for oereblex api version 1.2.5
  • Loading branch information
michmuel authored Dec 13, 2024
2 parents a27645c + fc4c63f commit ab23b28
Show file tree
Hide file tree
Showing 12 changed files with 1,026 additions and 76 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Changelog
=========
Change log
==========

unreleased
----------

Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4 (default)
Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4, v1.2.5 (default)

- Drop support for Python 3.8
- Dependency updates
Expand Down
18 changes: 17 additions & 1 deletion geolink_formatter/entity.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
""" Provides the Msg, Document and File classes. """
import datetime


Expand All @@ -12,7 +13,8 @@ class Document(object):
def __init__(self, files, id=None, category=None, doctype=None, federal_level=None, authority=None,
authority_url=None, title=None, number=None, abbreviation=None, instance=None, type=None,
subtype=None, decree_date=None, enactment_date=None, abrogation_date=None, cycle=None,
municipality=None, index=None, status=None, status_start_date=None, status_end_date=None):
municipality=None, index=None, status=None, status_start_date=None, status_end_date=None,
language_document=None, language_link=None):
"""Creates a new document instance.
Args:
Expand All @@ -38,6 +40,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No
status (str or None): The status of the prepublication.
status_start_date (datetime.date or None): Start date of the status.
status_end_date (datetime.date or None): End date of the status.
language_document (str or None): Language of the document.
language_link (str or None): Language of the geolink/prepublink collection.
Raises:
TypeError: Raised on missing argument or invalid argument type.
Expand Down Expand Up @@ -109,6 +113,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No
self._status = status
self._status_start_date = status_start_date
self._status_end_date = status_end_date
self._language_document = language_document
self._language_link = language_link

@property
def files(self):
Expand Down Expand Up @@ -220,6 +226,16 @@ def status_end_date(self):
"""datetime.date: End date of the status (since v1.2.2)."""
return self._status_end_date

@property
def language_document(self):
    """str or None: Language of the document itself (since v1.2.5).

    Returns whatever was passed as ``language_document`` when the instance was created.
    """
    return self._language_document

@property
def language_link(self):
    """str or None: Language of the geolink or prepublink collection (since v1.2.5).

    Returns whatever was passed as ``language_link`` when the instance was created.
    """
    return self._language_link


class File(object):
def __init__(self, category=None, href=None, title=None, description=None):
Expand Down
1 change: 1 addition & 0 deletions geolink_formatter/format.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
""" Provides class HTML for rendering the xml document from the geolink api as html. """
from datetime import date


Expand Down
190 changes: 119 additions & 71 deletions geolink_formatter/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# -*- coding: utf-8 -*-
""" Provides classes for parsing the xml document from the geolink api. """
from importlib import resources as impresources
import datetime
import pkg_resources
import requests
from lxml.etree import DTD, DocumentInvalid, fromstring
from xmlschema import XMLSchema11
from geolink_formatter.entity import Document, File
from geolink_formatter.utils import filter_duplicated_documents


class SCHEMA(object):
Expand Down Expand Up @@ -32,15 +34,18 @@ class SCHEMA(object):
"""str: geoLink schema version 1.2.3"""

V1_2_4 = '1.2.4'
"""str: geoLink schema version 1.2.3"""
"""str: geoLink schema version 1.2.4"""

V1_2_5 = '1.2.5'
"""str: geoLink schema version 1.2.5"""


class XML(object):

_date_format = '%Y-%m-%d'
"""str: Format of date values in XML."""

def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_validation=True):
def __init__(self, host_url=None, version='1.2.5', dtd_validation=False, xsd_validation=True):
"""Create a new XML parser instance containing the geoLink XSD for validation.
Args:
Expand All @@ -57,10 +62,10 @@ def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_val
self._version = version
self._dtd_validation = dtd_validation
self._xsd_validation = xsd_validation
xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version))
xsd = impresources.files('geolink_formatter') / 'schema' / f'v{version}.xsd'
if self._xsd_validation:
with open(xsd, encoding='utf-8') as f:
self._schema = XMLSchema11(f.read())
with xsd.open(mode='r', encoding='utf-8') as xsd_f:
self._schema = XMLSchema11(xsd_f.read())

@property
def host_url(self):
Expand Down Expand Up @@ -94,6 +99,101 @@ def _parse_xml(self, xml):
raise DocumentInvalid('Missing DTD in parsed content')
return content

def _process_single_document(self, document_el, language_link):
    """
    Processes a single document element.

    Args:
        document_el (lxml.etree._Element): element 'document'
        language_link (str or None): language of the documents set; stored unchanged on
            the resulting document as ``language_link``

    Returns:
        geolink_formatter.entity.Document: document

    Raises:
        ValueError: Raised by ``strptime`` if a non-empty date attribute does not match
            the date format of this parser.
    """
    attrib = document_el.attrib

    def parse_date(name):
        # Missing or empty attributes are handed through untouched (None or '');
        # only real values are converted to datetime.date.
        raw = attrib.get(name)
        if not raw:
            return raw
        return datetime.datetime.strptime(raw, self._date_format).date()

    doc_id = attrib.get('id')
    doctype = attrib.get('doctype')

    # Mangle doc_id for notices. While IDs are unique between decrees
    # and edicts, this is not the case when adding notices to the mix.
    # A notice without an id keeps id=None instead of failing on None + str.
    if doctype == 'notice' and doc_id is not None:
        doc_id += doctype

    files = []
    for file_el in document_el.iter('file'):
        href = file_el.attrib.get('href')
        # Only relative links get the host prefix. A missing href stays None,
        # exactly as it already does when no host URL is configured.
        if self.host_url and href is not None and not href.startswith(('http://', 'https://')):
            href = f'{self.host_url}{href}'
        files.append(File(
            title=file_el.attrib.get('title'),
            description=file_el.attrib.get('description'),
            href=href,
            category=file_el.attrib.get('category')
        ))

    # Dates are parsed up front, in a fixed order, so an invalid value raises
    # before any Document is constructed.
    enactment_date = parse_date('enactment_date')
    decree_date = parse_date('decree_date')
    abrogation_date = parse_date('abrogation_date')
    status_start_date = parse_date('status_start_date')
    status_end_date = parse_date('status_end_date')

    return Document(
        files=files,
        id=doc_id,
        category=attrib.get('category'),
        doctype=doctype,
        federal_level=attrib.get('federal_level'),
        authority=attrib.get('authority'),
        authority_url=attrib.get('authority_url'),
        title=attrib.get('title'),
        number=attrib.get('number'),
        abbreviation=attrib.get('abbreviation'),
        instance=attrib.get('instance'),
        type=attrib.get('type'),
        subtype=attrib.get('subtype'),
        decree_date=decree_date,
        enactment_date=enactment_date,
        abrogation_date=abrogation_date,
        cycle=attrib.get('cycle'),
        municipality=attrib.get('municipality'),
        index=attrib.get('index'),
        status=attrib.get('status'),
        status_start_date=status_start_date,
        status_end_date=status_end_date,
        language_document=attrib.get('language'),
        language_link=language_link
    )

def _process_geolinks_prepublinks(self, geolink_prepublink_el):
"""
Processes a 'geolinks' or 'prepublinks' element.
Args:
geolink_prepublink_el (lxml.etree._Element): element 'geolinks' or 'prepublinks'
Return:
list[geolink_formatter.entity.Document]: list of documents
"""
language_link = geolink_prepublink_el.get('language')

documents = []
for document_el in geolink_prepublink_el.iter('document'):
documents.append(self._process_single_document(document_el, language_link))
return documents

def from_string(self, xml):
"""Parses XML into internal structure.
Expand All @@ -109,71 +209,19 @@ def from_string(self, xml):
lxml.etree.XMLSyntaxError: Raised on failed validation.
"""
root = self._parse_xml(xml)
documents = list()

for document_el in root.iter('document'):
doc_id = document_el.attrib.get('id')
doctype = document_el.attrib.get('doctype')

# Mangle doc_id for notices. While IDs are unique between decrees
# and edicts, this is not the case when adding notices to the mix.
if doctype == 'notice':
doc_id += doctype

if doc_id and doc_id not in [doc.id for doc in documents]:
files = list()
for file_el in document_el.iter('file'):
href = file_el.attrib.get('href')
if self.host_url and not href.startswith(u'http://') and not href.startswith(u'https://'):
href = u'{host}{href}'.format(host=self.host_url, href=href)
files.append(File(
title=file_el.attrib.get('title'),
description=file_el.attrib.get('description'),
href=href,
category=file_el.attrib.get('category')
))
enactment_date = document_el.attrib.get('enactment_date')
if enactment_date:
enactment_date = datetime.datetime.strptime(enactment_date, self._date_format).date()
decree_date = document_el.attrib.get('decree_date')
if decree_date:
decree_date = datetime.datetime.strptime(decree_date, self._date_format).date()
abrogation_date = document_el.attrib.get('abrogation_date')
if abrogation_date:
abrogation_date = datetime.datetime.strptime(abrogation_date, self._date_format).date()
status_start_date = document_el.attrib.get('status_start_date')
if status_start_date:
status_start_date = datetime.datetime.strptime(status_start_date, self._date_format)\
.date()
status_end_date = document_el.attrib.get('status_end_date')
if status_end_date:
status_end_date = datetime.datetime.strptime(status_end_date, self._date_format)\
.date()

documents.append(Document(
files=files,
id=doc_id,
category=document_el.attrib.get('category'),
doctype=document_el.attrib.get('doctype'),
federal_level=document_el.attrib.get('federal_level'),
authority=document_el.attrib.get('authority'),
authority_url=document_el.attrib.get('authority_url'),
title=document_el.attrib.get('title'),
number=document_el.attrib.get('number'),
abbreviation=document_el.attrib.get('abbreviation'),
instance=document_el.attrib.get('instance'),
type=document_el.attrib.get('type'),
subtype=document_el.attrib.get('subtype'),
decree_date=decree_date,
enactment_date=enactment_date,
abrogation_date=abrogation_date,
cycle=document_el.attrib.get('cycle'),
municipality=document_el.attrib.get('municipality'),
index=document_el.attrib.get('index'),
status=document_el.attrib.get('status'),
status_start_date=status_start_date,
status_end_date=status_end_date
))
documents = []

# evaluate root element's tag
if root.tag in ['multilang_geolinks', 'multilang_prepublinks']:
for elem in root.iter('geolinks', 'prepublinks'):
documents.extend(self._process_geolinks_prepublinks(elem))
elif root.tag in ['geolinks', 'prepublinks']:
documents.extend(self._process_geolinks_prepublinks(root))
else:
raise RuntimeError(f'Unexpected tag name: {root.tag}')

# filter documents (remove duplicates)
documents = filter_duplicated_documents(documents)

return documents

Expand Down
Loading

0 comments on commit ab23b28

Please sign in to comment.