Skip to content

Commit

Permalink
add support for oereblex api version 1.2.5
Browse files Browse the repository at this point in the history
  • Loading branch information
michmuel committed Nov 20, 2024
1 parent 42993ae commit 7c99d46
Show file tree
Hide file tree
Showing 7 changed files with 876 additions and 73 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Changelog
=========
Change log
==========

unreleased
----------

Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4 (default)
Supported GEO-Link API versions: v1.0.0, v1.1.0, v1.1.1, v1.2.0, v1.2.1, v1.2.2, v1.2.3, v1.2.4, v1.2.5 (default)

- Drop support for Python 3.8
- Dependency updates
Expand Down
17 changes: 16 additions & 1 deletion geolink_formatter/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ class Document(object):
def __init__(self, files, id=None, category=None, doctype=None, federal_level=None, authority=None,
authority_url=None, title=None, number=None, abbreviation=None, instance=None, type=None,
subtype=None, decree_date=None, enactment_date=None, abrogation_date=None, cycle=None,
municipality=None, index=None, status=None, status_start_date=None, status_end_date=None):
municipality=None, index=None, status=None, status_start_date=None, status_end_date=None,
language=None, language_link=None):
"""Creates a new document instance.
Args:
Expand All @@ -38,6 +39,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No
status (str or None): The status of the prepublication.
status_start_date (datetime.date or None): Start date of the status.
status_end_date (datetime.date or None): End date of the status.
language (str or None): Language of the document.
language_link (str or None): Language of the geolink/prepublink collection.
Raises:
TypeError: Raised on missing argument or invalid argument type.
Expand Down Expand Up @@ -109,6 +112,8 @@ def __init__(self, files, id=None, category=None, doctype=None, federal_level=No
self._status = status
self._status_start_date = status_start_date
self._status_end_date = status_end_date
self._language = language
self._language_link = language_link

@property
def files(self):
Expand Down Expand Up @@ -220,6 +225,16 @@ def status_end_date(self):
"""datetime.date: End date of the status (since v1.2.2)."""
return self._status_end_date

@property
def language(self):
"""str or None: Language of the document, ``None`` if not provided (since v1.2.5)."""
return self._language

@property
def language_link(self):
"""str or None: Language of the geolinks or prepublinks element the document was
listed in, ``None`` if not provided (since v1.2.5)."""
return self._language_link


class File(object):
def __init__(self, category=None, href=None, title=None, description=None):
Expand Down
203 changes: 135 additions & 68 deletions geolink_formatter/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
import datetime
import pkg_resources
from importlib import resources

Check warning

Code scanning / Pylint (reported by Codacy)

No name 'resources' in module 'importlib' Warning

No name 'resources' in module 'importlib'
import requests
from lxml.etree import DTD, DocumentInvalid, fromstring
from xmlschema import XMLSchema11
Expand Down Expand Up @@ -32,15 +32,18 @@ class SCHEMA(object):
"""str: geoLink schema version 1.2.3"""

V1_2_4 = '1.2.4'
"""str: geoLink schema version 1.2.3"""
"""str: geoLink schema version 1.2.4"""

V1_2_5 = '1.2.5'
"""str: geoLink schema version 1.2.5"""


class XML(object):

_date_format = '%Y-%m-%d'
"""str: Format of date values in XML."""

def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_validation=True):
def __init__(self, host_url=None, version='1.2.5', dtd_validation=False, xsd_validation=True):
"""Create a new XML parser instance containing the geoLink XSD for validation.
Args:
Expand All @@ -57,9 +60,9 @@ def __init__(self, host_url=None, version='1.2.4', dtd_validation=False, xsd_val
self._version = version
self._dtd_validation = dtd_validation
self._xsd_validation = xsd_validation
xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version))
xsd = resources.files('geolink_formatter') / 'schema' / 'v{0}.xsd'.format(version)

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Formatting a regular string which could be an f-string Warning

Formatting a regular string which could be an f-string

Check warning

Code scanning / Prospector (reported by Codacy)

Formatting a regular string which could be a f-string (consider-using-f-string) Warning

Formatting a regular string which could be a f-string (consider-using-f-string)
if self._xsd_validation:
with open(xsd, encoding='utf-8') as f:
with xsd.open(mode='r', encoding='utf-8') as f:

Check warning

Code scanning / Pylint (reported by Codacy)

Variable name "f" doesn't conform to snake_case naming style Warning

Variable name "f" doesn't conform to snake_case naming style
self._schema = XMLSchema11(f.read())

@property
Expand Down Expand Up @@ -94,6 +97,123 @@ def _parse_xml(self, xml):
raise DocumentInvalid('Missing DTD in parsed content')
return content

def _process_single_document(self, document_el, language_link):
    """Build a document entity from a single ``document`` element.

    Args:
        document_el (lxml.etree._Element): The ``document`` element to convert.
        language_link (str or None): Language of the enclosing ``geolinks`` or
            ``prepublinks`` element; stored on the document as ``language_link``.

    Returns:
        geolink_formatter.entity.Document: The document described by the element.

    Raises:
        AssertionError: If the resulting document has no ID.
        TypeError: If a notice element has no ``id`` attribute (the ID cannot be mangled).
        ValueError: If a date attribute does not match ``self._date_format``.
    """
    attrib = document_el.attrib

    def parse_date(name):
        # Date attributes are optional: a missing or empty value is passed through
        # unchanged, anything else must match the configured date format.
        value = attrib.get(name)
        if value:
            return datetime.datetime.strptime(value, self._date_format).date()
        return value

    doc_id = attrib.get('id')

    # Mangle doc_id for notices. While IDs are unique between decrees
    # and edicts, this is not the case when adding notices to the mix.
    if attrib.get('doctype') == 'notice':
        doc_id += 'notice'

    files = []
    for file_el in document_el.iter('file'):
        href = file_el.attrib.get('href')
        # Relative links are prefixed with the configured host; absolute ones are kept.
        if self.host_url and not href.startswith(('http://', 'https://')):
            href = f'{self.host_url}{href}'
        files.append(File(
            title=file_el.attrib.get('title'),
            description=file_el.attrib.get('description'),
            href=href,
            category=file_el.attrib.get('category')
        ))

    # Parsed in the same order as before, so the first invalid date reported is unchanged.
    dates = {
        name: parse_date(name)
        for name in (
            'enactment_date',
            'decree_date',
            'abrogation_date',
            'status_start_date',
            'status_end_date',
        )
    }

    # Attributes that are copied verbatim from the element onto the document.
    plain = {
        name: attrib.get(name)
        for name in (
            'category',
            'doctype',
            'federal_level',
            'authority',
            'authority_url',
            'title',
            'number',
            'abbreviation',
            'instance',
            'type',
            'subtype',
            'cycle',
            'municipality',
            'index',
            'status',
            'language',
        )
    }

    document = Document(
        files=files,
        id=doc_id,
        language_link=language_link,
        **plain,
        **dates
    )

    # Raised explicitly instead of using ``assert`` so the check is not stripped
    # when Python runs with optimisations (-O); the exception type is unchanged.
    if document.id is None:
        raise AssertionError('Document ID must not be None')

    return document

def _process_geolinks_prepublinks(self, geolink_prepublink_el):
"""
Processes a 'geolinks' or 'prepublinks' element.
Args:
geolink_prepublink_el (lxml.etree._Element): element 'geolinks' or 'prepublinks'
Return:
list[geolink_formatter.entity.Document]: list of documents
"""
language_link = geolink_prepublink_el.get('language')

documents = list()

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

instead of list() Warning

instead of list()

Check warning

Code scanning / Prospector (reported by Codacy)

Consider using [] instead of list() (use-list-literal) Warning

Consider using [] instead of list() (use-list-literal)
for document_el in geolink_prepublink_el.iter('document'):
documents.append(self._process_single_document(document_el, language_link))
return documents

def _filter_duplicated_documents(self, documents):

Check warning

Code scanning / Pylint (reported by Codacy)

Method could be a function Warning

Method could be a function
"""
Filters duplicated documents.
Args:
documents (list[geolink_formatter.entity.Document]): list of documents
Returns:
list[geolink_formatter.entity.Document]: filtered list of documents
"""
documents_filtered = list()

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

instead of list() Warning

instead of list()

Check warning

Code scanning / Prospector (reported by Codacy)

Consider using [] instead of list() (use-list-literal) Warning

Consider using [] instead of list() (use-list-literal)
for document in documents:
if (
[document.id, document.language_link] not in

Check warning

Code scanning / Pylint (reported by Codacy)

Wrong hanging indentation before block (add 4 spaces). Warning

Wrong hanging indentation before block (add 4 spaces).
[[doc.id, doc.language_link] for doc in documents_filtered]

Check warning

Code scanning / Pylint (reported by Codacy)

Wrong hanging indentation before block (add 4 spaces). Warning

Wrong hanging indentation before block (add 4 spaces).
):
documents_filtered.append(document)
return documents_filtered

def from_string(self, xml):
"""Parses XML into internal structure.
Expand All @@ -111,70 +231,17 @@ def from_string(self, xml):
root = self._parse_xml(xml)
documents = list()

for document_el in root.iter('document'):
doc_id = document_el.attrib.get('id')
doctype = document_el.attrib.get('doctype')

# Mangle doc_id for notices. While IDs are unique between decrees
# and edicts, this is not the case when adding notices to the mix.
if doctype == 'notice':
doc_id += doctype

if doc_id and doc_id not in [doc.id for doc in documents]:
files = list()
for file_el in document_el.iter('file'):
href = file_el.attrib.get('href')
if self.host_url and not href.startswith(u'http://') and not href.startswith(u'https://'):
href = u'{host}{href}'.format(host=self.host_url, href=href)
files.append(File(
title=file_el.attrib.get('title'),
description=file_el.attrib.get('description'),
href=href,
category=file_el.attrib.get('category')
))
enactment_date = document_el.attrib.get('enactment_date')
if enactment_date:
enactment_date = datetime.datetime.strptime(enactment_date, self._date_format).date()
decree_date = document_el.attrib.get('decree_date')
if decree_date:
decree_date = datetime.datetime.strptime(decree_date, self._date_format).date()
abrogation_date = document_el.attrib.get('abrogation_date')
if abrogation_date:
abrogation_date = datetime.datetime.strptime(abrogation_date, self._date_format).date()
status_start_date = document_el.attrib.get('status_start_date')
if status_start_date:
status_start_date = datetime.datetime.strptime(status_start_date, self._date_format)\
.date()
status_end_date = document_el.attrib.get('status_end_date')
if status_end_date:
status_end_date = datetime.datetime.strptime(status_end_date, self._date_format)\
.date()

documents.append(Document(
files=files,
id=doc_id,
category=document_el.attrib.get('category'),
doctype=document_el.attrib.get('doctype'),
federal_level=document_el.attrib.get('federal_level'),
authority=document_el.attrib.get('authority'),
authority_url=document_el.attrib.get('authority_url'),
title=document_el.attrib.get('title'),
number=document_el.attrib.get('number'),
abbreviation=document_el.attrib.get('abbreviation'),
instance=document_el.attrib.get('instance'),
type=document_el.attrib.get('type'),
subtype=document_el.attrib.get('subtype'),
decree_date=decree_date,
enactment_date=enactment_date,
abrogation_date=abrogation_date,
cycle=document_el.attrib.get('cycle'),
municipality=document_el.attrib.get('municipality'),
index=document_el.attrib.get('index'),
status=document_el.attrib.get('status'),
status_start_date=status_start_date,
status_end_date=status_end_date
))
# evaluate root element's tag
if root.tag == 'multilang_geolinks':
for el in root.iter('geolinks', 'prepublinks'):

Check warning

Code scanning / Pylint (reported by Codacy)

Variable name "el" doesn't conform to snake_case naming style Warning

Variable name "el" doesn't conform to snake_case naming style
documents.extend(self._process_geolinks_prepublinks(el))
elif root.tag in ['geolinks', 'prepublinks']:
documents.extend(self._process_geolinks_prepublinks(root))
else:
raise RuntimeError('Unexpected tag name: {}'.format(root.tag))

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Formatting a regular string which could be an f-string Warning

Formatting a regular string which could be an f-string

Check warning

Code scanning / Prospector (reported by Codacy)

Formatting a regular string which could be a f-string (consider-using-f-string) Warning

Formatting a regular string which could be a f-string (consider-using-f-string)

# filter documents (remove duplicates)
documents = self._filter_duplicated_documents(documents)
return documents

def from_url(self, url, params=None, **kwargs):
Expand Down
Loading

0 comments on commit 7c99d46

Please sign in to comment.