Skip to content

Commit

Permalink
Create pygeoapi sitemap processor
Browse files Browse the repository at this point in the history
  • Loading branch information
webb-ben committed Jul 31, 2023
1 parent 1954d44 commit 59d11b9
Show file tree
Hide file tree
Showing 2 changed files with 200 additions and 12 deletions.
21 changes: 9 additions & 12 deletions pygeoapi_plugins/formatter/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
import xml.etree.ElementTree as ET

from pygeoapi.formatter.base import (
BaseFormatter, FormatterGenericError, FormatterSerializationError)
BaseFormatter, FormatterSerializationError)

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -62,13 +62,7 @@ def __init__(self, formatter_def: dict):
"""

geom = False
try:
self.uri_field = formatter_def['uri_field']
except KeyError:
msg = 'URI field required for XML output'
LOGGER.error(msg)
raise FormatterGenericError(msg)

self.uri_field = formatter_def.get('uri_field')
super().__init__({'name': 'xml', 'geom': geom})
self.mimetype = 'application/xml; charset=utf-8'

Expand All @@ -83,12 +77,11 @@ def write(self, options: dict = {}, data: dict = None) -> str:
"""

try:
fields = list(data['features'][0]['properties'].keys())
feature = list(data['features'][0])
except IndexError:
LOGGER.error('no features')
return str()

LOGGER.debug(f'XML fields: {fields}')
lastmod = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
root = ET.fromstring(URLSET)
tree = ET.ElementTree(root)
Expand All @@ -103,13 +96,17 @@ def write(self, options: dict = {}, data: dict = None) -> str:
LOGGER.warning('Maximum size of sitemap reached')
break

loc = feature['properties'][self.uri_field]
try:
loc = feature['properties'][self.uri_field]
except KeyError:
loc = feature['@id']

_ = URLSET_FOREACH.format(loc, lastmod)
root.append(ET.fromstring(_))

except ValueError as err:
LOGGER.error(err)
raise FormatterSerializationError('Error writing CSV output')
raise FormatterSerializationError('Error writing XML output')

output = io.BytesIO()
tree.write(output, encoding='utf-8', xml_declaration=True)
Expand Down
191 changes: 191 additions & 0 deletions pygeoapi_plugins/process/sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# =================================================================
#
# Author: Benjamin Webb <[email protected]>
#
# Copyright (c) 2023 Center for Geospatial Solutions
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================

import io
import math
import os
import logging
import zipfile

from pygeoapi.plugin import load_plugin
from pygeoapi.process.base import BaseProcessor
from pygeoapi.linked_data import geojson2jsonld
from pygeoapi.openapi import get_oas
from pygeoapi.util import (yaml_load, get_provider_default, url_join,
filter_dict_by_key_value)

from pygeoapi_plugins.formatter.xml import XMLFormatter


LOGGER = logging.getLogger(__name__)

# Read the pygeoapi configuration once, at import time, from the file
# named by the PYGEOAPI_CONFIG environment variable.
with open(os.getenv('PYGEOAPI_CONFIG'), encoding='utf8') as fh:
    CONFIG = yaml_load(fh)

# Keep only the resources whose ``type`` is ``collection``; these are the
# resources the processor generates per-collection sitemaps for.
# TODO: Filter collections for those that support CQL
COLLECTIONS = filter_dict_by_key_value(
    CONFIG['resources'], 'type', 'collection'
)


# Process metadata for the sitemap generator.
# NOTE: PROCESS_DEF is the very dict stored under
# CONFIG['resources']['sitemap-generator'], so the update() below also
# mutates that configuration entry in place.
PROCESS_DEF = CONFIG['resources']['sitemap-generator']
PROCESS_DEF.update({
    'version': '0.1.0',
    'id': 'sitemap-generator',
    'title': 'Sitemap Generator',
    # Adjacent string literals are concatenated verbatim, so the
    # separating space has to be written explicitly.
    'description': ('A process that returns a sitemap of '
                    'all pygeoapi endpoints.'),
    'links': [{
        'type': 'text/html',
        'rel': 'about',
        'title': 'information',
        'href': 'https://developers.google.com/search/docs/crawling-indexing/sitemaps/overview',  # noqa
        'hreflang': 'en-US'
    }],
    'inputs': {
        'zip': {
            'title': {
                'en': 'ZIP response'
            },
            'description': {
                'en': 'Boolean whether to ZIP the response'
            },
            'keywords': {
                'en': ['sitemap', 'pygeoapi']
            },
            'schema': {
                'type': 'boolean',
                'default': True
            },
            'minOccurs': 0,
            'maxOccurs': 1,
            'metadata': None,  # TODO how to use?
        },
    },
    'outputs': {
        'sitemap': {
            'title': {
                'en': 'Sitemap'
            },
            'description': {
                'en': 'A sitemap of the pygeoapi instance'
            },
            'schema': {
                'type': 'object',
                'contentMediaType': 'application/json'
            }
        }
    },
    'example': {
        'inputs': {
            'zip': True
        }
    }
})


class SitemapProcessor(BaseProcessor):
    """
    Sitemap Processor

    Generates one sitemap document for the API paths (``core.xml``) and
    one or more sitemap documents per configured collection, optionally
    bundled into a ZIP archive.
    """

    def __init__(self, processor_def):
        """
        Initialize object

        :param processor_def: provider definition

        :returns: pygeoapi.process.sitemap.SitemapProcessor
        """
        LOGGER.debug('SitemapProcesser init')
        super().__init__(processor_def, PROCESS_DEF)
        self.config = CONFIG
        self.base_url = self.config['server']['url']
        # The formatter is created without a ``uri_field``.
        # NOTE(review): it is then expected to fall back to each
        # feature's ``@id`` -- confirm against XMLFormatter.write.
        self.xml = XMLFormatter({})

    def execute(self, data):
        """
        Execute Sitemap Process

        :param data: processor arguments; a truthy ``zip`` entry requests
                     a zipped response

        :returns: tuple of (mimetype, payload). When ``zip`` is truthy
                  the result is ``('application/zip', <archive bytes>)``;
                  otherwise ``('application/xml', <dict>)`` where the
                  dict maps sitemap file names to the documents returned
                  by the XML formatter
        """
        mimetype = 'application/xml'
        outputs = {}

        LOGGER.debug('Generating core.xml')
        # The API paths are wrapped as minimal features carrying only an
        # ``@id`` so they can go through the same formatter as the
        # collection items.
        oas = {'features': []}
        for path in get_oas(self.config)['paths']:
            path_uri = url_join(self.base_url, path)
            oas['features'].append({'@id': path_uri})
        outputs['core.xml'] = self.xml.write(data=oas)

        LOGGER.debug('Generating collections sitemap')
        for cname, c in COLLECTIONS.items():
            p = get_provider_default(c['providers'])
            provider = load_plugin('provider', p)
            # A ``hits`` query is used only to learn how many items the
            # collection holds.
            _ = provider.query(resulttype='hits')
            hits = _['numberMatched']
            # 50000 matches the default page size ``n`` of _generate, so
            # page ``i`` here lines up with its offset computation.
            for i in range(math.ceil(hits / 50000)):
                sitemap_name = f'{cname}__{i}.xml'
                LOGGER.debug(f'Generating {sitemap_name}')
                outputs[sitemap_name] = self._generate(i, cname, provider)

        LOGGER.debug('Returning response')
        if data.get('zip'):
            zip_output = io.BytesIO()
            with zipfile.ZipFile(zip_output, 'w') as zipf:
                for filename, content in outputs.items():
                    zipf.writestr(filename, content)
            # getvalue() returns the whole buffer regardless of the
            # stream position; read() would return b'' here because the
            # cursor sits at the end once the archive has been written.
            return 'application/zip', zip_output.getvalue()

        return mimetype, outputs

    def _generate(self, index, dataset, provider, n=50000):
        """
        Private Function: Generate sitemap

        :param index: zero-based page index; the query offset is
                      ``n * index``
        :param dataset: OGC API Provider name
        :param provider: loaded provider plugin used to query the items
        :param n: Number per index

        :returns: sitemap document produced by the XML formatter for
                  this page of items
        """

        content = provider.query(offset=(n*index), limit=n)
        content['links'] = []
        # ``self`` is passed as the first argument of geojson2jsonld.
        # NOTE(review): presumably it reads ``self.config`` and
        # ``self.get_collections_url()`` -- confirm against
        # pygeoapi.linked_data.
        content = geojson2jsonld(
            self, content, dataset, id_field=(provider.uri_field or 'id')
        )
        return self.xml.write(data=content)

    def __repr__(self):
        return f'<SitemapProcessor> {self.name}'

    def get_collections_url(self):
        """
        Return the URL of the collections endpoint

        :returns: ``<base_url>/collections``
        """
        return f'{self.base_url}/collections'

0 comments on commit 59d11b9

Please sign in to comment.